Beispiel #1
class ClassBasedOutput(Softmax):
    def __init__(self, n_clusters = None, classclusterpath= None, **kwargs):
        """
        Set up a class-based (cluster, then class-within-cluster) output.

        Parameters
        ----------
        n_clusters : int
            Number of clusters. Required, despite the `None` default.
        classclusterpath : str
            Path passed to `serial.load`. The loaded object is indexed with
            'wordwithclusters' to obtain one cluster id per class.
        kwargs : dict
            Forwarded unchanged to the parent constructor.

        Raises
        ------
        ValueError
            If `n_clusters` or `classclusterpath` is not supplied.
        """
        if n_clusters is None:
            raise ValueError("n_clusters must be specified")
        if classclusterpath is None:
            raise ValueError("classclusterpath must be specified")

        super(ClassBasedOutput, self).__init__(**kwargs)
        self.n_clusters = n_clusters

        # The parent's single bias is superseded by the two biases below. It
        # may not have been created, so only drop it when it exists.
        if hasattr(self, 'b'):
            del self.b
        self.b_class = sharedX(np.zeros((self.n_clusters, self.n_classes)), name = 'softmax_b_class')
        self.b_cluster = sharedX( np.zeros((self.n_clusters)), name = 'softmax_b_clusters')

        npz_clust = serial.load(classclusterpath)
        array_clusters = npz_clust['wordwithclusters']

        # Number of classes assigned to each cluster. `minlength` keeps an
        # entry (of 0) for trailing clusters that own no class at all.
        cluster_sizes = np.bincount(array_clusters.astype(int),
                                    minlength=n_clusters)
        self.clusters_scope = dict(zip(range(n_clusters), cluster_sizes))
        self.array_clusters = sharedX(array_clusters)
    def set_input_space(self, space):
        """
        Record the input space and allocate the cluster/class weights.

        Parameters
        ----------
        space : Space
            The space this layer receives its input in.

        Raises
        ------
        TypeError
            If `space` is not a `Space`.
        NotImplementedError
            If neither `irange` nor `istdev` is set (and `no_affine` is
            false).
        """
        self.input_space = space
        if not isinstance(space, Space):
            raise TypeError("Expected Space, got "+
                    str(space)+" of type "+str(type(space)))

        self.input_dim = space.get_total_dimension()
        self.needs_reformat = not isinstance(space, VectorSpace)
        if self.no_affine:
            desired_dim = self.n_classes
            assert self.input_dim == desired_dim
        else:
            desired_dim = self.input_dim
        self.desired_space = VectorSpace(desired_dim)

        if not self.needs_reformat:
            assert self.desired_space == self.input_space

        rng = self.mlp.rng

        if self.no_affine:
            # No affine transform means there is nothing to learn here.
            self._params = []
            return

        if self.irange is not None:
            assert self.istdev is None
            assert self.sparse_init is None
            W_cluster = rng.uniform(-self.irange,self.irange, (self.input_dim, self.n_clusters))
            W_class = rng.uniform(-self.irange,self.irange, (self.n_clusters, self.input_dim, self.n_classes))
        elif self.istdev is not None:
            assert self.sparse_init is None
            W_cluster = rng.randn(self.input_dim, self.n_clusters) * self.istdev
            W_class = rng.randn(self.n_clusters, self.input_dim, self.n_classes) * self.istdev
        else:
            raise NotImplementedError()

        # set the extra dummy weights to 0
        # NOTE(review): the slice below zeroes the FIRST `count` columns of
        # each cluster, which looks like the opposite of "extra dummy
        # weights" -- confirm whether `count:` was intended.
        for key in self.clusters_scope.keys():
            W_class[int(key), :, :self.clusters_scope[key]] = 0.

        self.W_class = sharedX(W_class,  'softmax_W_class' )
        self.W_cluster = sharedX(W_cluster,  'softmax_W_cluster' )

        self._params = [self.b_class, self.W_class, self.b_cluster, self.W_cluster]

    def get_layer_monitoring_channels(self, state_below=None,
                                    state=None, targets=NotImplementedError):
        """
        Build the monitoring channels for this layer.

        Channels cover row/column norms of the cluster and class weights.
        When a state (or `state_below`) is available, they also cover the
        range of the maximum class probability, and `nll`, `perplexity` and
        `entropy` when `targets` is not None.

        Parameters
        ----------
        state_below : optional
            Input to the layer; used to compute `state` when that is None.
        state : optional
            A `(probclass, probcluster)` pair as returned by `fprop`.
        targets : optional
            Targets, passed to `fprop` and `cost`.

        Returns
        -------
        OrderedDict
            Channel name -> symbolic expression. Empty when `no_affine`.
        """
        # NOTE(review): the `targets=NotImplementedError` default is kept for
        # interface compatibility; it looks like it was meant to be `None` --
        # confirm with callers.

        if self.no_affine:
            return OrderedDict()

        W_class = self.W_class
        W_cluster = self.W_cluster

        assert W_class.ndim == 3
        assert W_cluster.ndim == 2

        sq_W = T.sqr(W_cluster)
        sq_W_class = T.sqr(W_class)

        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))

        row_norms_class = T.sqrt(sq_W_class.sum(axis=1))
        col_norms_class = T.sqrt(sq_W_class.sum(axis=0))

        rval = OrderedDict([
                            ('row_norms_min'  , row_norms.min()),
                            ('row_norms_mean' , row_norms.mean()),
                            ('row_norms_max'  , row_norms.max()),
                            ('col_norms_min'  , col_norms.min()),
                            ('col_norms_mean' , col_norms.mean()),
                            ('col_norms_max'  , col_norms.max()),
                            ('class_row_norms_min'  , row_norms_class.min()),
                            ('class_row_norms_mean' , row_norms_class.mean()),
                            ('class_row_norms_max'  , row_norms_class.max()),
                            ('class_col_norms_min'  , col_norms_class.min()),
                            ('class_col_norms_mean' , col_norms_class.mean()),
                            ('class_col_norms_max'  , col_norms_class.max()),
                            ])

        if (state_below is not None) or (state is not None):
            if state is None:
                state=self.fprop (state_below,targets)
            probclass, probcluster = state
            mx = probclass.max(axis=1)
            rval.update(OrderedDict([
                                     ('max_max_class' , mx.max()),
                                     ('min_max_class' , mx.min())
                                     ]))
            if targets is not None:
                rval['nll'] = self.cost(Y=targets,Y_hat=(probclass,probcluster))
                # 10 ** (nll / ln 10) is exp(nll).
                rval['perplexity'] = 10 ** (rval['nll']/np.log(10).astype('float32'))
                rval['entropy'] = rval['nll']/np.log(2).astype('float32')
        return rval
    def cost(self, Y, Y_hat):
        """
        Y must be one-hot binary. Y_hat is a softmax estimate
        of Y. Returns negative log probability of Y under the Y_hat
        distribution.

        Parameters
        ----------
        Y : tensor
            One-hot targets, one row per example.
        Y_hat : tuple
            `(probclass, probcluster)`; each must be the direct output of a
            softmax op (optionally wrapped in a single `Print`).

        Returns
        -------
        Scalar: minus the batch mean of
        log p(class | cluster, input) + log p(cluster | input).
        """
        def pre_softmax(prob):
            # Recover the softmax argument so the log-probability can be
            # computed in a numerically stable way below.
            assert hasattr(prob, 'owner')
            owner = prob.owner
            assert owner is not None
            op = owner.op
            if isinstance(op, Print):
                assert len(owner.inputs) == 1
                prob, = owner.inputs
                owner = prob.owner
                op = owner.op
            assert isinstance(op, T.nnet.Softmax)
            z ,= owner.inputs
            assert z.ndim == 2
            return z

        y_probclass, y_probcluster = Y_hat
        # Cluster id of each example's target class.
        CLS = self.array_clusters[T.cast(T.argmax(Y,axis=1),'int32')]

        z_class = pre_softmax(y_probclass)
        z_cluster = pre_softmax(y_probcluster)

        # class: subtract the row max before exponentiating
        z_class = z_class - z_class.max(axis=1).dimshuffle(0, 'x')
        log_prob = z_class - T.log(T.exp(z_class).sum(axis=1).dimshuffle(0, 'x'))
        # we use sum and not mean because this is really one variable per row
        log_prob_of = (Y * log_prob).sum(axis=1)
        assert log_prob_of.ndim == 1

        # cluster
        z_cluster = z_cluster - z_cluster.max(axis=1).dimshuffle(0, 'x')
        log_prob_cls = z_cluster - T.log(T.exp(z_cluster).sum(axis=1).dimshuffle(0, 'x'))

        out = OneHotFormatter(self.n_clusters).theano_expr(CLS.astype('int32'))
        log_prob_of_cls = (out * log_prob_cls).sum(axis=1)
        assert log_prob_of_cls.ndim == 1

        # p(w|history) = p(c|s) * p(w|c,s)
        log_prob_of = log_prob_of + log_prob_of_cls
        rval = log_prob_of.mean()
        return - rval

    def fprop(self, state_below,targets):
        """
        Compute the class and cluster probabilities for a batch.

        Parameters
        ----------
        state_below : tensor
            Layer input; must be 2-D after any reformatting.
        targets : tensor
            Targets, used to pick each example's cluster.

        Returns
        -------
        (probclass, probcluster) : pair of softmax outputs.

        Raises
        ------
        ValueError
            If a debug value of `state_below` has the wrong batch size.
        NotImplementedError
            If `no_affine` is set.
        """
        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below, self.desired_space)
        for value in get_debug_values(state_below):
            if self.mlp.batch_size is not None and value.shape[0] != self.mlp.batch_size:
                # The message used to read `self.dbm.batch_size`; this layer
                # only has `self.mlp`.
                raise ValueError("state_below should have batch size "+str(self.mlp.batch_size)+" but has "+str(value.shape[0]))
        assert state_below.ndim == 2
        if not hasattr(self, 'no_affine'):
            self.no_affine = False
        if self.no_affine:
            raise NotImplementedError()

        assert self.W_class.ndim == 3
        assert self.W_cluster.ndim == 2

        #we get the cluster by doing hW_cluster + b_cluster
        probcluster = T.dot(state_below, self.W_cluster) + self.b_cluster
        probcluster = T.nnet.softmax(probcluster)

        #check this line again
        # NOTE(review): `cost` uses T.argmax(Y, axis=1) for the same lookup;
        # without `axis` this reduces over the whole batch -- confirm.
        batch_clusters = self.array_clusters[T.cast(T.argmax(targets).flatten(),'int32')]
        # NOTE(review): the GroupDot arguments were cut off in this copy of
        # the source. (input, weights, biases, groups) is assumed -- verify
        # against the GroupDot op in use.
        Z = T.nnet.GroupDot(self.n_clusters)(state_below,
                                            self.W_class,
                                            self.b_class,
                                            batch_clusters)
        probclass = T.nnet.softmax(Z)
        for value in get_debug_values(probclass):
             if self.mlp.batch_size is not None:
                assert value.shape[0] == self.mlp.batch_size
        return probclass, probcluster

    def get_weights_format(self):
        """Return the axis labels describing this layer's weights."""
        weights_format = ('v', 'h', 'h_c')
        return weights_format

    def get_biases(self):
        """Return the current (class biases, cluster biases) values."""
        class_biases = self.b_class.get_value()
        cluster_biases = self.b_cluster.get_value()
        return class_biases, cluster_biases

    def get_weights(self):
        """
        Return the current (cluster weights, class weights) values.

        Raises NotImplementedError unless the input space is a VectorSpace.
        """
        if isinstance(self.input_space, VectorSpace):
            cluster_weights = self.W_cluster.get_value()
            class_weights = self.W_class.get_value()
            return cluster_weights, class_weights
        raise NotImplementedError()
Beispiel #2
class MultiSoftmax(Layer):

    def __init__(self, n_groups, n_classes, layer_name, irange = None,
                 istdev = None, sparse_init = None, W_lr_scale = None,
                 b_lr_scale = None, max_row_norm = None,
                 no_affine = False, max_col_norm = None):
        """
        Store the hyper-parameters and create the output space and biases.

        Every constructor argument is saved as an attribute of the same
        name; the other methods read them back (e.g. `self.irange`,
        `self.no_affine`, `self.max_row_norm`).

        Parameters
        ----------
        n_groups : int
            Number of independent softmax groups.
        n_classes : int
            Number of classes per group.
        W_lr_scale : float or str, optional
            A string is converted with `float`.
        """
        if isinstance(W_lr_scale, str):
            W_lr_scale = float(W_lr_scale)

        # Copy all arguments onto the instance. `del self.self` below only
        # works after this, since it removes the stray `self` entry.
        self.__dict__.update(locals())
        del self.self

        assert isinstance(n_classes, py_integer_types)

        self.output_space = MatrixSpace(n_groups, n_classes)
        self.b = sharedX( np.zeros((n_groups, n_classes,)), name = 'softmax_b')

    def get_lr_scalers(self):
        """
        Return an OrderedDict mapping parameters to learning-rate scales.

        Only parameters whose scale is not None are included.
        """
        scalers = OrderedDict()

        w_scale = self.W_lr_scale
        if w_scale is not None:
            assert isinstance(w_scale, float)
            scalers[self.W] = w_scale

        # Backfill the attribute when it is absent so it can be read below.
        if not hasattr(self, 'b_lr_scale'):
            self.b_lr_scale = None

        b_scale = self.b_lr_scale
        if b_scale is not None:
            assert isinstance(b_scale, float)
            scalers[self.b] = b_scale

        return scalers

    def get_monitoring_channels(self):
        """Return an empty OrderedDict: this layer reports no channels."""
        channels = OrderedDict()
        return channels

    def get_monitoring_channels_from_state(self, state, target=None):
        """Return an empty OrderedDict; `state` and `target` are ignored."""
        channels = OrderedDict()
        return channels
    def set_input_space(self, space):
        """
        Record the input space and allocate the 3-D weight tensor `W` of
        shape (input_dim, n_groups, n_classes).

        Raises
        ------
        TypeError
            If `space` is not a `Space`.
        NotImplementedError
            If neither `irange` nor `istdev` is set.
        """
        self.input_space = space

        if not isinstance(space, Space):
            raise TypeError("Expected Space, got "+
                    str(space)+" of type "+str(type(space)))

        self.input_dim = space.get_total_dimension()
        self.needs_reformat = not isinstance(space, VectorSpace)

        if self.no_affine:
            desired_dim = self.n_classes
            assert self.input_dim == desired_dim
        else:
            desired_dim = self.input_dim
        self.desired_space = VectorSpace(desired_dim)

        if not self.needs_reformat:
            assert self.desired_space == self.input_space

        rng = self.mlp.rng

        if self.irange is not None:
            assert self.istdev is None
            assert self.sparse_init is None
            W = rng.uniform(-self.irange,self.irange, (self.input_dim,self.n_groups,self.n_classes))
        elif self.istdev is not None:
            assert self.sparse_init is None
            W = rng.randn(self.input_dim,self.n_groups,self.n_classes) * self.istdev
        else:
            # sparse_init is not supported for the 3-D weight tensor.
            raise NotImplementedError()

        self.W = sharedX(W,  'softmax_W' )

        self._params = [ self.b, self.W ]

    def get_weights_topo(self):
        """
        Return the weights formatted in the topology of a Conv2DSpace input,
        with axes ('b', 0, 1, 'c').

        Raises NotImplementedError for any other input space.
        """
        input_space = self.input_space
        if not isinstance(input_space, Conv2DSpace):
            raise NotImplementedError()
        transposed = self.W.get_value().T
        topo = self.desired_space.format_as(transposed, self.input_space)
        return Conv2DSpace.convert_numpy(topo, self.input_space.axes, ('b', 0, 1, 'c'))

    def get_weights(self):
        """
        Return the current value of `W`.

        Raises NotImplementedError unless the input space is a VectorSpace.
        """
        if isinstance(self.input_space, VectorSpace):
            return self.W.get_value()
        raise NotImplementedError()

    def set_weights(self, weights):
        """Overwrite the value of `W` with `weights` (mirror of get_weights)."""
        # NOTE(review): the body was missing in this copy of the source; this
        # is the counterpart of `get_weights` -- confirm.
        self.W.set_value(weights)

    def set_biases(self, biases):
        """Overwrite the value of `b` with `biases` (mirror of get_biases)."""
        # NOTE(review): the body was missing in this copy of the source; this
        # is the counterpart of `get_biases` -- confirm.
        self.b.set_value(biases)

    def get_biases(self):
        """Return the current value of the bias variable `b`."""
        biases = self.b.get_value()
        return biases

    def get_weights_format(self):
        """Return the axis labels describing this layer's weights."""
        weights_format = ('v', 'h')
        return weights_format

    def fprop(self, state_below):
        """
        Apply the affine map and a softmax per group.

        Parameters
        ----------
        state_below : tensor
            Layer input; must be 2-D after any reformatting.

        Returns
        -------
        The result of `batched_softmax` on `state_below . W + b`, where the
        contraction is over the input dimension of the 3-D `W`.

        Raises
        ------
        ValueError
            If a debug value of `state_below` has the wrong batch size.
        """
        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below, self.desired_space)

        for value in get_debug_values(state_below):
            if self.mlp.batch_size is not None and value.shape[0] != self.mlp.batch_size:
                # The message used to read `self.dbm.batch_size`; this layer
                # only has `self.mlp`.
                raise ValueError("state_below should have batch size "+str(self.mlp.batch_size)+" but has "+str(value.shape[0]))

        assert state_below.ndim == 2

        assert self.W.ndim == 3

        Z = T.tensordot(state_below, self.W, axes=[[1],[0]]) + self.b

        rval = batched_softmax(Z)

        for value in get_debug_values(rval):
            if self.mlp.batch_size is not None:
                assert value.shape[0] == self.mlp.batch_size

        return rval

    def cost(self, Y, Y_hat):
        """Return the scalar cost: `cost_matrix` reduced by `cost_from_cost_matrix`."""
        elementwise = self.cost_matrix(Y, Y_hat)
        return self.cost_from_cost_matrix(elementwise)

    def cost_from_cost_matrix(self, cost_matrix):
        """Sum `cost_matrix` over axis 2, then average what remains."""
        summed_over_last = cost_matrix.sum(axis=2)
        return summed_over_last.mean()

    def cost_matrix(self, Y, Y_hat):
        """Return the element-wise cross-entropy terms `-Y * log(Y_hat)`."""
        log_predictions = T.log(Y_hat)
        return -Y * log_predictions

    def get_weight_decay(self, coeff):
        """
        Return `coeff` times the sum of squared entries of `W`.

        A string `coeff` is converted with `float` first.
        """
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        squared_norm = T.sqr(self.W).sum()
        return coeff * squared_norm

    def get_l1_weight_decay(self, coeff):
        """
        Return `coeff` times the sum of absolute entries of `W`.

        A string `coeff` is converted with `float` first.
        """
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        l1_norm = abs(self.W).sum()
        return coeff * l1_norm

    def censor_updates(self, updates):
        """
        Clip the norms of the proposed update for `W`, in place.

        `W` is 3-D, (input_dim, n_groups, n_classes). With `max_row_norm`
        the norm of everything attached to one input unit is clipped; with
        `max_col_norm` the norm over the input axis is clipped for each
        (group, class) pair. The two options are mutually exclusive.

        Parameters
        ----------
        updates : dict
            Maps shared variables to their proposed new values.
        """
        W = self.W
        if self.max_row_norm is not None:
            if W in updates:
                updated_W = updates[W]
                # Reduce over both trailing axes so the result is one norm
                # per input unit. The earlier form reduced axis 1 only and
                # then applied a 2-D dimshuffle pattern, which cannot
                # broadcast against the 3-D tensor.
                row_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=[1, 2]))
                desired_norms = T.clip(row_norms, 0, self.max_row_norm)
                scale = desired_norms / (1e-7 + row_norms)
                updates[W] = updated_W * scale.dimshuffle(0, 'x', 'x')
        if self.max_col_norm is not None:
            assert self.max_row_norm is None
            if W in updates:
                updated_W = updates[W]
                # Shape (n_groups, n_classes): broadcasts over the leading
                # input axis without reshaping.
                col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0))
                desired_norms = T.clip(col_norms, 0, self.max_col_norm)
                updates[W] = updated_W * (desired_norms / (1e-7 + col_norms))
Beispiel #3
class BoltzmannIsingVisible(VisibleLayer):
    """An IsingVisible whose parameters are defined in Boltzmann machine space."""

    def __init__(self,
            nvis,
            bias_from_marginals = None):
        """
            nvis: the dimension of the space
            bias_from_marginals: a dataset, whose marginals are used to
                            initialize the visible biases
        """
        # NOTE(review): the signature and several statements were cut off in
        # this copy of the source. `nvis` is restored from the docstring and
        # from its uses below; if the original took further arguments they
        # must be added back.
        self.__dict__.update(locals())
        del self.self
        # Don't serialize the dataset
        del self.bias_from_marginals

        self.space = VectorSpace(nvis)
        self.input_space = self.space

        if bias_from_marginals is None:
            init_bias = np.zeros((nvis,))
        else:
            # data is in [-1, 1], but want biases for a sigmoid
            init_bias = init_sigmoid_bias_from_array(bias_from_marginals.X / 2. + 0.5)
        self.boltzmann_bias = sharedX(init_bias, 'visible_bias')

    def get_biases(self):
        """
        Not supported for this layer.

        Raises
        ------
        NotImplementedError
            Always. An exception is raised, rather than `assert False`, so
            the call still fails when assertions are disabled (`python -O`).
        """
        # not really sure what this should do for this layer
        raise NotImplementedError(
            "get_biases is not defined for BoltzmannIsingVisible")

    def set_biases(self, biases, recenter=False):
        """
        Not supported for this layer.

        Raises
        ------
        NotImplementedError
            Always. An exception is raised, rather than `assert False`, so
            the call still fails when assertions are disabled (`python -O`).
        """
        # not really sure what this should do for this layer
        raise NotImplementedError(
            "set_biases is not defined for BoltzmannIsingVisible")

    def ising_bias(self, for_sampling=False):
        """
        Return the bias in Ising parameterization.

        When `for_sampling` is true and the layer above defines a
        `sampling_b_stdev`, `self.noisy_sampling_b` is returned instead.
        Otherwise the result is 0.5 * boltzmann_bias plus 0.25 times the
        sum of the layer above's `W` over axis 1.
        """
        use_noisy = for_sampling and self.layer_above.sampling_b_stdev is not None
        if use_noisy:
            return self.noisy_sampling_b
        coupling = self.layer_above.W.sum(axis=1)
        return 0.5 * self.boltzmann_bias + 0.25 * coupling

    def ising_bias_numpy(self):
        """Numeric counterpart of `ising_bias()`, built from current values."""
        bias_value = self.boltzmann_bias.get_value()
        coupling = self.layer_above.W.get_value().sum(axis=1)
        return 0.5 * bias_value + 0.25 * coupling

    def upward_state(self, total_state):
        """Return `total_state` unchanged."""
        state = total_state
        return state

    def get_params(self):
        """Return the layer's parameters: a list holding `boltzmann_bias`."""
        return [self.boltzmann_bias]

    def sample(self, state_below = None, state_above = None,
            layer_above = None,
            theano_rng = None):
        """
        Draw a sample of this layer given the state of the layer above.

        `state_below` must be None. The returned values are in {-1, 1}.
        """
        assert state_below is None

        top_down = layer_above.downward_message(state_above, for_sampling=True)
        sampling_bias = self.ising_bias(for_sampling=True)
        on_prob = T.nnet.sigmoid(2. * (top_down + sampling_bias))

        draws = theano_rng.binomial(size = on_prob.shape, p = on_prob,
                                    dtype = on_prob.dtype, n = 1 )

        # Map the {0, 1} draws onto {-1, 1}.
        return draws * 2. - 1.

    def make_state(self, num_examples, numpy_rng):
        """
        Return a shared variable holding `num_examples` samples in {-1, 1},
        drawn with `numpy_rng` from the bias-only distribution.
        """
        uniform_draws = numpy_rng.uniform(0.,1., (num_examples, self.nvis))
        p_on = sigmoid_numpy(2. * self.ising_bias_numpy())
        # A unit is "on" (+1) when its uniform draw is below p_on.
        spins = 2. * (uniform_draws < p_on) - 1.
        return sharedX(spins, name = 'v_sample_shared')

    def make_symbolic_state(self, num_examples, theano_rng):
        """
        Return a symbolic sample of shape (num_examples, nvis) with values
        in {-1, 1}, drawn from the bias-only distribution.
        """
        p_on = T.nnet.sigmoid(2. * self.ising_bias())
        draws = theano_rng.binomial(size=(num_examples, self.nvis), p=p_on)
        return 2. * (draws) - 1.

    def expected_energy_term(self, state, average, state_below = None, average_below = None):
        """
        Return this layer's energy term for each example in `state`.

        `state_below` and `average_below` must be None; `average` must be a
        bool. The result is a 1-D tensor.
        """
        assert state_below is None
        assert average_below is None
        assert average in [True, False]

        # Energy function is linear so it doesn't matter if we're averaging or not
        # NOTE(review): the left operand of this product was cut off in this
        # copy of the source; -dot(state, bias) is assumed, matching the
        # "-b^T h" term documented on IsingVisible -- confirm.
        rval = -T.dot(state, self.ising_bias())

        assert rval.ndim == 1

        return rval

    def get_monitoring_channels(self):
        """
        Return min/max channels for the Ising bias, and for
        `noisy_sampling_b` when that attribute exists.
        """
        channels = OrderedDict()

        bias = self.ising_bias()
        channels['ising_b_min'] = bias.min()
        channels['ising_b_max'] = bias.max()

        if hasattr(self, 'noisy_sampling_b'):
            noisy = self.noisy_sampling_b
            channels['noisy_sampling_b_min'] = noisy.min()
            channels['noisy_sampling_b_max'] = self.noisy_sampling_b.max()

        return channels
Beispiel #4
class IsingVisible(VisibleLayer):
    """
    A DBM visible layer consisting of random variables living
    in a VectorSpace, with values in {-1, 1}.

    Implements the energy function term
    -b^T h
    """

    def __init__(self,
            nvis,
            bias_from_marginals = None):
        """
            nvis: the dimension of the space
            bias_from_marginals: a dataset, whose marginals are used to
                            initialize the visible biases
        """
        # NOTE(review): the signature and several statements were cut off in
        # this copy of the source. `nvis` is restored from the docstring and
        # from its uses below; if the original took further arguments they
        # must be added back.
        self.__dict__.update(locals())
        del self.self
        # Don't serialize the dataset
        del self.bias_from_marginals

        self.space = VectorSpace(nvis)
        self.input_space = self.space

        if bias_from_marginals is None:
            init_bias = np.zeros((nvis,))
        else:
            init_bias = init_tanh_bias_from_marginals(bias_from_marginals)

        self.bias = sharedX(init_bias, 'visible_bias')

    def get_biases(self):
        """Return the current value of the visible bias variable."""
        biases = self.bias.get_value()
        return biases

    def set_biases(self, biases, recenter=False):
        """
        Overwrite the visible biases with `biases`.

        Raises
        ------
        NotImplementedError
            If `recenter` is true (see the note below).
        """
        # NOTE(review): the body was missing in this copy of the source.
        # Setting the bias mirrors `get_biases`; the recentering logic could
        # not be recovered, so it fails loudly rather than guessing.
        self.bias.set_value(biases)
        if recenter:
            raise NotImplementedError(
                "recenter is not available for IsingVisible.set_biases")

    def upward_state(self, total_state):
        """Return `total_state` unchanged."""
        state = total_state
        return state

    def get_params(self):
        """Return the layer's parameters: a list holding the bias."""
        params = [self.bias]
        return params

    def sample(self, state_below = None, state_above = None,
            layer_above = None,
            theano_rng = None):
        """
        Draw a sample of this layer given the state of the layer above.

        `state_below` must be None. The returned values are in {-1, 1}.
        """
        assert state_below is None

        top_down = layer_above.downward_message(state_above)
        pre_activation = top_down + self.bias
        on_prob = T.nnet.sigmoid(2. * pre_activation)

        draws = theano_rng.binomial(size = on_prob.shape, p = on_prob,
                                    dtype = on_prob.dtype, n = 1 )

        # Map the {0, 1} draws onto {-1, 1}.
        return draws * 2. - 1.

    def make_state(self, num_examples, numpy_rng):
        """
        Return a shared variable holding `num_examples` samples in {-1, 1},
        drawn with `numpy_rng` from the bias-only distribution.
        """
        uniform_draws = numpy_rng.uniform(0.,1., (num_examples, self.nvis))
        p_on = sigmoid_numpy(2. * self.bias.get_value())
        # A unit is "on" (+1) when its uniform draw is below p_on.
        spins = 2. * (uniform_draws < p_on) - 1.
        return sharedX(spins, name = 'v_sample_shared')

    def make_symbolic_state(self, num_examples, theano_rng):
        """
        Return a symbolic sample of shape (num_examples, nvis) with values
        in {-1, 1}, drawn from the bias-only distribution.
        """
        # The bias lives in `self.bias` (see __init__, get_params and
        # make_state); `self.b` is never assigned on this class.
        mean = T.nnet.sigmoid(2. * self.bias)
        rval = theano_rng.binomial(size=(num_examples, self.nvis), p=mean)
        rval = 2. * (rval) - 1.

        return rval

    def expected_energy_term(self, state, average, state_below = None, average_below = None):
        """
        Return this layer's energy term, -b^T h, for each example in `state`.

        `state_below` and `average_below` must be None; `average` must be a
        bool. The result is a 1-D tensor.
        """
        assert state_below is None
        assert average_below is None
        assert average in [True, False]

        # Energy function is linear so it doesn't matter if we're averaging or not
        # NOTE(review): the left operand was cut off in this copy of the
        # source; restored from the "-b^T h" term in the class description.
        rval = -T.dot(state, self.bias)

        assert rval.ndim == 1

        return rval
Beispiel #5
class ToyRNNPhone(Model):
    def __init__(self, nvis, nhid, hidden_transition_model, irange,
                 non_linearity, use_ground_truth):
        """
        Build the recurrent model's spaces and parameters.

        NOTE(review): the parameter list was cut off in this copy of the
        source. The names below are exactly those the body reads; their
        order and any default values are unknown -- confirm against callers
        (keyword calls are unaffected by the order).

        Parameters
        ----------
        nvis : int
            Dimension of the input space.
        nhid : int
            Dimension of the hidden space.
        hidden_transition_model
            Object providing `fprop` and `input_space`, applied to the
            previous hidden state.
        irange : float
            W and U are drawn uniformly from [-irange, irange].
        non_linearity : str
            Either 'sigmoid' or 'tanh'.
        use_ground_truth : bool
            Selects which branch `fprop` takes.
        """
        allowed_non_linearities = {'sigmoid': T.nnet.sigmoid, 'tanh': T.tanh}
        self.nvis = nvis
        self.nhid = nhid
        self.hidden_transition_model = hidden_transition_model
        self.use_ground_truth = use_ground_truth
        self.alpha = sharedX(1)
        self.alpha_decrease_rate = 0.999

        assert non_linearity in allowed_non_linearities
        self.non_linearity = allowed_non_linearities[non_linearity]

        # Space initialization
        self.input_space = VectorSpace(dim=self.nvis)
        self.hidden_space = VectorSpace(dim=self.nhid)
        self.output_space = VectorSpace(dim=1)
        self.input_source = 'features'
        self.target_source = 'targets'

        # Features-to-hidden matrix
        W_value = numpy.random.uniform(low=-irange, high=irange,
                                       size=(self.nvis, self.nhid))
        self.W = sharedX(W_value, name='W')
        # Hidden biases
        b_value = numpy.zeros(self.nhid)
        self.b = sharedX(b_value, name='b')
        # Hidden-to-out matrix
        U_value = numpy.random.uniform(low=-irange, high=irange,
                                       size=(self.nhid, 1))
        self.U = sharedX(U_value, name='U')
        # Output bias
        c_value = numpy.zeros(1)
        self.c = sharedX(c_value, name='c')

    def fprop_step(self, features, h_tm1, out):
        """
        One recurrence step: return the new hidden state and output.

        `out` (the previous output) is accepted but not used.
        """
        h_tm1 = self.hidden_space.format_as(
            h_tm1.dimshuffle('x', 0), self.hidden_transition_model.input_space)
        # NOTE(review): the left operands of both dot products were cut off
        # in this copy of the source; `features` and `h` are assumed from
        # the shapes of W (nvis, nhid) and U (nhid, 1) -- confirm.
        h = T.nnet.sigmoid(
            T.dot(features, self.W) +
            self.hidden_transition_model.fprop(h_tm1).flatten() + self.b)
        out = T.dot(h, self.U) + self.c
        return h, out

    def fprop_step_prime(self, truth, features, h_tm1, out):
        """
        One recurrence step that feeds the model's own output back in.

        The last entry of `features` is replaced by a blend of itself and
        `truth[-1]`, weighted by `alpha`. After the step the window is
        shifted by one and the new output appended.

        Returns (features, h, out).
        """
        features = T.set_subtensor(features[-1],
                                   (1 - self.alpha) * features[-1] +
                                   self.alpha * truth[-1])
        h_tm1 = self.hidden_space.format_as(
            h_tm1.dimshuffle('x', 0), self.hidden_transition_model.input_space)
        # NOTE(review): the left operands of both dot products were cut off
        # in this copy of the source; `features` and `h` are assumed from
        # the shapes of W (nvis, nhid) and U (nhid, 1) -- confirm.
        h = T.nnet.sigmoid(
            T.dot(features, self.W) +
            self.hidden_transition_model.fprop(h_tm1).flatten() + self.b)
        out = T.dot(h, self.U) + self.c
        features = T.concatenate([features[1:], out])
        return features, h, out

    def fprop(self, data):
        """
        Run the recurrence over `data` and return the sequence of outputs.

        With `use_ground_truth` every step is `fprop_step`; otherwise every
        step is `fprop_step_prime`, which also carries the feature window.

        NOTE(review): both `theano.scan` calls were cut off in this copy of
        the source. `sequences=[features]` and the leading `init_in` entry
        of the second `outputs_info` are inferred from the arity of the two
        step functions -- confirm.
        """
        if self.use_ground_truth:
            features = data

            init_h = T.alloc(numpy.cast[theano.config.floatX](0), self.nhid)
            init_out = T.alloc(numpy.cast[theano.config.floatX](0), 1)
            init_out = T.unbroadcast(init_out, 0)

            fn = lambda f, h, o: self.fprop_step(f, h, o)

            ((h, out), updates) = theano.scan(
                fn=fn,
                sequences=[features],
                outputs_info=[dict(initial=init_h, taps=[-1]), init_out])
            return out
        else:
            features = data

            init_in = features[0]
            init_h = T.alloc(numpy.cast[theano.config.floatX](0), self.nhid)
            init_out = T.alloc(numpy.cast[theano.config.floatX](0), 1)
            init_out = T.unbroadcast(init_out, 0)

            fn = lambda t, f, h, o: self.fprop_step_prime(t, f, h, o)

            ((f, h, out), updates) = theano.scan(fn=fn,
                                                 sequences=[features],
                                                 outputs_info=[init_in,
                                                               dict(initial=init_h,
                                                                    taps=[-1]), init_out])
            return out

    def predict_next(self, features, h_tm1):
        """
        Compute the next hidden state and output from `features` and the
        previous hidden state. Returns (h, out).
        """
        h_tm1 = self.hidden_space.format_as(
            h_tm1.dimshuffle('x', 0), self.hidden_transition_model.input_space)
        # NOTE(review): the left operands of both dot products were cut off
        # in this copy of the source; `features` and `h` are assumed from
        # the shapes of W (nvis, nhid) and U (nhid, 1) -- confirm.
        h = T.nnet.sigmoid(
            T.dot(features, self.W) +
            self.hidden_transition_model.fprop(h_tm1).flatten() + self.b)
        out = T.dot(h, self.U) + self.c
        return h, out

    def get_params(self):
        """Return this model's parameters followed by the transition model's."""
        # NOTE(review): the continuation line was cut off in this copy of
        # the source; the transition model's parameters are assumed to be
        # what was appended -- confirm.
        return [self.W, self.b, self.U, self.c] + \
            self.hidden_transition_model.get_params()

    def get_input_source(self):
        """Return the name of the data source used as input."""
        source = self.input_source
        return source

    def get_target_source(self):
        """Return the name of the data source used as target."""
        source = self.target_source
        return source

    def censor_updates(self, updates):
        """Add to `updates` a decay of `alpha` by `alpha_decrease_rate`."""
        decayed_alpha = self.alpha_decrease_rate * self.alpha
        updates[self.alpha] = decayed_alpha

    def get_monitoring_channels(self, data):
        """Return an OrderedDict with one channel, 'alpha'; `data` is unused."""
        channels = OrderedDict()
        channels['alpha'] = self.alpha
        return channels
Beispiel #6
class HingeLoss(Layer):

    def __init__(self, n_classes, layer_name, irange = None,
                 istdev = None,
                 sparse_init = None,
                 no_affine = False):
        """
        Store the hyper-parameters and create the output space and, unless
        `no_affine` is set, the bias vector.

        Every constructor argument is saved as an attribute of the same
        name; other methods read them back (`self.irange`, `self.istdev`,
        `self.sparse_init`, `self.no_affine`, `self.n_classes`).

        NOTE(review): the statement copying the arguments onto the instance
        was missing in this copy of the source, and `no_affine` was read
        without ever being set. It is added as a trailing keyword defaulting
        to False; if the original took more arguments, add them back.
        """
        super(HingeLoss, self).__init__()

        self.__dict__.update(locals())
        del self.self

        self.output_space = VectorSpace(n_classes)

        if not self.no_affine:
            self.b = sharedX(np.zeros((n_classes,)), name = 'hingeloss_b')

    def get_monitoring_channels(self):
        """
        Return min/mean/max channels for the row and column norms of `W`.

        The result is empty when `no_affine` is set.
        """
        if self.no_affine:
            return OrderedDict()

        W = self.W

        assert W.ndim == 2

        sq_W = T.sqr(W)

        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))

        return OrderedDict([
                            ('row_norms_min'  , row_norms.min()),
                            ('row_norms_mean' , row_norms.mean()),
                            ('row_norms_max'  , row_norms.max()),
                            ('col_norms_min'  , col_norms.min()),
                            ('col_norms_mean' , col_norms.mean()),
                            ('col_norms_max'  , col_norms.max()),
                            ])

    def get_layer_monitoring_channels(self, state_below=None,
                                    state=None, targets=None):
        """
        Return the monitoring channels that depend on the layer's state.

        Parameters
        ----------
        state_below : optional
            Layer input; used to compute `state` when that is None.
        state : optional
            Layer output.
        targets : optional
            One-hot targets; when given, 'misclass' and 'nll' are added.

        Returns
        -------
        OrderedDict
            Empty when neither `state_below` nor `state` is given.
        """
        # The weight-norm channels are not reported here; see
        # get_monitoring_channels for those.
        rval = OrderedDict()
        if (state_below is not None) or (state is not None):
            if state is None:
                state = self.fprop(state_below)

            mx = state.max(axis=1)

            rval.update(OrderedDict([
                                ('mean_max_class', mx.mean()),
                                ('max_max_class', mx.max()),
                                ('min_max_class', mx.min())]))

            if targets is not None:
                y_hat = self.target_convert(T.argmax(state, axis=1))
                #Assume target is in [0,1] as binary one-hot
                y = self.target_convert(T.argmax(targets, axis=1))
                misclass = T.neq(y, y_hat).mean()
                misclass = T.cast(misclass, config.floatX)
                rval['misclass'] = misclass
                rval['nll'] = self.cost(Y_hat=state, Y=targets)

        return rval

    def get_monitoring_channels_from_state(self, state, target=None):
        """
        Deprecated: use get_layer_monitoring_channels instead.

        Emits a warning, then returns channels for the range of the
        per-example maximum of `state`, plus 'misclass' and 'nll' when
        `target` is given.
        """
        # NOTE(review): the end of this call was cut off in this copy of the
        # source; `stacklevel=2` (attribute the warning to the caller) is
        # assumed.
        warnings.warn("Layer.get_monitoring_channels_from_state is " + \
                    "deprecated. Use get_layer_monitoring_channels " + \
                    "instead. Layer.get_monitoring_channels_from_state " + \
                    "will be removed on or after september 24th 2014",
                    stacklevel=2)

        mx = state.max(axis=1)

        rval =  OrderedDict([
                ('mean_max_class' , mx.mean()),
                ('max_max_class' , mx.max()),
                ('min_max_class' , mx.min())
                ])

        if target is not None:
            y_hat = self.target_convert(T.argmax(state, axis=1))
            #Assume target is in [0,1] as binary one-hot
            y = self.target_convert(T.argmax(target, axis=1))
            misclass = T.neq(y, y_hat).mean()
            misclass = T.cast(misclass, config.floatX)
            rval['misclass'] = misclass
            rval['nll'] = self.cost(Y_hat=state, Y=target)

        return rval

    def set_input_space(self, space):
        """
        Record the input space and, unless `no_affine` is set, allocate `W`.

        `W` has shape (input_dim, n_classes) and is initialized from exactly
        one of `irange` (uniform), `istdev` (Gaussian) or `sparse_init`
        (that many non-zero Gaussian entries per column).

        Raises
        ------
        TypeError
            If `space` is not a `Space`.
        """
        self.input_space = space

        if not isinstance(space, Space):
            raise TypeError("Expected Space, got "+
                    str(space)+" of type "+str(type(space)))

        self.input_dim = space.get_total_dimension()
        self.needs_reformat = not isinstance(space, VectorSpace)

        desired_dim = self.input_dim
        self.desired_space = VectorSpace(desired_dim)

        if not self.needs_reformat:
            assert self.desired_space == self.input_space

        rng = self.mlp.rng

        if self.no_affine:
            # No affine transform means there is nothing to learn here.
            self._params = []
            return

        if self.irange is not None:
            assert self.istdev is None
            assert self.sparse_init is None
            W = rng.uniform(-self.irange,self.irange, (self.input_dim,self.n_classes))
        elif self.istdev is not None:
            assert self.sparse_init is None
            W = rng.randn(self.input_dim, self.n_classes) * self.istdev
        else:
            assert self.sparse_init is not None
            W = np.zeros((self.input_dim, self.n_classes))
            for i in xrange(self.n_classes):
                for j in xrange(self.sparse_init):
                    # Redraw until an entry of this column that is still
                    # zero is found.
                    idx = rng.randint(0, self.input_dim)
                    while W[idx, i] != 0.:
                        idx = rng.randint(0, self.input_dim)
                    W[idx, i] = rng.randn()

        self.W = sharedX(W,  'hingeloss_W' )

        self._params = [ self.b, self.W ]

    def get_weights_topo(self):
        """
        Return the weights formatted in the topology of a Conv2DSpace input,
        with axes ('b', 0, 1, 'c').

        Raises NotImplementedError for any other input space.
        """
        if not isinstance(self.input_space, Conv2DSpace):
            raise NotImplementedError()
        desired = self.W.get_value().T
        ipt = self.desired_space.np_format_as(desired, self.input_space)
        # The source-axes argument was cut off in this copy of the source;
        # restored to match MultiSoftmax.get_weights_topo.
        rval = Conv2DSpace.convert_numpy(ipt,
                                         self.input_space.axes,
                                         ('b', 0, 1, 'c'))
        return rval

    def get_weights(self):
        """
        Return the current value of `W`.

        Raises NotImplementedError unless the input space is a VectorSpace.
        """
        if isinstance(self.input_space, VectorSpace):
            return self.W.get_value()
        raise NotImplementedError()

    def set_weights(self, weights):
        """Overwrite the value of `W` with `weights` (mirror of get_weights)."""
        # NOTE(review): the body was missing in this copy of the source; this
        # is the counterpart of `get_weights` -- confirm.
        self.W.set_value(weights)

    def set_biases(self, biases):
        """Overwrite the value of `b` with `biases` (mirror of get_biases)."""
        # NOTE(review): the body was missing in this copy of the source; this
        # is the counterpart of `get_biases` -- confirm.
        self.b.set_value(biases)

    def get_biases(self):
        """Return the current value of the bias variable `b`."""
        biases = self.b.get_value()
        return biases

    def get_weights_format(self):
        """Return the axis labels describing this layer's weights."""
        weights_format = ('v', 'h')
        return weights_format

    def fprop(self, state_below):
        """
        Return the layer output: `state_below` itself when `no_affine` is
        set, otherwise `state_below . W + b`.

        Raises
        ------
        ValueError
            If a debug value of `state_below` has the wrong batch size.
        """
        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below, self.desired_space)

        for value in get_debug_values(state_below):
            if self.mlp.batch_size is not None and value.shape[0] != self.mlp.batch_size:
                # The message used to read `self.dbm.batch_size`; this layer
                # only has `self.mlp`.
                raise ValueError("state_below should have batch size "+str(self.mlp.batch_size)+" but has "+str(value.shape[0]))

        assert state_below.ndim == 2

        if not hasattr(self, 'no_affine'):
            self.no_affine = False

        if self.no_affine:
            rval = state_below
        else:
            assert self.W.ndim == 2
            b = self.b
            W = self.W

            rval = T.dot(state_below, W) + b

        for value in get_debug_values(rval):
            if self.mlp.batch_size is not None:
                assert value.shape[0] == self.mlp.batch_size

        return rval

    def target_convert(self, Y):
        """
        converts target [0,1] to [-1, 1]

        Computed as 2 * Y - 1, so 0 maps to -1 and 1 maps to 1.
        """
        Y_t = 2. * Y - 1.
        return Y_t

    def hinge_cost(self, Y, Y_hat):
        """
        Return the squared hinge loss of each row, summed over axis 1:
        sum(max(1 - Y * Y_hat, 0) ** 2).
        """
        margin_violation = T.maximum(1 - Y * Y_hat, 0)
        squared = margin_violation ** 2.
        return squared.sum(axis=1)

    def cost(self, Y, Y_hat):
        """
        Y must be one-hot binary. Y_hat is a hinge loss estimate
        of Y.

        Returns the batch mean of the per-example squared hinge loss. `Y` is
        first mapped from [0, 1] to [-1, 1]. A `Print` op wrapping `Y_hat`
        is stripped first.
        """
        assert hasattr(Y_hat, 'owner')
        owner = Y_hat.owner
        assert owner is not None
        if isinstance(owner.op, Print):
            assert len(owner.inputs) == 1
            Y_hat, = owner.inputs
        assert Y_hat.ndim == 2
        Y_t = self.target_convert(Y)
        prob = self.hinge_cost(Y_t, Y_hat)
        assert prob.ndim == 1
        rval = prob.mean()

        return rval

    def cost_matrix(self, Y, Y_hat):
        """
        Y must be one-hot binary. Y_hat is a hinge loss estimate
        of Y.

        Returns the result of `hinge_cost` on the converted targets. Since
        `hinge_cost` sums over axis 1, that is one value per example. A
        `Print` op wrapping `Y_hat` is stripped first.
        """
        assert hasattr(Y_hat, 'owner')
        owner = Y_hat.owner
        assert owner is not None
        if isinstance(owner.op, Print):
            assert len(owner.inputs) == 1
            Y_hat, = owner.inputs

        assert Y_hat.ndim == 2
        Y_t = self.target_convert(Y)
        prob = self.hinge_cost(Y_t, Y_hat)
        return prob

    def get_weight_decay(self, coeff):
        """
        Return `coeff` times the sum of squared entries of `W`.

        A string `coeff` is converted with `float` first.
        """
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        squared_norm = T.sqr(self.W).sum()
        return coeff * squared_norm

    def get_l1_weight_decay(self, coeff):
        """
        Return `coeff` times the sum of absolute entries of `W`.

        A string `coeff` is converted with `float` first.
        """
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        l1_norm = abs(self.W).sum()
        return coeff * l1_norm

    def _modify_updates(self, updates):
        # NOTE(review): this method is cut off in this copy of the source.
        # Only the `no_affine` check below is visible; its branch and
        # everything after it are missing. Restore the original body before
        # relying on this method.

        if self.no_affine:
class Factorized(Softmax):
    def __init__(self,
                 irange = None,
                 b_lr_scale = None,
                 V_lr_scale = None,
                 U_lr_scale = None,
                 Q_lr_scale = None,
                 Ui_lr_scale = None

        del self.self
        assert isinstance(n_classes, py_integer_types)

        self.output_space = VectorSpace(n_classes)

    def set_input_space(self, space):
        """
        Record the input space and allocate the shared parameters.

        space: the Space this layer receives; a TypeError is raised for
               anything that is not a Space instance

        Creates V (zeros, n_classes x input_dim), U and Ui (identity,
        input_dim x input_dim) and Q (zeros, input_dim x input_dim) and
        stores them in self._params as [U, Ui, V, Q].
        """
        self.input_space = space

        if not isinstance(space, Space):
            raise TypeError("Expected Space, got "+
                    str(space)+" of type "+str(type(space)))

        dim = space.get_total_dimension()
        self.input_dim = dim
        self.needs_reformat = not isinstance(space, VectorSpace)
        self.desired_space = VectorSpace(dim)

        if not self.needs_reformat:
            assert self.desired_space == self.input_space

        # Not used below; kept so a missing mlp/rng still fails here.
        rng = self.mlp.rng

        self._params = []
        prefix = self.layer_name

        self.V = sharedX(np.zeros((self.n_classes, dim), dtype=np.float32),
                         prefix + "_V")
        self.U = sharedX(np.identity(dim), prefix + "_U")
        self.Q = sharedX(np.zeros((dim, dim), dtype=np.float32),
                         prefix + "_Q")
        self.Ui = sharedX(np.identity(dim, dtype=np.float32), prefix + "_Ui")

        self._params = [self.U, self.Ui, self.V, self.Q]

    def fprop(self, state_below):
        """
        Forward pass of the factorized output layer.

        state_below: batch of inputs; reformatted to self.desired_space
                     when self.needs_reformat is set

        Returns the pair (Z, state_below), where Z = state_below . W^T and
        W = V . U.

        NOTE(review): both matrix products had lost their `T.dot(<arg>`
        prefix in the text this was recovered from. T.dot(self.V, self.U)
        and T.dot(state_below, W.T) are the reconstruction that matches the
        shapes allocated in set_input_space -- confirm against upstream.
        """
        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below, self.desired_space)

        for value in get_debug_values(state_below):
            if self.mlp.batch_size is not None and value.shape[0] != self.mlp.batch_size:
                # The condition checks self.mlp.batch_size, so the message
                # reports the same attribute (it used to read self.dbm).
                raise ValueError("state_below should have batch size "+str(self.mlp.batch_size)+" but has "+str(value.shape[0]))

        assert state_below.ndim == 2

        W = T.dot(self.V, self.U)
        assert W.ndim == 2

        Z = T.dot(state_below, W.T)

        rval = Z

        for value in get_debug_values(rval):
            if self.mlp.batch_size is not None:
                assert value.shape[0] == self.mlp.batch_size

        return (rval, state_below)

    def get_params(self):
        """
        Return an empty list.

        NOTE(review): set_input_space stores the shared variables in
        self._params, yet nothing is reported here -- confirm this is
        intended.
        """
        return []

    def get_lr_scalers(self):
        """
        Map each parameter that has a learning-rate scale to that scale.

        Any missing `<name>_lr_scale` attribute (b, V, U, Q, Ui) is first
        created with the value None. Returns an OrderedDict keyed by the
        parameter objects, in the order b, V, U, Q, Ui.
        """
        names = ('b', 'V', 'U', 'Q', 'Ui')

        # Fill in every default before any parameter is looked up.
        for name in names:
            attr = name + '_lr_scale'
            if not hasattr(self, attr):
                setattr(self, attr, None)

        scalers = OrderedDict()
        for name in names:
            scale = getattr(self, name + '_lr_scale')
            if scale is not None:
                scalers[getattr(self, name)] = scale

        return scalers

    def cost(self, Y, Y_hat):
        """
        Cost computed through the SqLoss op.

        Y: targets
        Y_hat: the pair returned by fprop, (prediction, hidden input)

        Returns (mean loss as float32, (hidden input, loss averaged over
        axis 0)).
        """
        prediction, hidden = Y_hat
        assert hasattr(prediction, 'owner')
        assert prediction.owner is not None
        loss = SqLoss()([hidden, self.Q, self.U, self.Ui, self.V, Y])[0]
        overall = T.mean(loss, dtype='float32')
        per_column = T.mean(loss, axis=0)
        return (overall, (hidden, per_column))

    def get_monitoring_channels(self):
        """
        Report min / mean / max of the row and column norms of W.

        NOTE(review): the product had lost its `T.dot(<arg>` prefix in the
        text this was recovered from. T.dot(self.V, self.U) is reconstructed
        from the parameter shapes in set_input_space -- confirm against
        upstream. The closing brackets of the OrderedDict were also missing.
        """
        W = T.dot(self.V, self.U)
        assert W.ndim == 2

        sq_W = T.sqr(W)

        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))

        return OrderedDict([
                            ('row_norms_min'  , row_norms.min()),
                            ('row_norms_mean' , row_norms.mean()),
                            ('row_norms_max'  , row_norms.max()),
                            ('col_norms_min'  , col_norms.min()),
                            ('col_norms_mean' , col_norms.mean()),
                            ('col_norms_max'  , col_norms.max()),
                            ])

    def censor_updates(self, updates):
Beispiel #8
class BinaryVector(VisibleLayer):
    A DBM visible layer consisting of binary random variables living
    in a VectorSpace.
    def __init__(self, nvis, bias_from_marginals=None):
            nvis: the dimension of the space
            bias_from_marginals: a dataset, whose marginals are used to
                            initialize the visible biases

        del self.self
        # Don't serialize the dataset
        del self.bias_from_marginals = VectorSpace(nvis)
        self.input_space =

        origin =

        if bias_from_marginals is None:
            init_bias = np.zeros((nvis, ))
            X = bias_from_marginals.get_design_matrix()
            assert X.max() == 1.
            assert X.min() == 0.
            assert not np.any((X > 0.) * (X < 1.))

            mean = X.mean(axis=0)

            mean = np.clip(mean, 1e-7, 1 - 1e-7)

            init_bias = inverse_sigmoid_numpy(mean)

        self.bias = sharedX(init_bias, 'visible_bias')

    def get_biases(self):
        """Return the current value held by the visible bias variable."""
        bias_var = self.bias
        return bias_var.get_value()

    def set_biases(self, biases):

    def get_total_state_space(self):
        """The total state space is the same as the input space."""
        space = self.get_input_space()
        return space

    def get_params(self):
        """Return a set holding the layer's single parameter, the bias."""
        params = set()
        params.add(self.bias)
        return params

    def sample(self,

        assert state_below is None

        msg = layer_above.downward_message(state_above)

        bias = self.bias

        z = msg + bias

        phi = T.nnet.sigmoid(z)

        rval = theano_rng.binomial(size=phi.shape, p=phi, dtype=phi.dtype, n=1)

        return rval

    def make_state(self, num_examples, numpy_rng):
        """
        Build a shared variable holding a sampled visible state.

        num_examples: number of rows to sample
        numpy_rng: generator used for the uniform draws

        Each unit is on where a U(0, 1) draw falls below the sigmoid of
        its bias.
        """
        shape = (num_examples, self.nvis)
        uniform_draws = numpy_rng.uniform(0., 1., shape)
        prob_on = sigmoid_numpy(self.bias.get_value())
        return sharedX(uniform_draws < prob_on, name='v_sample_shared')

    def expected_energy_term(self,

        assert state_below is None
        assert average_below is None
        assert average in [True, False]

        # Energy function is linear so it doesn't matter if we're averaging or not
        rval =, self.bias)

        assert rval.ndim == 1

        return rval
Beispiel #9
class BinaryVector(VisibleLayer):
    A DBM visible layer consisting of binary random variables living
    in a VectorSpace.

    def __init__(self,
            bias_from_marginals = None):
            nvis: the dimension of the space
            bias_from_marginals: a dataset, whose marginals are used to
                            initialize the visible biases

        del self.self
        # Don't serialize the dataset
        del self.bias_from_marginals = VectorSpace(nvis)
        self.input_space =

        origin =

        if bias_from_marginals is None:
            init_bias = np.zeros((nvis,))
            X = bias_from_marginals.get_design_matrix()
            assert X.max() == 1.
            assert X.min() == 0.
            assert not np.any( (X > 0.) * (X < 1.) )

            mean = X.mean(axis=0)

            mean = np.clip(mean, 1e-7, 1-1e-7)

            init_bias = inverse_sigmoid_numpy(mean)

        self.bias = sharedX(init_bias, 'visible_bias')

    def get_biases(self):
        """Return the current value held by the visible bias variable."""
        bias_var = self.bias
        return bias_var.get_value()

    def set_biases(self, biases):

    def get_total_state_space(self):
        """The total state space is the same as the input space."""
        space = self.get_input_space()
        return space

    def get_params(self):
        """Return a set holding the layer's single parameter, the bias."""
        params = set()
        params.add(self.bias)
        return params

    def sample(self, state_below = None, state_above = None,
            layer_above = None,
            theano_rng = None):
        """
        Draw a binary sample of the visible units.

        state_below: must be None (asserted)
        state_above, layer_above: state of the layer above and the layer
            object whose downward_message supplies the top-down input
        theano_rng: random stream providing `binomial`

        Returns a binomial draw (n=1) with probability
        sigmoid(message + bias).
        """
        assert state_below is None

        total_input = layer_above.downward_message(state_above) + self.bias
        prob_on = T.nnet.sigmoid(total_input)

        return theano_rng.binomial(size = prob_on.shape, p = prob_on,
                                   dtype = prob_on.dtype, n = 1)

    def make_state(self, num_examples, numpy_rng):
        """
        Build a shared variable holding a sampled visible state.

        num_examples: number of rows to sample
        numpy_rng: generator used for the uniform draws

        Each unit is on where a U(0, 1) draw falls below the sigmoid of
        its bias.
        """
        shape = (num_examples, self.nvis)
        uniform_draws = numpy_rng.uniform(0., 1., shape)
        prob_on = sigmoid_numpy(self.bias.get_value())
        return sharedX(uniform_draws < prob_on, name = 'v_sample_shared')

    def expected_energy_term(self, state, average, state_below = None, average_below = None):
        """
        Energy contribution of the visible bias, one value per example.

        state: batch of visible states
        average: must be True or False (asserted); the value is otherwise
                 unused because the term is linear in the state
        state_below, average_below: must be None (asserted)

        NOTE(review): the right-hand side had lost its leading
        `T.dot(<arg>` in the text this was recovered from.
        -T.dot(state, self.bias) is reconstructed by analogy with the
        `- b^T d` convention used by the hidden layers in this file; the
        sign in particular needs confirming against upstream.
        """
        assert state_below is None
        assert average_below is None
        assert average in [True, False]

        # Energy function is linear so it doesn't matter if we're averaging or not
        rval = -T.dot(state, self.bias)

        assert rval.ndim == 1

        return rval
Beispiel #10
class BinaryVectorMaxPool(HiddenLayer):
        A hidden layer that does max-pooling on binary vectors.
        It has two sublayers, the detector layer and the pooling
        layer. The detector layer is its downward state and the pooling
        layer is its upward state.

        TODO: this layer uses (pooled, detector) as its total state,
              which can be confusing when listing all the states in
              the network left to right. Change this and
              pylearn2.expr.probabilistic_max_pooling to use
              (detector, pooled)

    def __init__(self,
            irange = None,
            sparse_init = None,
            include_prob = 1.0,
            init_bias = 0.):

            include_prob: probability of including a weight element in the set
                    of weights initialized to U(-irange, irange). If not included
                    it is initialized to 0.

        del self.self

        self.b = sharedX( np.zeros((self.detector_layer_dim,)) + init_bias, name = layer_name + '_b')

    def set_input_space(self, space):
        """
        Note: this resets parameters!

        Records the input space, derives the detector / pooling dimensions
        and (re)initialises the weight matrix, wrapped in a MatrixMul.

        Weights are either U(-irange, irange) masked by include_prob, or
        sparse: `sparse_init` standard-normal entries per detector unit.

        Raises ValueError when detector_layer_dim is not a multiple of
        pool_size.

        NOTE(review): the two `else:` lines, the `self.irange` upper bound
        and the `W.name` references were missing from the text this was
        recovered from and are reconstructed -- confirm against upstream.
        """
        self.input_space = space

        if isinstance(space, VectorSpace):
            self.requires_reformat = False
            self.input_dim = space.dim
        else:
            self.requires_reformat = True
            self.input_dim = space.get_total_dimension()
            self.desired_space = VectorSpace(self.input_dim)

        if not (self.detector_layer_dim % self.pool_size == 0):
            raise ValueError("detector_layer_dim = %d, pool_size = %d. Should be divisible but remainder is %d" %
                    (self.detector_layer_dim, self.pool_size, self.detector_layer_dim % self.pool_size))

        self.h_space = VectorSpace(self.detector_layer_dim)
        # Floor division keeps the dimension an int even under true
        # division; divisibility was checked just above.
        self.pool_layer_dim = self.detector_layer_dim // self.pool_size
        self.output_space = VectorSpace(self.pool_layer_dim)

        rng = self.dbm.rng
        if self.irange is not None:
            assert self.sparse_init is None
            W = rng.uniform(-self.irange,
                                 self.irange,
                                 (self.input_dim, self.detector_layer_dim)) * \
                    (rng.uniform(0.,1., (self.input_dim, self.detector_layer_dim))
                     < self.include_prob)
        else:
            assert self.sparse_init is not None
            W = np.zeros((self.input_dim, self.detector_layer_dim))
            for i in xrange(self.detector_layer_dim):
                for j in xrange(self.sparse_init):
                    # Redraw until an unused row is found for this column.
                    idx = rng.randint(0, self.input_dim)
                    while W[idx, i] != 0:
                        idx = rng.randint(0, self.input_dim)
                    W[idx, i] = rng.randn()

        W = sharedX(W)
        W.name = self.layer_name + '_W'

        self.transformer = MatrixMul(W)

        W ,= self.transformer.get_params()
        assert W.name is not None

    def get_total_state_space(self):
        """Return the composite (pooled space, detector space)."""
        components = (self.output_space, self.h_space)
        return CompositeSpace(components)

    def get_params(self):
        """
        Return the transformer's parameters together with the bias.

        NOTE(review): both assertions had lost their operand in the text
        this was recovered from; `self.b.name` and `W.name` are
        reconstructions -- confirm against upstream.
        """
        assert self.b.name is not None
        W ,= self.transformer.get_params()
        assert W.name is not None
        return self.transformer.get_params().union([self.b])

    def get_weight_decay(self, coeff):
        """
        L2 penalty on the transformer weights: coeff * sum(W ** 2).

        coeff: a float, or a string parseable as a float
        """
        scale = float(coeff) if isinstance(coeff, str) else coeff
        assert isinstance(scale, float)
        weights ,= self.transformer.get_params()
        return scale * T.sqr(weights).sum()

    def get_weights(self):
        """
        Return the numeric value of the weight matrix.

        Raises NotImplementedError when the layer reformats its input.
        """
        if self.requires_reformat:
            # This is not really an unimplemented case.
            # We actually don't know how to format the weights
            # in design space. We got the data in topo space
            # and we don't have access to the dataset
            raise NotImplementedError()
        weights ,= self.transformer.get_params()
        return weights.get_value()

    def set_weights(self, weights):
        W, = self.transformer.get_params()

    def set_biases(self, biases):

    def get_biases(self):
        """Return the current value held by the bias variable."""
        bias_var = self.b
        return bias_var.get_value()

    def get_weights_format(self):
        """Return the axis labels of the weight matrix, ('v', 'h')."""
        axes = ('v', 'h')
        return axes

    def get_weights_view_shape(self):
        """
        Return (rows, cols) for displaying the weights, one row per pool.

        Raises NotImplementedError when pool_size is 1.
        """
        total = self.detector_layer_dim
        cols = self.pool_size
        if cols == 1:
            # Let the PatchViewer decide how to arrange the units
            # when they're not pooled
            raise NotImplementedError()
        # When they are pooled, make each pooling unit have one row.
        # Floor division keeps the row count an int under true division.
        rows = total // cols
        return rows, cols

    def get_weights_topo(self):
        """
        Return the weights arranged topologically, axes ('b', 0, 1, 'c').

        Raises NotImplementedError unless the input space is a Conv2DSpace.
        """
        space = self.input_space
        if not isinstance(space, Conv2DSpace):
            raise NotImplementedError()

        weights ,= self.transformer.get_params()

        # One filter per detector unit, laid out in the input's own axis order.
        filters = weights.T.reshape((self.detector_layer_dim,
                                     self.input_space.shape[0],
                                     self.input_space.shape[1],
                                     self.input_space.nchannels))
        filters = Conv2DSpace.convert(filters, self.input_space.axes,
                                      ('b', 0, 1, 'c'))

        # Compile and immediately evaluate to get a numeric value.
        return function([], filters)()

    def upward_state(self, total_state):
        """Return the first element of the (pooled, detector) pair."""
        pooled, _detector = total_state
        return pooled

    def downward_state(self, total_state):
        """Return the second element of the (pooled, detector) pair."""
        _pooled, detector = total_state
        return detector

    def get_monitoring_channels_from_state(self, state):
        """
        Summary statistics of the pooled and detector activations.

        state: the pair (P, H)

        For each monitored variable, the max, min, mean and range are taken
        over axis 0, and the max / mean / min of each of those is reported.
        With pool_size == 1 only P is monitored and keys carry no prefix;
        otherwise P and H are reported with the prefixes 'p_' and 'h_'.

        The original listed ('min_max', v_min.max()) twice, so no 'min_min'
        channel was ever emitted; the second entry now reports
        v_min.min().
        """
        P, H = state

        rval = {}

        if self.pool_size == 1:
            vars_and_prefixes = [ (P,'') ]
        else:
            vars_and_prefixes = [ (P, 'p_'), (H, 'h_') ]

        for var, prefix in vars_and_prefixes:
            v_max = var.max(axis=0)
            v_min = var.min(axis=0)
            v_mean = var.mean(axis=0)
            v_range = v_max - v_min

            for key, val in [
                    ('max_max', v_max.max()),
                    ('max_mean', v_max.mean()),
                    ('max_min', v_max.min()),
                    ('min_max', v_min.max()),
                    ('min_mean', v_min.mean()),
                    ('min_min', v_min.min()),
                    ('range_max', v_range.max()),
                    ('range_mean', v_range.mean()),
                    ('range_min', v_range.min()),
                    ('mean_max', v_mean.max()),
                    ('mean_mean', v_mean.mean()),
                    ('mean_min', v_mean.min())
                    ]:
                rval[prefix+key] = val

        return rval

    def get_l1_act_cost(self, state, target, coeff, eps = None):
        """
        L1 penalty pulling mean activations towards a target.

        state: the pair (P, H)
        target, coeff, eps: floats when pool_size == 1; otherwise pairs
            giving one value for P and one for H (eps may be None, meaning
            zero tolerance)

        For each penalised variable the cost is
        coeff * mean(max(|mean_over_batch - target| - eps, 0)).

        NOTE(review): three `else:` lines and the `continue` after
        `if c == 0.:` were missing from the text this was recovered from
        and are reconstructed from the surrounding structure -- confirm
        against upstream.
        """
        rval = 0.

        P, H = state

        if self.pool_size == 1:
            # If the pool size is 1 then pools = detectors
            # and we should not penalize pools and detectors separately
            assert len(state) == 2
            assert isinstance(target, float)
            assert isinstance(coeff, float)
            _, state = state
            state = [state]
            target = [target]
            coeff = [coeff]
            if eps is None:
                eps = [0.]
            else:
                eps = [eps]
        else:
            assert all([len(elem) == 2 for elem in [state, target, coeff]])
            if eps is None:
                eps = [0., 0.]
            if target[1] < target[0]:
                warnings.warn("Do you really want to regularize the detector units to be sparser than the pooling units?")

        for s, t, c, e in safe_zip(state, target, coeff, eps):
            assert all([isinstance(elem, float) for elem in [t, c, e]])
            if c == 0.:
                # A zero coefficient contributes nothing; skip the graph work.
                continue
            m = s.mean(axis=0)
            assert m.ndim == 1
            rval += T.maximum(abs(m-t)-e,0.).mean()*c

        return rval

    def sample(self, state_below = None, state_above = None,
            layer_above = None,
            theano_rng = None):
        """
        Sample the pooled and detector units given the neighbouring states.

        state_below: state of the layer below
        state_above, layer_above: optional state of the layer above and the
            layer object that turns it into a top-down message
        theano_rng: required random stream

        Returns (p_sample, h_sample).
        Raises ValueError when theano_rng is None.

        The `else:` before `msg = None` was missing, which discarded the
        top-down message unconditionally; it is restored here.
        """
        if theano_rng is None:
            raise ValueError("theano_rng is required; it just defaults to None so that it may appear after layer_above / state_above in the list.")

        if state_above is not None:
            msg = layer_above.downward_message(state_above)
        else:
            msg = None

        if self.requires_reformat:
            state_below = self.input_space.format_as(state_below, self.desired_space)

        z = self.transformer.lmul(state_below) + self.b
        p, h, p_sample, h_sample = max_pool_channels(z,
                self.pool_size, msg, theano_rng)

        return p_sample, h_sample

    def downward_message(self, downward_state):
        """
        Message sent to the layer below: the transposed linear map applied
        to downward_state, converted back to the input space when the layer
        reformats its input.
        """
        message = self.transformer.lmul_T(downward_state)
        if not self.requires_reformat:
            return message
        return self.desired_space.format_as(message, self.input_space)

    def make_state(self, num_examples, numpy_rng):
        """ Returns a shared variable containing an actual state
           (not a mean field state) for this variable.

        t1 = time.time()

        empty_input = self.h_space.get_origin_batch(num_examples)
        h_state = sharedX(empty_input)

        default_z = T.zeros_like(h_state) + self.b

        theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 16))

        p_exp, h_exp, p_sample, h_sample = max_pool_channels(
                z = default_z,
                pool_size = self.pool_size,
                theano_rng = theano_rng)

        assert h_sample.dtype == default_z.dtype

        p_state = sharedX( self.output_space.get_origin_batch(

        t2 = time.time()

        f = function([], updates = {
            p_state : p_sample,
            h_state : h_sample

        t3 = time.time()


        t4 = time.time()

        print str(self)+'.make_state took',t4-t1
        print '\tcompose time:',t2-t1
        print '\tcompile time:',t3-t2
        print '\texecute time:',t4-t3 = 'p_sample_shared' = 'h_sample_shared'

        return p_state, h_state

    def expected_energy_term(self, state, average, state_below, average_below):
        """
        Energy contribution of this layer, one value per example.

        state: this layer's total state; only its downward state is used
        state_below: state of the layer below, reformatted when needed
        average, average_below: unused, since the term is linear

        NOTE(review): `bias_term` had lost its `T.dot(<arg>` prefix in the
        text this was recovered from. T.dot(downward_state, self.b) is
        reconstructed from the `b^T d` comment below -- confirm against
        upstream.
        """
        if self.requires_reformat:
            if not isinstance(state_below, tuple):
                for sb in get_debug_values(state_below):
                    if sb.shape[0] != self.dbm.batch_size:
                        raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0]))
                    assert reduce(lambda x,y: x * y, sb.shape[1:]) == self.input_dim

            state_below = self.input_space.format_as(state_below, self.desired_space)

        downward_state = self.downward_state(state)

        # Energy function is linear so it doesn't matter if we're averaging or not
        # Specifically, our terms are -u^T W d - b^T d where u is the upward state of layer below
        # and d is the downward state of this layer

        bias_term = T.dot(downward_state, self.b)
        weights_term = (self.transformer.lmul(state_below) * downward_state).sum(axis=1)

        rval = -bias_term - weights_term

        assert rval.ndim == 1

        return rval

    def mf_update(self, state_below, state_above, layer_above = None, double_weights = False, iter_name = None):
        """
        Mean-field update of the pooled and detector units.

        state_below: state of the layer below
        state_above, layer_above: optional state of the layer above and the
            layer that turns it into a top-down message
        double_weights: when true, the bottom-up input is doubled
        iter_name: label used in variable names; defaults to 'anon'

        Returns (p, h).

        NOTE(review): the `<var>.name` assignment targets and the `else:`
        before `msg = None` were missing from the text this was recovered
        from and are reconstructed -- confirm against upstream.
        """
        if self.requires_reformat:
            if not isinstance(state_below, tuple):
                for sb in get_debug_values(state_below):
                    if sb.shape[0] != self.dbm.batch_size:
                        raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0]))
                    assert reduce(lambda x,y: x * y, sb.shape[1:]) == self.input_dim

            state_below = self.input_space.format_as(state_below, self.desired_space)

        if iter_name is None:
            iter_name = 'anon'

        if state_above is not None:
            assert layer_above is not None
            msg = layer_above.downward_message(state_above)
            msg.name = 'msg_from_'+layer_above.layer_name+'_to_'+self.layer_name+'['+iter_name+']'
        else:
            msg = None

        if double_weights:
            state_below = 2. * state_below
            state_below.name = self.layer_name + '_'+iter_name + '_2state'
        z = self.transformer.lmul(state_below) + self.b
        if self.layer_name is not None and iter_name is not None:
            z.name = self.layer_name + '_' + iter_name + '_z'
        p,h = max_pool_channels(z, self.pool_size, msg)

        p.name = self.layer_name + '_p_' + iter_name
        h.name = self.layer_name + '_h_' + iter_name

        return p, h
Beispiel #11
class Softmax(HiddenLayer):

    def __init__(self, n_classes, layer_name, irange = None,
                 sparse_init = None, W_lr_scale = None):

        if isinstance(W_lr_scale, str):
            W_lr_scale = float(W_lr_scale)

        del self.self

        assert isinstance(n_classes, int)

        self.output_space = VectorSpace(n_classes)
        self.b = sharedX( np.zeros((n_classes,)), name = 'softmax_b')

    def get_lr_scalers(self):
        """
        Return {W: W_lr_scale} when a scale is set, otherwise an empty dict.

        A missing W_lr_scale attribute (old pickle files) is created as
        None first.
        """
        # Patch old pickle files
        if not hasattr(self, 'W_lr_scale'):
            self.W_lr_scale = None

        scale = self.W_lr_scale
        if scale is None:
            return {}

        assert isinstance(scale, float)
        return {self.W: scale}

    def get_total_state_space(self):
        """The total state space is the output space."""
        space = self.output_space
        return space

    def get_monitoring_channels_from_state(self, state):
        """
        Statistics of the largest entry of each row of `state`.

        Returns a dict with the mean, max and min over rows of the
        per-row maximum (taken over axis 1). The closing brace of the
        dict literal was missing and is restored.
        """
        mx = state.max(axis=1)

        return {
                'mean_max_class' : mx.mean(),
                'max_max_class' : mx.max(),
                'min_max_class' : mx.min()
                }

    def set_input_space(self, space):
        """
        Record the input space and initialise the weight matrix.

        space: the Space this layer receives; a TypeError is raised for
               anything that is not a Space instance

        Weights are drawn from U(-irange, irange) when irange is set;
        otherwise `sparse_init` standard-normal entries are placed in each
        column. The `else:` separating the two schemes was missing, so the
        sparse branch always ran and asserted; it is restored here.
        """
        self.input_space = space

        if not isinstance(space, Space):
            raise TypeError("Expected Space, got "+
                    str(space)+" of type "+str(type(space)))

        self.input_dim = space.get_total_dimension()
        self.needs_reformat = not isinstance(space, VectorSpace)

        self.desired_space = VectorSpace(self.input_dim)

        if not self.needs_reformat:
            assert self.desired_space == self.input_space

        rng = self.dbm.rng

        if self.irange is not None:
            assert self.sparse_init is None
            W = rng.uniform(-self.irange,self.irange, (self.input_dim,self.n_classes))
        else:
            assert self.sparse_init is not None
            W = np.zeros((self.input_dim, self.n_classes))
            for i in xrange(self.n_classes):
                for j in xrange(self.sparse_init):
                    # Redraw until an unused row is found for this column.
                    idx = rng.randint(0, self.input_dim)
                    while W[idx, i] != 0.:
                        idx = rng.randint(0, self.input_dim)
                    W[idx, i] = rng.randn()

        self.W = sharedX(W,  'softmax_W' )

        self._params = [ self.b, self.W ]
    def get_weights_topo(self):
        """
        Return the weights arranged topologically, axes ('b', 0, 1, 'c').

        Raises NotImplementedError unless the input space is a Conv2DSpace.
        """
        space = self.input_space
        if not isinstance(space, Conv2DSpace):
            raise NotImplementedError()
        # One row per class, then back into the layout of the input space.
        per_class = self.W.get_value().T
        topo = self.desired_space.format_as(per_class, self.input_space)
        return Conv2DSpace.convert_numpy(topo, self.input_space.axes,
                                         ('b', 0, 1, 'c'))

    def get_weights(self):
        """
        Return the numeric value of W.

        Raises NotImplementedError unless the input space is a VectorSpace.
        """
        if isinstance(self.input_space, VectorSpace):
            return self.W.get_value()
        raise NotImplementedError()

    def set_weights(self, weights):

    def set_biases(self, biases):

    def get_biases(self):
        """Return the current value held by the bias variable."""
        bias_var = self.b
        return bias_var.get_value()

    def get_weights_format(self):
        """Return the axis labels of the weight matrix, ('v', 'h')."""
        axes = ('v', 'h')
        return axes

    def sample(self, state_below = None, state_above = None,
            layer_above = None,
            theano_rng = None):
        """
        Draw a one-of-n sample of the softmax units given the layer below.

        state_below: state of the layer below
        state_above, layer_above: not supported; state_above must be None
        theano_rng: required random stream providing `multinomial`

        Raises NotImplementedError when state_above is given and ValueError
        when theano_rng is None.

        NOTE(review): the pre-activation had lost its `T.dot(<arg>` prefix
        in the text this was recovered from; T.dot(state_below, self.W) is
        the reconstruction -- confirm against upstream.
        """
        if state_above is not None:
            # If you implement this case, also add a unit test for it.
            # Or at least add a warning that it is not tested.
            raise NotImplementedError()

        if theano_rng is None:
            raise ValueError("theano_rng is required; it just defaults to None so that it may appear after layer_above / state_above in the list.")

        # patch old pickle files
        if not hasattr(self, 'needs_reformat'):
            self.needs_reformat = self.needs_reshape
            del self.needs_reshape

        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below, self.desired_space)

        z = T.dot(state_below, self.W) + self.b
        h_exp = T.nnet.softmax(z)
        h_sample = theano_rng.multinomial(pvals = h_exp, dtype = h_exp.dtype)

        return h_sample

    def mf_update(self, state_below, state_above = None, layer_above = None, double_weights = False, iter_name = None):
        """
        Mean-field update: softmax(state_below . W + b).

        state_below: state of the layer below
        state_above, double_weights: not supported; NotImplementedError is
            raised when state_above is given or double_weights is true
        layer_above, iter_name: unused

        NOTE(review): the pre-activation had lost its `T.dot(<arg>` prefix
        in the text this was recovered from; T.dot(state_below, self.W) is
        the reconstruction -- confirm against upstream.
        """
        if state_above is not None:
            raise NotImplementedError()

        if double_weights:
            raise NotImplementedError()

        # patch old pickle files
        if not hasattr(self, 'needs_reformat'):
            self.needs_reformat = self.needs_reshape
            del self.needs_reshape

        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below, self.desired_space)

        # FIXME(review): this loads a pickle from a hard-coded absolute path
        # on every call and routes state_below through Verify. It looks like
        # leftover debugging code and will fail wherever that file is
        # absent; left in place here -- confirm it can be removed.
        from pylearn2.utils import serial
        X = serial.load('/u/goodfeli/galatea/dbm/inpaint/expdir/cifar10_N3_interm_2_features.pkl')
        state_below = Verify(X,'features')(state_below)

        assert self.W.ndim == 2
        assert state_below.ndim == 2

        b = self.b

        Z = T.dot(state_below, self.W) + b

        rval = T.nnet.softmax(Z)

        return rval

    def downward_message(self, downward_state):
        """
        Message sent to the layer below, converted to the input space.

        NOTE(review): the product had lost its `T.dot(<arg>` prefix in the
        text this was recovered from; T.dot(downward_state, self.W.T) is
        the reconstruction -- confirm against upstream.
        """
        rval = T.dot(downward_state, self.W.T)

        rval = self.desired_space.format_as(rval, self.input_space)

        return rval

    def recons_cost(self, Y, Y_hat_unmasked, drop_mask_Y, scale):
        """
        Masked negative log-likelihood of the targets under the softmax.

            scale is because the visible layer also goes into the
            cost. it uses the mean over units and examples, so that
            the scale of the cost doesn't change too much with batch
            size or example size.
            we need to multiply this cost by scale to make sure that
            it is put on the same scale as the reconstruction cost
            for the visible units. ie, scale should be 1/nvis

        Y: targets, multiplied elementwise with the log-probabilities
        Y_hat_unmasked: softmax output (optionally wrapped in a Print op)
        drop_mask_Y: per-example mask applied to the log-likelihood
        """
        Y_hat = Y_hat_unmasked
        assert hasattr(Y_hat, 'owner')
        owner = Y_hat.owner
        assert owner is not None
        op = owner.op
        if isinstance(op, Print):
            # Look through a debugging Print op to the variable it wraps.
            assert len(owner.inputs) == 1
            Y_hat, = owner.inputs
            owner = Y_hat.owner
            op = owner.op
        assert isinstance(op, T.nnet.Softmax)
        # Work from the pre-softmax input for numerical stability.
        z ,= owner.inputs
        assert z.ndim == 2

        z = z - z.max(axis=1).dimshuffle(0, 'x')
        # log softmax(z) = z - log(sum(exp(z))). The log around the
        # partition sum was missing, so the value was not a log-probability.
        log_prob = z - T.log(T.exp(z).sum(axis=1).dimshuffle(0, 'x'))
        # we use sum and not mean because this is really one variable per row
        log_prob_of = (Y * log_prob).sum(axis=1)
        masked = log_prob_of * drop_mask_Y
        assert masked.ndim == 1

        rval = masked.mean() * scale

        return - rval

    def make_state(self, num_examples, numpy_rng):
        """ Returns a shared variable containing an actual state
           (not a mean field state) for this variable.

        t1 = time.time()

        empty_input = self.output_space.get_origin_batch(num_examples)
        h_state = sharedX(empty_input)

        default_z = T.zeros_like(h_state) + self.b

        theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 16))

        h_exp = T.nnet.softmax(default_z)

        h_sample = theano_rng.multinomial(pvals = h_exp, dtype = h_exp.dtype)

        p_state = sharedX( self.output_space.get_origin_batch(

        t2 = time.time()

        f = function([], updates = {
            h_state : h_sample

        t3 = time.time()


        t4 = time.time()

        print str(self)+'.make_state took',t4-t1
        print '\tcompose time:',t2-t1
        print '\tcompile time:',t3-t2
        print '\texecute time:',t4-t3 = 'softmax_sample_shared'

        return h_state

    def get_weight_decay(self, coeff):
        """
        L2 penalty on the weights: coeff * sum(W ** 2).

        coeff: a float, or a string parseable as a float
        """
        scale = float(coeff) if isinstance(coeff, str) else coeff
        assert isinstance(scale, float)
        squared_weights = T.sqr(self.W)
        return scale * squared_weights.sum()

    def expected_energy_term(self, state, average, state_below, average_below):
        """
        Energy contribution of this layer, one value per example.

        state: this layer's state
        state_below: state of the layer below, reformatted when needed
        average, average_below: unused, since the term is linear

        NOTE(review): both products had lost their `T.dot(<arg>` prefix in
        the text this was recovered from. T.dot(state, self.b) and
        T.dot(state_below, self.W) are reconstructed from the comment
        below -- confirm against upstream.
        """
        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below, self.desired_space)

        # Energy function is linear so it doesn't matter if we're averaging or not
        # Specifically, our terms are -u^T W d - b^T d where u is the upward state of layer below
        # and d is the downward state of this layer

        bias_term = T.dot(state, self.b)
        weights_term = (T.dot(state_below, self.W) * state).sum(axis=1)

        rval = -bias_term - weights_term

        assert rval.ndim == 1

        return rval
Beispiel #12
class MultiSoftmax(Layer):
    def __init__(self,

        if isinstance(W_lr_scale, str):
            W_lr_scale = float(W_lr_scale)

        del self.self

        assert isinstance(n_classes, py_integer_types)

        self.output_space = MatrixSpace(n_groups, n_classes)
        self.b = sharedX(np.zeros((
        )), name='softmax_b')

    def get_lr_scalers(self):
        """
        Map W and b to their learning-rate scales, where those are set.

        A missing b_lr_scale attribute is created as None; W_lr_scale is
        expected to exist already. Returns an OrderedDict with W before b.
        """
        scalers = OrderedDict()

        w_scale = self.W_lr_scale
        if w_scale is not None:
            assert isinstance(w_scale, float)
            scalers[self.W] = w_scale

        if not hasattr(self, 'b_lr_scale'):
            self.b_lr_scale = None

        b_scale = self.b_lr_scale
        if b_scale is not None:
            assert isinstance(b_scale, float)
            scalers[self.b] = b_scale

        return scalers

    def get_monitoring_channels(self):
        """This layer reports no monitoring channels."""
        channels = OrderedDict()
        return channels

    def get_monitoring_channels_from_state(self, state, target=None):
        """This layer reports no state-dependent channels; arguments are ignored."""
        channels = OrderedDict()
        return channels

    def set_input_space(self, space):
        """
        Record the input space and initialise the 3-D weight tensor.

        space: the Space this layer receives; a TypeError is raised for
               anything that is not a Space instance

        W has shape (input_dim, n_groups, n_classes), drawn from
        U(-irange, irange) or from N(0, istdev^2). Any other
        initialisation raises NotImplementedError.

        NOTE(review): the end of the `raise TypeError(...)` call and the
        two `else:` lines were missing from the text this was recovered
        from and are reconstructed -- confirm against upstream.
        """
        self.input_space = space

        if not isinstance(space, Space):
            raise TypeError("Expected Space, got " + str(space) + " of type " +
                            str(type(space)))

        self.input_dim = space.get_total_dimension()
        self.needs_reformat = not isinstance(space, VectorSpace)

        if self.no_affine:
            desired_dim = self.n_classes
            assert self.input_dim == desired_dim
        else:
            desired_dim = self.input_dim
        self.desired_space = VectorSpace(desired_dim)

        if not self.needs_reformat:
            assert self.desired_space == self.input_space

        rng = self.mlp.rng

        if self.irange is not None:
            assert self.istdev is None
            assert self.sparse_init is None
            W = rng.uniform(-self.irange, self.irange,
                            (self.input_dim, self.n_groups, self.n_classes))
        elif self.istdev is not None:
            assert self.sparse_init is None
            W = rng.randn(self.input_dim, self.n_groups,
                          self.n_classes) * self.istdev
        else:
            raise NotImplementedError()

        self.W = sharedX(W, 'softmax_W')

        self._params = [self.b, self.W]

    def get_weights_topo(self):
        """
        Return the weights arranged topologically, axes ('b', 0, 1, 'c').

        Raises NotImplementedError unless the input space is a Conv2DSpace.
        """
        space = self.input_space
        if not isinstance(space, Conv2DSpace):
            raise NotImplementedError()
        transposed = self.W.get_value().T
        topo = self.desired_space.format_as(transposed, self.input_space)
        return Conv2DSpace.convert_numpy(topo, self.input_space.axes,
                                         ('b', 0, 1, 'c'))

    def get_weights(self):
        """
        Return the numeric value of W.

        Raises NotImplementedError unless the input space is a VectorSpace.
        """
        if isinstance(self.input_space, VectorSpace):
            return self.W.get_value()
        raise NotImplementedError()

    def set_weights(self, weights):

    def set_biases(self, biases):

    def get_biases(self):
        """Return the current value held by the bias variable."""
        bias_var = self.b
        return bias_var.get_value()

    def get_weights_format(self):
        """Return the axis labels of the weights, ('v', 'h')."""
        axes = ('v', 'h')
        return axes

    def fprop(self, state_below):
        """
        Forward pass: a softmax per group over the affine output.

        state_below: 2-D batch of inputs, reformatted to self.desired_space
                     when self.needs_reformat is set

        Returns batched_softmax(tensordot(state_below, W) + b), with W
        contracted over its first axis.

        NOTE(review): the ends of the `format_as(...)` call and of the
        error message were missing from the text this was recovered from;
        `self.desired_space)` and `str(value.shape[0]))` are
        reconstructions -- confirm against upstream.
        """
        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        for value in get_debug_values(state_below):
            if self.mlp.batch_size is not None and value.shape[
                    0] != self.mlp.batch_size:
                # The condition checks self.mlp.batch_size, so the message
                # reports the same attribute (it used to read self.dbm).
                raise ValueError("state_below should have batch size " +
                                 str(self.mlp.batch_size) + " but has " +
                                 str(value.shape[0]))

        assert state_below.ndim == 2

        assert self.W.ndim == 3

        Z = T.tensordot(state_below, self.W, axes=[[1], [0]]) + self.b

        rval = batched_softmax(Z)

        for value in get_debug_values(rval):
            if self.mlp.batch_size is not None:
                assert value.shape[0] == self.mlp.batch_size

        return rval

    def cost(self, Y, Y_hat):
        """Scalar cost: cost_matrix reduced by cost_from_cost_matrix."""
        per_element = self.cost_matrix(Y, Y_hat)
        return self.cost_from_cost_matrix(per_element)

    def cost_from_cost_matrix(self, cost_matrix):
        """Sum the cost over axis 2, then average what remains."""
        summed = cost_matrix.sum(axis=2)
        return summed.mean()

    def cost_matrix(self, Y, Y_hat):
        """
        Elementwise cross-entropy term: -Y * log(Y_hat + 1e-6).

        The small constant keeps the log finite when Y_hat is zero.
        """
        log_pred = T.log(Y_hat + 0.000001)
        return -Y * log_pred

    def get_weight_decay(self, coeff):
        """
        L2 penalty on the weights: coeff * sum(W ** 2).

        coeff: a float, a string parseable as a float, or an object with
               a `dtype` attribute
        """
        scale = float(coeff) if isinstance(coeff, str) else coeff
        assert isinstance(scale, float) or hasattr(scale, 'dtype')
        squared_weights = T.sqr(self.W)
        return scale * squared_weights.sum()

    def get_l1_weight_decay(self, coeff):
        """
        L1 penalty on the weights: coeff * sum(|W|).

        coeff: a float, a string parseable as a float, or an object with
               a `dtype` attribute
        """
        scale = float(coeff) if isinstance(coeff, str) else coeff
        assert isinstance(scale, float) or hasattr(scale, 'dtype')
        return scale * abs(self.W).sum()

    def censor_updates(self, updates):
        """
        Clip the norms of the proposed update for W, in place in `updates`.

        updates: dict-like mapping shared variables to their new values

        With max_row_norm set, norms taken over axis 1 are clipped to it;
        with max_col_norm set (max_row_norm must then be None), norms taken
        over axis 0 are clipped instead.

        NOTE(review): fprop asserts W.ndim == 3, while dimshuffle(0, 'x')
        in the row branch looks written for a 2-D W -- confirm the row
        branch works for this layer.
        """
        row_limit = self.max_row_norm
        if row_limit is not None:
            weights = self.W
            if weights in updates:
                proposed = updates[weights]
                norms = T.sqrt(T.sum(T.sqr(proposed), axis=1))
                clipped = T.clip(norms, 0, row_limit)
                # 1e-7 guards against division by a zero norm.
                scale = clipped / (1e-7 + norms)
                updates[weights] = proposed * scale.dimshuffle(0, 'x')

        col_limit = self.max_col_norm
        if col_limit is not None:
            assert self.max_row_norm is None
            weights = self.W
            if weights in updates:
                proposed = updates[weights]
                norms = T.sqrt(T.sum(T.sqr(proposed), axis=0))
                clipped = T.clip(norms, 0, col_limit)
                updates[weights] = proposed * (clipped / (1e-7 + norms))
class ClassBasedOutput(Softmax):
    # TODO cleanup target, class name mess, it's confusing
    def __init__(self, n_clusters = None, classclusterpath= None, clusters_scope = None, **kwargs):
        """
        Build a class-based (two-level) softmax output layer.

        Parameters
        ----------
        n_clusters : int
            Number of word clusters.
        classclusterpath : str
            File name, relative to
            ``${PYLEARN2_DATA_PATH}/PennTreebankCorpus/``, of the saved
            word-to-cluster data.
        clusters_scope : object, optional
            Accepted but ignored; the scope is always recomputed here.
        kwargs : dict
            Forwarded to the `Softmax` constructor.
        """
        super(ClassBasedOutput, self).__init__(**kwargs)
        self.n_clusters = n_clusters

        # The single softmax bias is replaced by one bias per cluster and
        # one bias vector per (cluster, class) pair.
        del self.b
        self.b_class = sharedX(np.zeros((self.n_clusters, self.n_classes)), name = 'softmax_b_class')
        self.b_cluster = sharedX( np.zeros((self.n_clusters)), name = 'softmax_b_clusters')

        # NOTE(review): the file is still loaded, so a bad path fails early,
        # but its contents are not used; cluster assignments below are
        # random placeholders. Confirm whether the loaded data should be
        # used instead.
        serial.load("${PYLEARN2_DATA_PATH}/PennTreebankCorpus/" + classclusterpath)

        # cluster_targets[i] is the cluster index assigned to class (word) i.
        self.cluster_targets = np.random.randint(0,n_clusters,size=(self.n_classes))

        # minlength guarantees one count per cluster even when the highest
        # cluster indices receive no class; without it zip() silently drops
        # those clusters from the mapping.
        counts = np.bincount(self.cluster_targets, minlength=n_clusters)
        self.clusters_scope = dict(zip(range(n_clusters), counts))

    def set_input_space(self, space):
        """
        Record the input space and (re)initialize the layer's parameters.

        Creates `W_cluster` with shape (input_dim, n_clusters) and `W_class`
        with shape (n_clusters, input_dim, n_classes), unless
        `self.no_affine` is set, in which case the layer has no parameters.

        Raises
        ------
        TypeError
            If `space` is not a `Space`.
        NotImplementedError
            If neither `irange` nor `istdev` is set.
        """
        self.input_space = space
        if not isinstance(space, Space):
            raise TypeError("Expected Space, got "+
                    str(space)+" of type "+str(type(space)))

        self.input_dim = space.get_total_dimension()
        self.needs_reformat = not isinstance(space, VectorSpace)
        if self.no_affine:
            desired_dim = self.n_classes
            assert self.input_dim == desired_dim
        else:
            desired_dim = self.input_dim
        self.desired_space = VectorSpace(desired_dim)

        if not self.needs_reformat:
            assert self.desired_space == self.input_space

        rng = self.mlp.rng

        if self.no_affine:
            self._params = []
            return

        if self.irange is not None:
            assert self.istdev is None
            assert self.sparse_init is None
            W_cluster = rng.uniform(-self.irange,self.irange, (self.input_dim, self.n_clusters))
            W_class = rng.uniform(-self.irange,self.irange, (self.n_clusters, self.input_dim, self.n_classes))
        elif self.istdev is not None:
            assert self.sparse_init is None
            W_cluster = rng.randn(self.input_dim, self.n_clusters) * self.istdev
            W_class = rng.randn(self.n_clusters, self.input_dim, self.n_classes) * self.istdev
        else:
            raise NotImplementedError()

        # Each cluster only uses its first clusters_scope[cluster] output
        # columns; the remaining columns are padding and are zeroed. The
        # earlier slice zeroed the used columns instead, which its own
        # comment flagged as probably reversed.
        for cluster, n_words in self.clusters_scope.items():
            W_class[int(cluster), :, int(n_words):] = 0.

        self.W_class = sharedX(W_class,  'softmax_W_class' )
        self.W_cluster = sharedX(W_cluster,  'softmax_W_cluster' )

        self._params = [self.b_class, self.W_class, self.b_cluster, self.W_cluster]

    def get_layer_monitoring_channels(self, state_below=None,
                                    state=None, targets=None):
        """
        Return an OrderedDict of monitoring channels for this layer.

        Always reports min/mean/max of the row and column norms of
        `W_cluster` and `W_class`. If `state_below` or `state` is given,
        also reports statistics of the per-example maximum class
        probability, and, if `targets` is given, the negative
        log-likelihood together with the perplexity and entropy derived
        from it.

        Parameters
        ----------
        state_below : tensor, optional
            Input to the layer; used to compute `state` when it is missing.
        state : tuple, optional
            ``(class_probabilities, cluster_probabilities)`` as returned by
            `fprop`.
        targets : tensor, optional
            Targets passed to `cost`. The default is None: the previous
            default (the NotImplementedError class) made the target branch
            run with an unusable value whenever targets were omitted.
        """
        if self.no_affine:
            return OrderedDict()

        W_class = self.W_class
        W_cluster = self.W_cluster

        assert W_class.ndim == 3
        assert W_cluster.ndim == 2

        sq_W = T.sqr(W_cluster)
        sq_W_class = T.sqr(W_class)

        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))

        row_norms_class = T.sqrt(sq_W_class.sum(axis=1))
        col_norms_class = T.sqrt(sq_W_class.sum(axis=0))

        rval = OrderedDict([
                            ('row_norms_min'  , row_norms.min()),
                            ('row_norms_mean' , row_norms.mean()),
                            ('row_norms_max'  , row_norms.max()),
                            ('col_norms_min'  , col_norms.min()),
                            ('col_norms_mean' , col_norms.mean()),
                            ('col_norms_max'  , col_norms.max()),
                            ('class_row_norms_min'  , row_norms_class.min()),
                            ('class_row_norms_mean' , row_norms_class.mean()),
                            ('class_row_norms_max'  , row_norms_class.max()),
                            ('class_col_norms_min'  , col_norms_class.min()),
                            ('class_col_norms_mean' , col_norms_class.mean()),
                            ('class_col_norms_max'  , col_norms_class.max()),
                            ])

        if (state_below is not None) or (state is not None):
            if state is None:
                state = self.fprop(state_below)
            # fprop returns (class probabilities, cluster probabilities).
            state, cls = state
            mx = state.max(axis=1)
            rval.update(OrderedDict([
                                     ('mean_max_class' , mx.mean()),
                                     ('max_max_class' , mx.max()),
                                     ('min_max_class' , mx.min())
                                     ]))
            if targets is not None:
                rval['nll'] = self.cost(Y_hat=(state,cls), Y=targets)
                # nll is in nats: exp(nll) == 10 ** (nll / ln 10).
                rval['perplexity'] = 10 ** (rval['nll']/np.log(10).astype('float32'))
                rval['entropy'] = rval['nll']/np.log(2).astype('float32')
        return rval

    def cost(self, Y, Y_hat):
        """
        Return the mean negative log probability of `Y` under `Y_hat`.

        Parameters
        ----------
        Y : tensor
            One-hot binary targets, one row per example.
        Y_hat : tuple
            ``(class_probabilities, cluster_probabilities)`` as returned by
            `fprop`; both must be the direct output of a softmax op
            (optionally wrapped in a `Print` op).

        Notes
        -----
        The log probability is factorized as
        log p(w | history) = log p(c | s) + log p(w | c, s).
        """
        y_hat, y_cls = Y_hat

        def pre_softmax(var):
            # Recover the softmax input so that log-probabilities can be
            # computed in a numerically stable way.
            assert hasattr(var, 'owner')
            owner = var.owner
            assert owner is not None
            op = owner.op
            if isinstance(op, Print):
                assert len(owner.inputs) == 1
                var, = owner.inputs
                owner = var.owner
                op = owner.op
            assert isinstance(op, T.nnet.Softmax)
            z, = owner.inputs
            assert z.ndim == 2
            return z

        z = pre_softmax(y_hat)
        z_cls = pre_softmax(y_cls)

        # log p(w | c, s): stable log-softmax over the class activations.
        z = z - z.max(axis=1).dimshuffle(0, 'x')
        log_prob = z - T.log(T.exp(z).sum(axis=1).dimshuffle(0, 'x'))
        # we use sum and not mean because this is really one variable per row
        log_prob_of = (Y * log_prob).sum(axis=1)
        assert log_prob_of.ndim == 1

        # log p(c | s): stable log-softmax over the cluster activations.
        z_cls = z_cls - z_cls.max(axis=1).dimshuffle(0, 'x')
        log_prob_cls = z_cls - T.log(T.exp(z_cls).sum(axis=1).dimshuffle(0, 'x'))

        # cluster_targets maps a class index to its cluster index. Look up
        # the cluster of each example's target class and pick that entry of
        # the cluster log-probabilities. Multiplying the raw id array into
        # log_prob_cls, as before, neither matches its shape nor yields a
        # log-probability.
        # NOTE(review): assumes Y's argmax is the class index used to build
        # cluster_targets -- confirm against the dataset.
        target_class = T.argmax(Y, axis=1)
        class_to_cluster = T.as_tensor_variable(
            np.asarray(self.cluster_targets).astype('int64'))
        target_cluster = class_to_cluster[target_class]
        log_prob_of_cls = log_prob_cls[T.arange(target_cluster.shape[0]),
                                       target_cluster]
        assert log_prob_of_cls.ndim == 1

        # p(w|history) = p(c|s) * p(w|c,s)
        log_prob_of = log_prob_of + log_prob_of_cls
        rval = log_prob_of.mean()
        return - rval

    def fprop(self, state_below):
        """
        Compute class and cluster probabilities for a batch.

        Parameters
        ----------
        state_below : tensor
            Layer input; reformatted to a matrix when the input space is
            not a `VectorSpace`.

        Returns
        -------
        probclass : tensor
            Softmax over the class activations produced by `GroupDot`.
        probcluster : tensor
            Softmax over ``state_below . W_cluster + b_cluster``.

        Raises
        ------
        NotImplementedError
            If `self.no_affine` is set.
        """
        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below, self.desired_space)
        assert state_below.ndim == 2
        # Patch old pickle files that predate the no_affine attribute.
        if not hasattr(self, 'no_affine'):
            self.no_affine = False
        if self.no_affine:
            raise NotImplementedError()

        assert self.W_class.ndim == 3
        assert self.W_cluster.ndim == 2

        # Cluster distribution: softmax(h W_cluster + b_cluster).
        cluster_act = T.dot(state_below, self.W_cluster) + self.b_cluster
        probcluster = T.nnet.softmax(cluster_act)

        # The debugging leftovers that printed tensor metadata, referenced
        # an undefined name and overwrote self.cluster_targets with
        # range(5) are removed; the overwrite destroyed the class-to-cluster
        # mapping built in __init__.

        # NOTE(review): the argument list of the GroupDot call was lost in
        # the source. (input, weights, biases, per-example group index) is
        # assumed, with the predicted cluster used as the group index as
        # the original comment suggested -- confirm against the GroupDot op.
        predicted_cluster = T.argmax(probcluster, axis=1)
        Z = T.nnet.GroupDot(self.n_clusters)(state_below,
                                            self.W_class,
                                            self.b_class,
                                            predicted_cluster)
        probclass = T.nnet.softmax(Z)
        for value in get_debug_values(probclass):
            if self.mlp.batch_size is not None:
                assert value.shape[0] == self.mlp.batch_size
        return probclass, probcluster

    def get_weights_format(self):
        return ('v', 'h', 'h_c')

    def get_biases(self):
        """Return the current values of ``(b_class, b_cluster)``."""
        class_bias = self.b_class.get_value()
        cluster_bias = self.b_cluster.get_value()
        return class_bias, cluster_bias

    def get_weights(self):
        """
        Return the current values of ``(W_cluster, W_class)``.

        Raises NotImplementedError when the input space is not a
        `VectorSpace`.
        """
        if isinstance(self.input_space, VectorSpace):
            return self.W_cluster.get_value(), self.W_class.get_value()
        raise NotImplementedError()
class Softmax(Layer):

    def __init__(self, n_classes, layer_name, irange = None,
            istdev = None,
                 sparse_init = None, W_lr_scale = None,
                 b_lr_scale = None, max_row_norm = None):
        """
        Parameters
        ----------
        n_classes : int
            Number of output classes.
        layer_name : str
            Name of the layer.
        irange : float, optional
            Weights are drawn from U(-irange, irange).
        istdev : float, optional
            Weights are drawn from a normal distribution scaled by istdev.
        sparse_init : int, optional
            Number of non-zero weights per class under sparse
            initialization.
        W_lr_scale : float or str, optional
            Learning-rate multiplier for the weights; a string is converted
            to float.
        b_lr_scale : float, optional
            Learning-rate multiplier for the biases.
        max_row_norm : float, optional
            Upper bound on weight row norms enforced by `censor_updates`.
        """
        if isinstance(W_lr_scale, str):
            W_lr_scale = float(W_lr_scale)

        # Store every constructor argument explicitly. The previous
        # ``del self.self`` raised AttributeError because nothing had
        # stored the arguments on the instance first.
        self.n_classes = n_classes
        self.layer_name = layer_name
        self.irange = irange
        self.istdev = istdev
        self.sparse_init = sparse_init
        self.W_lr_scale = W_lr_scale
        self.b_lr_scale = b_lr_scale
        self.max_row_norm = max_row_norm

        assert isinstance(n_classes, int)

        self.output_space = VectorSpace(n_classes)
        self.b = sharedX( np.zeros((n_classes,)), name = 'softmax_b')

    def get_lr_scalers(self):
        """
        Return an OrderedDict mapping parameters to learning-rate
        multipliers; only parameters with a configured scale are included.
        """
        scalers = OrderedDict()

        w_scale = self.W_lr_scale
        if w_scale is not None:
            assert isinstance(w_scale, float)
            scalers[self.W] = w_scale

        # Instances created before b_lr_scale existed lack the attribute.
        if not hasattr(self, 'b_lr_scale'):
            self.b_lr_scale = None

        b_scale = self.b_lr_scale
        if b_scale is not None:
            assert isinstance(b_scale, float)
            scalers[self.b] = b_scale

        return scalers

    def get_monitoring_channels_from_state(self, state, target=None):
        """
        Return monitoring channels computed from the layer output.

        Reports mean/max/min of the per-example maximum class probability
        and, when `target` is given, the misclassification rate.
        """
        mx = state.max(axis=1)

        # The closing bracket of this literal was missing.
        rval =  OrderedDict([
                ('mean_max_class' , mx.mean()),
                ('max_max_class' , mx.max()),
                ('min_max_class' , mx.min())
                ])

        if target is not None:
            y_hat = T.argmax(state, axis=1)
            y = T.argmax(target, axis=1)
            misclass = T.neq(y, y_hat).mean()
            misclass = T.cast(misclass, config.floatX)
            rval['misclass'] = misclass

        return rval

    def set_input_space(self, space):
        """
        Record the input space and (re)initialize `W`.

        Exactly one of `irange`, `istdev` and `sparse_init` selects the
        initialization scheme.

        Raises
        ------
        TypeError
            If `space` is not a `Space`.
        """
        self.input_space = space

        if not isinstance(space, Space):
            raise TypeError("Expected Space, got "+
                    str(space)+" of type "+str(type(space)))

        self.input_dim = space.get_total_dimension()
        self.needs_reformat = not isinstance(space, VectorSpace)

        self.desired_space = VectorSpace(self.input_dim)

        if not self.needs_reformat:
            assert self.desired_space == self.input_space

        rng = self.mlp.rng

        if self.irange is not None:
            assert self.istdev is None
            assert self.sparse_init is None
            W = rng.uniform(-self.irange,self.irange, (self.input_dim,self.n_classes))
        elif self.istdev is not None:
            assert self.sparse_init is None
            W = rng.randn(self.input_dim, self.n_classes) * self.istdev
        else:
            # Restored branch: without it the istdev path asserted both
            # "sparse_init is None" and "sparse_init is not None".
            assert self.sparse_init is not None
            W = np.zeros((self.input_dim, self.n_classes))
            for i in xrange(self.n_classes):
                for j in xrange(self.sparse_init):
                    # Redraw until an unused row is found so each class gets
                    # exactly sparse_init non-zero weights.
                    idx = rng.randint(0, self.input_dim)
                    while W[idx, i] != 0.:
                        idx = rng.randint(0, self.input_dim)
                    W[idx, i] = rng.randn()

        self.W = sharedX(W,  'softmax_W' )

        self._params = [ self.b, self.W ]

    def get_weights_topo(self):
        """
        Return the weights formatted in the input space and converted to
        ('b', 0, 1, 'c') axis order.

        Raises NotImplementedError when the input space is not a
        `Conv2DSpace`.
        """
        if not isinstance(self.input_space, Conv2DSpace):
            raise NotImplementedError()
        transposed = self.W.get_value().T
        topo = self.desired_space.format_as(transposed, self.input_space)
        return Conv2DSpace.convert_numpy(topo, self.input_space.axes, ('b', 0, 1, 'c'))

    def get_weights(self):
        """
        Return the current value of `W`.

        Raises NotImplementedError when the input space is not a
        `VectorSpace`.
        """
        if isinstance(self.input_space, VectorSpace):
            return self.W.get_value()
        raise NotImplementedError()

    def set_weights(self, weights):
        """Overwrite the value of `W` with `weights`."""
        # The body was missing, which made the definition a syntax error.
        self.W.set_value(weights)

    def set_biases(self, biases):
        """Overwrite the value of `b` with `biases`."""
        # The body was missing, which made the definition a syntax error.
        self.b.set_value(biases)

    def get_biases(self):
        """Return the current value of the bias vector `b`."""
        bias_values = self.b.get_value()
        return bias_values

    def get_weights_format(self):
        return ('v', 'h')

    def fprop(self, state_below):
        """
        Return ``softmax(state_below . W + b)``.

        Parameters
        ----------
        state_below : tensor
            Layer input; reformatted to a matrix when the input space is
            not a `VectorSpace`.

        Raises
        ------
        ValueError
            In debug-value mode, if the batch size of `state_below` differs
            from the MLP's configured batch size.
        """
        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below, self.desired_space)

        for value in get_debug_values(state_below):
            # The message now reads self.mlp (it read self.dbm, which this
            # layer never references elsewhere) and the check is skipped
            # when no batch size is configured.
            if self.mlp.batch_size is not None and value.shape[0] != self.mlp.batch_size:
                raise ValueError("state_below should have batch size "+str(self.mlp.batch_size)+" but has "+str(value.shape[0]))

        assert self.W.ndim == 2
        assert state_below.ndim == 2

        b = self.b

        Z = T.dot(state_below, self.W) + b

        rval = T.nnet.softmax(Z)

        for value in get_debug_values(rval):
            if self.mlp.batch_size is not None:
                assert value.shape[0] == self.mlp.batch_size

        return rval

    def cost(self, Y, Y_hat):
        """
        Y must be one-hot binary. Y_hat is a softmax estimate
        of Y. Returns negative log probability of Y under the Y_hat
        distribution, averaged over the batch.
        """
        # Recover the softmax input (looking through an optional Print op)
        # so the log-probability can be computed stably.
        assert hasattr(Y_hat, 'owner')
        owner = Y_hat.owner
        assert owner is not None
        op = owner.op
        if isinstance(op, Print):
            assert len(owner.inputs) == 1
            Y_hat, = owner.inputs
            owner = Y_hat.owner
            op = owner.op
        assert isinstance(op, T.nnet.Softmax)
        z ,= owner.inputs
        assert z.ndim == 2

        # Subtracting the row maximum avoids overflow in exp.
        z = z - z.max(axis=1).dimshuffle(0, 'x')
        log_prob = z - T.log(T.exp(z).sum(axis=1).dimshuffle(0, 'x'))
        # we use sum and not mean because this is really one variable per row
        log_prob_of = (Y * log_prob).sum(axis=1)
        assert log_prob_of.ndim == 1

        rval = log_prob_of.mean()

        return - rval

    def get_weight_decay(self, coeff):
        """
        Return the L2 penalty ``coeff * sum(W ** 2)``; a string `coeff` is
        converted to float first.
        """
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        penalty = T.sqr(self.W).sum()
        return coeff * penalty

    def censor_updates(self, updates):
        """
        Rescale the proposed update to `W`, in place, so that no row norm
        exceeds `self.max_row_norm`. Does nothing when that limit is unset
        or `W` has no entry in `updates`.
        """
        if self.max_row_norm is None:
            return
        W = self.W
        if W not in updates:
            return
        proposed = updates[W]
        norms = T.sqrt(T.sum(T.sqr(proposed), axis=1))
        clipped = T.clip(norms, 0, self.max_row_norm)
        # 1e-7 guards against division by zero for all-zero rows.
        scale = (clipped / (1e-7 + norms)).dimshuffle(0, 'x')
        updates[W] = proposed * scale
class HingeLoss(Layer):

    def __init__(self, n_classes, layer_name, irange = None,
                 istdev = None,
                 sparse_init = None):
        """
        Parameters
        ----------
        n_classes : int
            Number of output classes.
        layer_name : str
            Name of the layer.
        irange : float, optional
            Weights are drawn from U(-irange, irange).
        istdev : float, optional
            Weights are drawn from a normal distribution scaled by istdev.
        sparse_init : int, optional
            Number of non-zero weights per class under sparse
            initialization.
        """
        # Store the constructor arguments explicitly. The previous
        # ``del self.self`` raised AttributeError because nothing had
        # stored them on the instance first.
        self.n_classes = n_classes
        self.layer_name = layer_name
        self.irange = irange
        self.istdev = istdev
        self.sparse_init = sparse_init

        self.output_space = VectorSpace(n_classes)
        self.b = sharedX(np.zeros((n_classes,)), name = 'hingeloss_b')

    def get_monitoring_channels(self):
        """
        Return an OrderedDict with min/mean/max of the row and column L2
        norms of `W`.
        """
        W = self.W

        assert W.ndim == 2

        sq_W = T.sqr(W)

        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))

        # The closing bracket of this literal was missing.
        return OrderedDict([
                            ('row_norms_min'  , row_norms.min()),
                            ('row_norms_mean' , row_norms.mean()),
                            ('row_norms_max'  , row_norms.max()),
                            ('col_norms_min'  , col_norms.min()),
                            ('col_norms_mean' , col_norms.mean()),
                            ('col_norms_max'  , col_norms.max()),
                            ])

    def get_monitoring_channels_from_state(self, state, target=None):
        """
        Return monitoring channels computed from the layer output.

        Reports mean/max/min of the per-example maximum output and, when
        `target` is given, the misclassification rate and the cost (stored
        under the key 'nll').
        """
        mx = state.max(axis=1)

        # The closing bracket of this literal was missing.
        rval =  OrderedDict([
                ('mean_max_class' , mx.mean()),
                ('max_max_class' , mx.max()),
                ('min_max_class' , mx.min())
                ])

        if target is not None:
            y_hat = self.target_convert(T.argmax(state, axis=1))
            #Assume target is in [0,1] as binary one-hot
            y = self.target_convert(T.argmax(target, axis=1))
            misclass = T.neq(y, y_hat).mean()
            misclass = T.cast(misclass, config.floatX)
            rval['misclass'] = misclass
            rval['nll'] = self.cost(Y_hat=state, Y=target)

        return rval

    def set_input_space(self, space):
        """
        Record the input space and (re)initialize `W`.

        Exactly one of `irange`, `istdev` and `sparse_init` selects the
        initialization scheme.

        Raises
        ------
        TypeError
            If `space` is not a `Space`.
        """
        self.input_space = space

        if not isinstance(space, Space):
            raise TypeError("Expected Space, got "+
                    str(space)+" of type "+str(type(space)))

        self.input_dim = space.get_total_dimension()
        self.needs_reformat = not isinstance(space, VectorSpace)

        desired_dim = self.input_dim
        self.desired_space = VectorSpace(desired_dim)

        if not self.needs_reformat:
            assert self.desired_space == self.input_space

        rng = self.mlp.rng

        if self.irange is not None:
            assert self.istdev is None
            assert self.sparse_init is None
            W = rng.uniform(-self.irange,self.irange, (self.input_dim,self.n_classes))
        elif self.istdev is not None:
            assert self.sparse_init is None
            W = rng.randn(self.input_dim, self.n_classes) * self.istdev
        else:
            # Restored branch: without it the istdev path asserted both
            # "sparse_init is None" and "sparse_init is not None".
            assert self.sparse_init is not None
            W = np.zeros((self.input_dim, self.n_classes))
            for i in xrange(self.n_classes):
                for j in xrange(self.sparse_init):
                    # Redraw until an unused row is found so each class gets
                    # exactly sparse_init non-zero weights.
                    idx = rng.randint(0, self.input_dim)
                    while W[idx, i] != 0.:
                        idx = rng.randint(0, self.input_dim)
                    W[idx, i] = rng.randn()

        self.W = sharedX(W,  'hingeloss_W' )

        self._params = [ self.b, self.W ]

    def get_weights_topo(self):
        """
        Return the weights formatted in the input space and converted to
        ('b', 0, 1, 'c') axis order.

        Raises NotImplementedError when the input space is not a
        `Conv2DSpace`.
        """
        if not isinstance(self.input_space, Conv2DSpace):
            raise NotImplementedError()
        w_t = self.W.get_value().T
        as_input = self.desired_space.format_as(w_t, self.input_space)
        return Conv2DSpace.convert_numpy(as_input, self.input_space.axes, ('b', 0, 1, 'c'))

    def get_weights(self):
        """
        Return the current value of `W`.

        Raises NotImplementedError when the input space is not a
        `VectorSpace`.
        """
        if isinstance(self.input_space, VectorSpace):
            return self.W.get_value()
        raise NotImplementedError()

    def set_weights(self, weights):
        """Overwrite the value of `W` with `weights`."""
        # The body was missing, which made the definition a syntax error.
        self.W.set_value(weights)

    def set_biases(self, biases):
        """Overwrite the value of `b` with `biases`."""
        # The body was missing, which made the definition a syntax error.
        self.b.set_value(biases)

    def get_biases(self):
        """Return the current value of the bias vector `b`."""
        current = self.b.get_value()
        return current

    def get_weights_format(self):
        return ('v', 'h')

    def fprop(self, state_below):
        """
        Return the affine scores ``state_below . W + b``.

        Parameters
        ----------
        state_below : tensor
            Layer input; reformatted to a matrix when the input space is
            not a `VectorSpace`.

        Raises
        ------
        ValueError
            In debug-value mode, if the batch size of `state_below` differs
            from the MLP's configured batch size.
        """
        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below, self.desired_space)

        for value in get_debug_values(state_below):
            if self.mlp.batch_size is not None and value.shape[0] != self.mlp.batch_size:
                # The message now reads self.mlp; it read self.dbm, which
                # this layer never references elsewhere.
                raise ValueError("state_below should have batch size "+str(self.mlp.batch_size)+" but has "+str(value.shape[0]))

        assert state_below.ndim == 2

        assert self.W.ndim == 2
        b = self.b
        W = self.W

        rval = T.dot(state_below, W) + b

        for value in get_debug_values(rval):
            if self.mlp.batch_size is not None:
                assert value.shape[0] == self.mlp.batch_size

        return rval

    def target_convert(self, Y):
        converts target [0,1] to [-1, 1]
        Y_t = 2. * Y - 1.
        return Y_t

    def hinge_cost(self, W, Y, Y_hat, C=1.):
        """
        Return the squared hinge loss per example:
        ``sum(max(1 - Y * Y_hat, 0) ** 2, axis=1)``.

        `W` and `C` are accepted but not used by this computation.
        """
        margin_violation = T.maximum(1 - Y * Y_hat, 0)
        squared = margin_violation ** 2.
        return squared.sum(axis=1)

    def cost(self, Y, Y_hat):
        """
        Y must be one-hot binary. Y_hat is a hinge loss estimate
        of Y. Returns the squared hinge loss averaged over the batch.
        """
        # Look through an optional Print op wrapping the prediction.
        assert hasattr(Y_hat, 'owner')
        owner = Y_hat.owner
        assert owner is not None
        op = owner.op
        if isinstance(op, Print):
            assert len(owner.inputs) == 1
            Y_hat, = owner.inputs
            owner = Y_hat.owner
            op = owner.op
        assert Y_hat.ndim == 2
        Y_t = self.target_convert(Y)
        prob = self.hinge_cost(self.W, Y_t, Y_hat)
        assert prob.ndim == 1
        rval = prob.mean()

        return rval

    def cost_matrix(self, Y, Y_hat):
        """
        Y must be one-hot binary. Y_hat is a hinge loss estimate
        of Y. Returns the squared hinge loss of each example, i.e. the
        result of `hinge_cost`, which is already summed over axis 1.
        """
        # Look through an optional Print op wrapping the prediction.
        assert hasattr(Y_hat, 'owner')
        owner = Y_hat.owner
        assert owner is not None
        op = owner.op
        if isinstance(op, Print):
            assert len(owner.inputs) == 1
            Y_hat, = owner.inputs
            owner = Y_hat.owner
            op = owner.op

        assert Y_hat.ndim == 2
        Y_t = self.target_convert(Y)
        prob = self.hinge_cost(self.W, Y_t, Y_hat)
        return prob

    def get_weight_decay(self, coeff):
        """
        Return the L2 penalty ``coeff * sum(W ** 2)``; a string `coeff` is
        converted to float first.
        """
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        sum_of_squares = T.sqr(self.W).sum()
        return coeff * sum_of_squares

    def get_l1_weight_decay(self, coeff):
        """
        Return the L1 penalty ``coeff * sum(|W|)``; a string `coeff` is
        converted to float first.
        """
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        absolute = abs(self.W)
        return coeff * absolute.sum()
class Softmax(HiddenLayer):
    def __init__(self, n_classes, layer_name, irange=None,
                 sparse_init=None, W_lr_scale=None):
        """
        NOTE(review): the parameter list was lost in the source. It is
        reconstructed from the names this class reads (n_classes, irange,
        sparse_init, W_lr_scale) plus the conventional layer_name --
        confirm against callers.

        Parameters
        ----------
        n_classes : int
            Number of output classes.
        layer_name : str
            Name of the layer.
        irange : float, optional
            Weights are drawn from U(-irange, irange).
        sparse_init : int, optional
            Number of non-zero weights per class under sparse
            initialization.
        W_lr_scale : float or str, optional
            Learning-rate multiplier for the weights; a string is converted
            to float.
        """
        if isinstance(W_lr_scale, str):
            W_lr_scale = float(W_lr_scale)

        # Store the constructor arguments explicitly. The previous
        # ``del self.self`` raised AttributeError because nothing had
        # stored them on the instance first.
        self.n_classes = n_classes
        self.layer_name = layer_name
        self.irange = irange
        self.sparse_init = sparse_init
        self.W_lr_scale = W_lr_scale

        assert isinstance(n_classes, int)

        self.output_space = VectorSpace(n_classes)
        self.b = sharedX(np.zeros((n_classes, )), name='softmax_b')

    def get_lr_scalers(self):
        """
        Return a dict mapping `W` to its learning-rate multiplier, or an
        empty dict when no multiplier is configured.
        """
        # Patch old pickle files
        if not hasattr(self, 'W_lr_scale'):
            self.W_lr_scale = None

        scalers = {}
        scale = self.W_lr_scale
        if scale is not None:
            assert isinstance(scale, float)
            scalers[self.W] = scale

        return scalers

    def get_total_state_space(self):
        return self.output_space

    def get_monitoring_channels_from_state(self, state):
        """
        Return mean/max/min of the per-example maximum class probability.
        """
        mx = state.max(axis=1)

        # The closing brace of this literal was missing.
        return {
            'mean_max_class': mx.mean(),
            'max_max_class': mx.max(),
            'min_max_class': mx.min()
        }

    def set_input_space(self, space):
        """
        Record the input space and (re)initialize `W`.

        Uses uniform initialization when `irange` is set and sparse
        initialization otherwise.

        Raises
        ------
        TypeError
            If `space` is not a `Space`.
        """
        self.input_space = space

        if not isinstance(space, Space):
            raise TypeError("Expected Space, got " + str(space) + " of type " +
                            str(type(space)))

        self.input_dim = space.get_total_dimension()
        self.needs_reformat = not isinstance(space, VectorSpace)

        self.desired_space = VectorSpace(self.input_dim)

        if not self.needs_reformat:
            assert self.desired_space == self.input_space

        rng = self.dbm.rng

        if self.irange is not None:
            assert self.sparse_init is None
            W = rng.uniform(-self.irange, self.irange,
                            (self.input_dim, self.n_classes))
        else:
            # Restored branch: without it the irange path asserted both
            # "sparse_init is None" and "sparse_init is not None".
            assert self.sparse_init is not None
            W = np.zeros((self.input_dim, self.n_classes))
            for i in xrange(self.n_classes):
                for j in xrange(self.sparse_init):
                    # Redraw until an unused row is found so each class gets
                    # exactly sparse_init non-zero weights.
                    idx = rng.randint(0, self.input_dim)
                    while W[idx, i] != 0.:
                        idx = rng.randint(0, self.input_dim)
                    W[idx, i] = rng.randn()

        self.W = sharedX(W, 'softmax_W')

        self._params = [self.b, self.W]

    def get_weights_topo(self):
        """
        Return the weights formatted in the input space and converted to
        ('b', 0, 1, 'c') axis order.

        Raises NotImplementedError when the input space is not a
        `Conv2DSpace`.
        """
        if not isinstance(self.input_space, Conv2DSpace):
            raise NotImplementedError()
        target_axes = ('b', 0, 1, 'c')
        weights_t = self.W.get_value().T
        formatted = self.desired_space.format_as(weights_t, self.input_space)
        return Conv2DSpace.convert_numpy(formatted, self.input_space.axes,
                                         target_axes)

    def get_weights(self):
        """
        Return the current value of `W`.

        Raises NotImplementedError when the input space is not a
        `VectorSpace`.
        """
        if isinstance(self.input_space, VectorSpace):
            return self.W.get_value()
        raise NotImplementedError()

    def set_weights(self, weights):
        """Overwrite the value of `W` with `weights`."""
        # The body was missing, which made the definition a syntax error.
        self.W.set_value(weights)

    def set_biases(self, biases):
        """Overwrite the value of `b` with `biases`."""
        # The body was missing, which made the definition a syntax error.
        self.b.set_value(biases)

    def get_biases(self):
        """Return the current value of the bias vector `b`."""
        values = self.b.get_value()
        return values

    def get_weights_format(self):
        return ('v', 'h')

    def sample(self, state_below=None, state_above=None,
               layer_above=None, theano_rng=None):
        """
        Draw a one-hot sample of this layer given the layer below.

        NOTE(review): the parameter list was lost in the source. It is
        reconstructed from the names used in the body and in its error
        message -- confirm against callers.

        Parameters
        ----------
        state_below : tensor
            State of the layer below.
        state_above : tensor, optional
            Not supported; must be None.
        layer_above : object, optional
            Unused.
        theano_rng : random stream
            Required; must provide `multinomial`.

        Raises
        ------
        NotImplementedError
            If `state_above` is given.
        ValueError
            If `theano_rng` is None.
        """
        if state_above is not None:
            # If you implement this case, also add a unit test for it.
            # Or at least add a warning that it is not tested.
            raise NotImplementedError()

        if theano_rng is None:
            raise ValueError(
                "theano_rng is required; it just defaults to None so that it may appear after layer_above / state_above in the list."
            )

        # patch old pickle files
        if not hasattr(self, 'needs_reformat'):
            self.needs_reformat = self.needs_reshape
            del self.needs_reshape

        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        z = T.dot(state_below, self.W) + self.b
        h_exp = T.nnet.softmax(z)
        h_sample = theano_rng.multinomial(pvals=h_exp, dtype=h_exp.dtype)

        return h_sample

    def mf_update(self, state_below, state_above=None, layer_above=None,
                  double_weights=False, iter_name=None):
        """
        Return the mean-field update ``softmax(state_below . W + b)``.

        NOTE(review): the parameter list was lost in the source. It is
        reconstructed from the names used in the body; `layer_above` and
        `iter_name` are assumed from the usual layer interface -- confirm
        against callers.

        Raises
        ------
        NotImplementedError
            If `state_above` is given or `double_weights` is true.
        """
        if state_above is not None:
            raise NotImplementedError()

        if double_weights:
            raise NotImplementedError()

        # patch old pickle files
        if not hasattr(self, 'needs_reformat'):
            self.needs_reformat = self.needs_reshape
            del self.needs_reshape

        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        # Removed debugging code that loaded a pickle from a hard-coded
        # absolute path and wrapped state_below in a Verify op. It ran on
        # every call and fails on any machine without that file.

        assert self.W.ndim == 2
        assert state_below.ndim == 2

        b = self.b

        Z = T.dot(state_below, self.W) + b

        rval = T.nnet.softmax(Z)

        return rval

    def downward_message(self, downward_state):
        """
        Return the message sent to the layer below: ``downward_state . W^T``
        formatted back into the input space.
        """
        rval = T.dot(downward_state, self.W.T)

        rval = self.desired_space.format_as(rval, self.input_space)

        return rval

    def recons_cost(self, Y, Y_hat_unmasked, drop_mask_Y, scale):
        """
        Return the masked, scaled negative log-likelihood of `Y`.

        scale is because the visible layer also goes into the
        cost. it uses the mean over units and examples, so that
        the scale of the cost doesn't change too much with batch
        size or example size.
        we need to multiply this cost by scale to make sure that
        it is put on the same scale as the reconstruction cost
        for the visible units. ie, scale should be 1/nvis

        Parameters
        ----------
        Y : tensor
            One-hot targets.
        Y_hat_unmasked : tensor
            Softmax output (optionally wrapped in a `Print` op).
        drop_mask_Y : tensor
            Per-example multiplier applied to the log-probability.
        scale : float
            Factor applied to the mean masked log-probability.
        """
        Y_hat = Y_hat_unmasked
        assert hasattr(Y_hat, 'owner')
        owner = Y_hat.owner
        assert owner is not None
        op = owner.op
        if isinstance(op, Print):
            assert len(owner.inputs) == 1
            Y_hat, = owner.inputs
            owner = Y_hat.owner
            op = owner.op
        assert isinstance(op, T.nnet.Softmax)
        z, = owner.inputs
        assert z.ndim == 2

        z = z - z.max(axis=1).dimshuffle(0, 'x')
        # log-softmax subtracts the LOG of the partition sum; the T.log was
        # missing, so the partition sum itself was being subtracted.
        log_prob = z - T.log(T.exp(z).sum(axis=1).dimshuffle(0, 'x'))
        # we use sum and not mean because this is really one variable per row
        log_prob_of = (Y * log_prob).sum(axis=1)
        masked = log_prob_of * drop_mask_Y
        assert masked.ndim == 1

        rval = masked.mean() * scale

        return -rval

    def make_state(self, num_examples, numpy_rng):
        """ Returns a shared variable containing an actual state
           (not a mean field state) for this variable.

        The state is sampled from the softmax of the biases alone, and the
        time spent composing, compiling and executing the sampling function
        is printed.

        Parameters
        ----------
        num_examples : int
            Number of rows in the returned state.
        numpy_rng : numpy RandomState
            Used to seed the Theano random stream.
        """
        t1 = time.time()

        empty_input = self.output_space.get_origin_batch(num_examples)
        h_state = sharedX(empty_input)

        default_z = T.zeros_like(h_state) + self.b

        theano_rng = MRG_RandomStreams(numpy_rng.randint(2**16))

        h_exp = T.nnet.softmax(default_z)

        h_sample = theano_rng.multinomial(pvals=h_exp, dtype=h_exp.dtype)

        t2 = time.time()

        f = function([], updates={h_state: h_sample})

        t3 = time.time()

        # Run the update once so h_state holds a sample. This call was
        # missing, so the "execute time" measured nothing and the state
        # stayed at the origin.
        f()

        t4 = time.time()

        # Single-string print calls produce the same text under Python 2
        # and Python 3.
        print(str(self) + '.make_state took ' + str(t4 - t1))
        print('\tcompose time: ' + str(t2 - t1))
        print('\tcompile time: ' + str(t3 - t2))
        print('\texecute time: ' + str(t4 - t3))

        h_state.name = 'softmax_sample_shared'

        return h_state

    def get_weight_decay(self, coeff):
        """
        Return the L2 penalty ``coeff * sum(W ** 2)``; `coeff` must be a
        float or a numeric string.
        """
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float)
        l2 = T.sqr(self.W).sum()
        return coeff * l2

    def expected_energy_term(self, state, average, state_below, average_below):
        """
        Return this layer's contribution to the energy, one value per
        example: ``-(state . b) - sum((state_below . W) * state, axis=1)``.

        `average` and `average_below` are not used by this computation.
        """
        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        # Energy function is linear so it doesn't matter if we're averaging or not
        # Specifically, our terms are -u^T W d - b^T d where u is the upward state of layer below
        # and d is the downward state of this layer

        bias_term = T.dot(state, self.b)
        weights_term = (T.dot(state_below, self.W) * state).sum(axis=1)

        rval = -bias_term - weights_term

        assert rval.ndim == 1

        return rval
class ToyRNNPhone(Model):
    def __init__(self, nvis, nhid, hidden_transition_model, irange=0.05,
                 non_linearity='sigmoid', use_ground_truth=True):
        """
        Parameters
        ----------
        nvis : int
            Dimension of the input features.
        nhid : int
            Number of hidden units.
        hidden_transition_model : object
            Model applied to the previous hidden state at each step.
        irange : float
            Weights are drawn from U(-irange, irange).
        non_linearity : str
            Either 'sigmoid' or 'tanh'.
        use_ground_truth : bool
            Selects which of the two recurrences `fprop` builds.
        """
        self.nvis = nvis
        self.nhid = nhid
        self.hidden_transition_model = hidden_transition_model
        self.use_ground_truth = use_ground_truth
        self.alpha = sharedX(1)
        self.alpha_decrease_rate = 0.999

        known_non_linearities = {'sigmoid': T.nnet.sigmoid,
                                 'tanh': T.tanh}
        assert non_linearity in known_non_linearities
        self.non_linearity = known_non_linearities[non_linearity]

        # Spaces and data sources
        self.input_space = VectorSpace(dim=self.nvis)
        self.hidden_space = VectorSpace(dim=self.nhid)
        self.output_space = VectorSpace(dim=1)
        self.input_source = 'features'
        self.target_source = 'targets'

        # Features-to-hidden weights and hidden biases
        self.W = sharedX(numpy.random.uniform(low=-irange, high=irange,
                                              size=(self.nvis, self.nhid)),
                         name='W')
        self.b = sharedX(numpy.zeros(self.nhid), name='b')

        # Hidden-to-output weights and output bias
        self.U = sharedX(numpy.random.uniform(low=-irange, high=irange,
                                              size=(self.nhid, 1)),
                         name='U')
        self.c = sharedX(numpy.zeros(1), name='c')

    def fprop_step(self, features, h_tm1, out):
        """
        Compute one recurrence step.

        Parameters
        ----------
        features : tensor
            Input features for this step.
        h_tm1 : tensor
            Hidden state of the previous step.
        out : tensor
            Previous output; unused, present because scan passes every
            recurrent output back in.

        Returns
        -------
        (h, out) : the new hidden state and output.
        """
        # NOTE(review): the target space of this format_as call was lost in
        # the source; the transition model's input space is assumed.
        h_tm1 = self.hidden_space.format_as(
            h_tm1.dimshuffle('x', 0),
            self.hidden_transition_model.get_input_space())
        # NOTE(review): sigmoid is hard-coded although __init__ stores
        # self.non_linearity -- confirm which is intended.
        h = T.nnet.sigmoid(T.dot(features, self.W) +
                           self.hidden_transition_model.fprop(h_tm1).flatten() +
                           self.b)
        out = T.dot(h, self.U) + self.c
        return h, out

    def fprop_step_prime(self, truth, features, h_tm1, out):
        """
        Compute one recurrence step that feeds predictions back as input.

        The last feature is replaced by a mix of itself and the ground
        truth, weighted by `self.alpha`. After the step the feature window
        is shifted by one and the new output is appended.

        Returns
        -------
        (features, h, out) : the shifted features, new hidden state and
        output.
        """
        features = T.set_subtensor(features[-1], (1 - self.alpha) *
                                   features[-1] + self.alpha * truth[-1])
        # NOTE(review): the target space of this format_as call was lost in
        # the source; the transition model's input space is assumed.
        h_tm1 = self.hidden_space.format_as(
            h_tm1.dimshuffle('x', 0),
            self.hidden_transition_model.get_input_space())
        h = T.nnet.sigmoid(T.dot(features, self.W) +
                           self.hidden_transition_model.fprop(h_tm1).flatten() +
                           self.b)
        out = T.dot(h, self.U) + self.c
        features = T.concatenate([features[1:], out])
        return features, h, out

    def fprop(self, data):
        """
        Run the recurrence over `data` and return the sequence of outputs.

        With `use_ground_truth` set, every step consumes the provided
        features (`fprop_step`). Otherwise the feature window is carried
        through the recurrence and updated from the model's own outputs
        (`fprop_step_prime`).

        NOTE(review): the `sequences` / `outputs_info` arguments of both
        scan calls were lost in the source and are reconstructed from the
        step signatures -- confirm.
        """
        features = data

        init_h = T.alloc(numpy.cast[theano.config.floatX](0), self.nhid)
        init_out = T.alloc(numpy.cast[theano.config.floatX](0), 1)
        # The output must not be broadcastable so its type matches what
        # the step function returns.
        init_out = T.unbroadcast(init_out, 0)

        if self.use_ground_truth:
            fn = lambda f, h, o: self.fprop_step(f, h, o)

            ((h, out), updates) = theano.scan(fn=fn,
                                              sequences=[features],
                                              outputs_info=[init_h, init_out])
            return out
        else:
            init_in = features[0]

            fn = lambda t, f, h, o: self.fprop_step_prime(t, f, h, o)

            ((f, h, out), updates) = theano.scan(fn=fn,
                                                 sequences=[features],
                                                 outputs_info=[init_in,
                                                               init_h,
                                                               init_out])
            return out

    def predict_next(self, features, h_tm1):
        """
        Compute the next hidden state and output from `features` and the
        previous hidden state `h_tm1`.

        Returns
        -------
        (h, out) : the new hidden state and output.
        """
        # NOTE(review): the target space of this format_as call was lost in
        # the source; the transition model's input space is assumed.
        h_tm1 = self.hidden_space.format_as(
            h_tm1.dimshuffle('x', 0),
            self.hidden_transition_model.get_input_space())
        h = T.nnet.sigmoid(T.dot(features, self.W) +
                           self.hidden_transition_model.fprop(h_tm1).flatten() +
                           self.b)
        out = T.dot(h, self.U) + self.c
        return h, out

    def get_params(self):
        """
        Return this model's parameters followed by those of the hidden
        transition model.
        """
        # NOTE(review): the continuation of this expression was lost in the
        # source; the transition model's parameters are assumed.
        return [self.W, self.b, self.U, self.c] + \
            self.hidden_transition_model.get_params()

    def get_input_source(self):
        return self.input_source

    def get_target_source(self):
        return self.target_source

    def censor_updates(self, updates):
        updates[self.alpha] = self.alpha_decrease_rate * self.alpha

    def get_monitoring_channels(self, data):
        """
        Return an OrderedDict with a single channel, 'alpha', holding the
        current mixing coefficient. `data` is not used.
        """
        channels = OrderedDict([('alpha', self.alpha)])
        return channels
class BinaryVectorMaxPool(HiddenLayer):
    """
        A hidden layer that does max-pooling on binary vectors.
        It has two sublayers, the detector layer and the pooling
        layer. The detector layer is its downward state and the pooling
        layer is its upward state.

        TODO: this layer uses (pooled, detector) as its total state,
              which can be confusing when listing all the states in
              the network left to right. Change this and
              pylearn2.expr.probabilistic_max_pooling to use
              (detector, pooled)
    """
    def __init__(self,
        # NOTE(review): the constructor is damaged in this copy of the file.
        # The parameter list after `self,` is missing, and so are the opening
        # and closing delimiters of the docstring whose `include_prob` entry
        # survives below. The statements that normally precede `del self.self`
        # are also not visible here. Restore them from the original source.

            include_prob: probability of including a weight element in the set
                    of weights initialized to U(-irange, irange). If not included
                    it is initialized to 0.

        del self.self

        # Bias vector with one entry per detector unit, every entry starting
        # at init_bias, stored as a shared variable named '<layer_name>_b'.
        self.b = sharedX(np.zeros((self.detector_layer_dim, )) + init_bias,
                         name=layer_name + '_b')

    def set_input_space(self, space):
        """
        Record the input space and create a fresh weight matrix for it.

        Note: this resets parameters!

        Sets `input_space`, `input_dim`, `requires_reformat`, `h_space`,
        `pool_layer_dim`, `output_space` and `transformer`. `desired_space`
        is set only when `space` is not a VectorSpace.

        Parameters
        ----------
        space : Space
            The space of the data fed to this layer. Anything that is not a
            VectorSpace is flattened to a VectorSpace of its total dimension.

        Raises
        ------
        ValueError
            If `detector_layer_dim` is not a multiple of `pool_size`.
        """
        self.input_space = space

        if isinstance(space, VectorSpace):
            self.requires_reformat = False
            self.input_dim = space.dim
        else:
            self.requires_reformat = True
            self.input_dim = space.get_total_dimension()
            self.desired_space = VectorSpace(self.input_dim)

        if not (self.detector_layer_dim % self.pool_size == 0):
            raise ValueError(
                "detector_layer_dim = %d, pool_size = %d. Should be divisible but remainder is %d"
                % (self.detector_layer_dim, self.pool_size,
                   self.detector_layer_dim % self.pool_size))

        self.h_space = VectorSpace(self.detector_layer_dim)
        # Floor division: the divisibility check above guarantees an exact
        # result, and the dimension must stay an integer.
        self.pool_layer_dim = self.detector_layer_dim // self.pool_size
        self.output_space = VectorSpace(self.pool_layer_dim)

        rng = self.dbm.rng
        weight_shape = (self.input_dim, self.detector_layer_dim)
        if self.irange is not None:
            assert self.sparse_init is None
            # U(-irange, irange) weights, each kept with probability
            # include_prob and zeroed otherwise.
            W = rng.uniform(-self.irange, self.irange, weight_shape) * \
                (rng.uniform(0., 1., weight_shape) < self.include_prob)
        else:
            assert self.sparse_init is not None
            # Exactly sparse_init nonzero N(0, 1) weights per detector unit.
            W = np.zeros(weight_shape)
            for i in range(self.detector_layer_dim):
                for _ in range(self.sparse_init):
                    idx = rng.randint(0, self.input_dim)
                    # Redraw until an unused input position is found.
                    while W[idx, i] != 0:
                        idx = rng.randint(0, self.input_dim)
                    W[idx, i] = rng.randn()

        W = sharedX(W)
        W.name = self.layer_name + '_W'

        self.transformer = MatrixMul(W)

        W, = self.transformer.get_params()
        assert W.name is not None

    def get_total_state_space(self):
        """
        Return the space of this layer's total state: a CompositeSpace made
        of `self.output_space` followed by `self.h_space`.
        """
        component_spaces = (self.output_space, self.h_space)
        return CompositeSpace(component_spaces)

    def get_params(self):
        """
        Return the parameters of this layer.

        The result is the union of the transformer's parameters and the bias
        `self.b`. Both the bias and the weight variable must have been given
        a name; this is checked with assertions.
        """
        assert self.b.name is not None
        W, = self.transformer.get_params()
        assert W.name is not None
        return self.transformer.get_params().union([self.b])

    def get_weight_decay(self, coeff):
        """
        Return `coeff` times the sum of the squared weights.

        Parameters
        ----------
        coeff : float or str
            Weight decay coefficient. A string is converted with `float()`;
            anything that is not a float afterwards fails an assertion.
        """
        scale = float(coeff) if isinstance(coeff, str) else coeff
        assert isinstance(scale, float)
        (weights,) = self.transformer.get_params()
        squared_sum = T.sqr(weights).sum()
        return scale * squared_sum

    def get_weights(self):
        """
        Return the current value of the weight matrix.

        Raises
        ------
        NotImplementedError
            If the layer's input had to be reformatted (`requires_reformat`).
        """
        if not self.requires_reformat:
            (weights,) = self.transformer.get_params()
            return weights.get_value()
        # This is not really an unimplemented case: the data arrived in topo
        # space, and without access to the dataset we do not know how to
        # format the weights in design space.
        raise NotImplementedError()

    def set_weights(self, weights):
        """
        Overwrite the weight matrix with `weights`.

        The single parameter held by `self.transformer` is updated through
        its `set_value` method, mirroring `get_weights`, which reads it with
        `get_value`.
        """
        W, = self.transformer.get_params()
        W.set_value(weights)

    def set_biases(self, biases):
        """
        Overwrite the bias vector `self.b` with `biases`.

        Counterpart of `get_biases`, which reads the same shared variable
        with `get_value`.
        """
        self.b.set_value(biases)

    def get_biases(self):
        """Return the current value of the bias variable `self.b`."""
        bias_values = self.b.get_value()
        return bias_values

    def get_weights_format(self):
        """
        Return the axis labels of the weight matrix, always ('v', 'h').

        NOTE(review): presumably 'v' labels the input axis and 'h' the
        detector-unit axis, matching the (input_dim, detector_layer_dim)
        weight shape -- confirm against the callers.
        """
        axis_labels = ('v', 'h')
        return axis_labels

    def get_weights_view_shape(self):
        """
        Return the (rows, cols) grid used to display the weights.

        Each row holds the detector units of one pooling unit, so `cols` is
        `pool_size` and `rows` is `detector_layer_dim // pool_size`.

        Raises
        ------
        NotImplementedError
            If `pool_size` is 1, in which case no grid shape is imposed.
        """
        total = self.detector_layer_dim
        cols = self.pool_size
        if cols == 1:
            # Let the PatchViewer decide how to arrange the units
            # when they're not pooled
            raise NotImplementedError()
        # When they are pooled, make each pooling unit have one row.
        # Floor division keeps the row count an integer even where `/`
        # means true division.
        rows = total // cols
        return rows, cols

    def get_weights_topo(self):
        """
        Return the weights arranged topologically, in ('b', 0, 1, 'c') axes.

        The transposed weight matrix is reshaped to
        (detector_layer_dim, rows, cols, channels) using the input space's
        shape, converted from the input space's axis order to
        ('b', 0, 1, 'c'), and evaluated by compiling and calling a function
        with no inputs.

        Raises
        ------
        NotImplementedError
            If the input space is not a Conv2DSpace.
        """
        space = self.input_space
        if not isinstance(space, Conv2DSpace):
            raise NotImplementedError()

        (weights,) = self.transformer.get_params()

        topo_shape = (self.detector_layer_dim, space.shape[0],
                      space.shape[1], space.nchannels)
        per_unit = weights.T.reshape(topo_shape)

        converted = Conv2DSpace.convert(per_unit, space.axes,
                                        ('b', 0, 1, 'c'))

        return function([], converted)()

    def upward_state(self, total_state):
        """
        Return the part of the total state passed to the layer above.

        `total_state` must unpack into exactly two items; the first one is
        returned.
        """
        pooled, _detector = total_state
        return pooled

    def downward_state(self, total_state):
        """
        Return the part of the total state passed to the layer below.

        `total_state` must unpack into exactly two items; the second one is
        returned.
        """
        _pooled, detector = total_state
        return detector

    def get_monitoring_channels_from_state(self, state):
        """
        Return summary statistics of the layer's state for monitoring.

        Parameters
        ----------
        state : pair (P, H)
            Pooled and detector states. Each is reduced along axis 0, giving
            max, min, mean and range (max - min); the max, mean and min of
            each of those is then reported.

        Returns
        -------
        dict
            Maps channel names such as 'max_mean' or 'range_min' to values.
            With `pool_size == 1` only P is reported, with unprefixed names.
            Otherwise P's channels are prefixed with 'p_' and H's with 'h_'.
        """
        P, H = state

        rval = {}

        if self.pool_size == 1:
            # Pools and detectors coincide, so report them only once.
            vars_and_prefixes = [(P, '')]
        else:
            vars_and_prefixes = [(P, 'p_'), (H, 'h_')]

        for var, prefix in vars_and_prefixes:
            v_max = var.max(axis=0)
            v_min = var.min(axis=0)
            v_mean = var.mean(axis=0)
            v_range = v_max - v_min

            for key, val in [('max_max', v_max.max()),
                             ('max_mean', v_max.mean()),
                             ('max_min', v_max.min()),
                             ('min_max', v_min.max()),
                             ('min_mean', v_min.mean()),
                             ('min_min', v_min.min()),
                             ('range_max', v_range.max()),
                             ('range_mean', v_range.mean()),
                             ('range_min', v_range.min()),
                             ('mean_max', v_mean.max()),
                             ('mean_mean', v_mean.mean()),
                             ('mean_min', v_mean.min())]:
                rval[prefix + key] = val

        return rval

    def get_l1_act_cost(self, state, target, coeff, eps=None):
        """
        Return an L1 penalty on the deviation of mean activations from a target.

        For each penalized state `s` with target `t`, coefficient `c` and
        tolerance `e`, the term is `c * mean(max(|mean_0(s) - t| - e, 0))`,
        where `mean_0` averages along axis 0. Terms with a zero coefficient
        are skipped.

        Parameters
        ----------
        state : pair (P, H)
            Pooled and detector states.
        target, coeff : float, or pair of floats
            A single float when `pool_size == 1` (only the second element of
            `state` is penalized), otherwise one value per element of `state`.
        eps : float or pair of floats, optional
            Tolerance around the target; defaults to zero.

        Returns
        -------
        The accumulated penalty (the float 0. if every coefficient is zero).
        """
        import warnings

        rval = 0.

        P, H = state

        if self.pool_size == 1:
            # If the pool size is 1 then pools = detectors
            # and we should not penalize pools and detectors separately
            assert len(state) == 2
            assert isinstance(target, float)
            assert isinstance(coeff, float)
            _, state = state
            state = [state]
            target = [target]
            coeff = [coeff]
            if eps is None:
                eps = [0.]
            else:
                eps = [eps]
        else:
            assert all([len(elem) == 2 for elem in [state, target, coeff]])
            if eps is None:
                eps = [0., 0.]
            if target[1] < target[0]:
                warnings.warn(
                    "Do you really want to regularize the detector units to be sparser than the pooling units?"
                )

        for s, t, c, e in safe_zip(state, target, coeff, eps):
            assert all([isinstance(elem, float) for elem in [t, c, e]])
            if c == 0.:
                # Nothing to add; avoids building an unused expression.
                continue
            m = s.mean(axis=0)
            assert m.ndim == 1
            rval += T.maximum(abs(m - t) - e, 0.).mean() * c

        return rval

    def sample(self,
        # Draws a sample of this layer's (pooled, detector) state from the
        # state below and the message sent down by the layer above; returns
        # (p_sample, h_sample).
        # NOTE(review): this method is damaged in this copy of the file. The
        # parameter list after `self,` is missing (the body uses state_below,
        # state_above, layer_above and theano_rng), and the ValueError(...),
        # format_as(...) and max_pool_channels(...) calls are all cut off
        # before their closing parenthesis. The `else:` before `msg = None`
        # appears to be missing as well.

        if theano_rng is None:
            raise ValueError(
                "theano_rng is required; it just defaults to None so that it may appear after layer_above / state_above in the list."

        if state_above is not None:
            msg = layer_above.downward_message(state_above)
            msg = None

        if self.requires_reformat:
            state_below = self.input_space.format_as(state_below,

        # Pre-activation of the detector units.
        z = self.transformer.lmul(state_below) + self.b
        p, h, p_sample, h_sample = max_pool_channels(z, self.pool_size, msg,

        return p_sample, h_sample

    def downward_message(self, downward_state):
        """
        Return the message this layer sends to the layer below.

        The downward state is multiplied through the transformer's
        `lmul_T`. When the layer's input needed reformatting, the result is
        converted from `desired_space` back to `input_space`.
        """
        message = self.transformer.lmul_T(downward_state)
        if not self.requires_reformat:
            return message
        return self.desired_space.format_as(message, self.input_space)

    def make_state(self, num_examples, numpy_rng):
        """
        Returns a shared variable containing an actual state
        (not a mean field state) for this variable.

        The state is sampled from the layer with only its biases as input.

        Parameters
        ----------
        num_examples : int
            Batch size of the created state.
        numpy_rng : numpy random generator
            Used to seed the Theano random stream.

        Returns
        -------
        (p_state, h_state) : pair of shared variables
            Sampled pooled and detector states, named 'p_sample_shared' and
            'h_sample_shared'.
        """
        t1 = time.time()

        empty_input = self.h_space.get_origin_batch(num_examples)
        h_state = sharedX(empty_input)

        # With no input from below or above, the pre-activation is the bias.
        default_z = T.zeros_like(h_state) + self.b

        theano_rng = MRG_RandomStreams(numpy_rng.randint(2**16))

        p_exp, h_exp, p_sample, h_sample = max_pool_channels(
            z=default_z, pool_size=self.pool_size, theano_rng=theano_rng)

        assert h_sample.dtype == default_z.dtype

        p_state = sharedX(self.output_space.get_origin_batch(num_examples))

        t2 = time.time()

        f = function([], updates={p_state: p_sample, h_state: h_sample})

        t3 = time.time()

        # Run the compiled updates once so the shared variables actually
        # hold a sample.
        f()

        t4 = time.time()

        # Single-string prints give the same output under both the print
        # statement and the print function.
        print(str(self) + '.make_state took ' + str(t4 - t1))
        print('\tcompose time: ' + str(t2 - t1))
        print('\tcompile time: ' + str(t3 - t2))
        print('\texecute time: ' + str(t4 - t3))

        p_state.name = 'p_sample_shared'
        h_state.name = 'h_sample_shared'

        return p_state, h_state

    def expected_energy_term(self, state, average, state_below, average_below):
        """
        Return this layer's contribution to the energy, one value per example.

        The term is `-u^T W d - b^T d`, where `u` is the upward state of the
        layer below (`state_below`) and `d` is the downward state of this
        layer. The energy is linear in both, so `average` and
        `average_below` do not change the computation and are not used.

        Raises
        ------
        ValueError
            In debug mode, if a non-tuple `state_below` needing reformatting
            has a batch size different from `self.dbm.batch_size`.
        """
        # `reduce` is a builtin only on Python 2; functools has it on both.
        from functools import reduce

        if self.requires_reformat:
            if not isinstance(state_below, tuple):
                for sb in get_debug_values(state_below):
                    if sb.shape[0] != self.dbm.batch_size:
                        raise ValueError(
                            "self.dbm.batch_size is %d but got shape of %d" %
                            (self.dbm.batch_size, sb.shape[0]))
                    assert reduce(lambda x, y: x * y,
                                  sb.shape[1:]) == self.input_dim

            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        downward_state = self.downward_state(state)

        # Energy function is linear so it doesn't matter if we're averaging or not
        # Specifically, our terms are -u^T W d - b^T d where u is the upward state of layer below
        # and d is the downward state of this layer

        bias_term = T.dot(downward_state, self.b)
        weights_term = (self.transformer.lmul(state_below) *
                        downward_state).sum(axis=1)

        rval = -bias_term - weights_term

        assert rval.ndim == 1

        return rval

    def mf_update(self,
        # Computes the mean-field update of this layer's (pooled, detector)
        # state from the state below and the message from the layer above;
        # returns (p, h).
        # NOTE(review): this method is damaged in this copy of the file. The
        # parameter list after `self,` is missing (the body uses state_below,
        # state_above, layer_above, double_weights and iter_name). The
        # format_as(...) call is cut off, and the `else:` before `msg = None`
        # appears to be missing. Several `<variable>.name = ...` assignments
        # have lost their left-hand side (the lines starting with `=` and the
        # tail of the max_pool_channels line).


        if self.requires_reformat:
            if not isinstance(state_below, tuple):
                # Debug-mode sanity checks on batch size and flattened size.
                for sb in get_debug_values(state_below):
                    if sb.shape[0] != self.dbm.batch_size:
                        raise ValueError(
                            "self.dbm.batch_size is %d but got shape of %d" %
                            (self.dbm.batch_size, sb.shape[0]))
                    assert reduce(lambda x, y: x * y,
                                  sb.shape[1:]) == self.input_dim

            state_below = self.input_space.format_as(state_below,

        if iter_name is None:
            iter_name = 'anon'

        if state_above is not None:
            assert layer_above is not None
            msg = layer_above.downward_message(state_above)
   = 'msg_from_' + layer_above.layer_name + '_to_' + self.layer_name + '[' + iter_name + ']'
            msg = None

        if double_weights:
            # Doubling the input is equivalent to doubling the weights.
            state_below = 2. * state_below
   = self.layer_name + '_' + iter_name + '_2state'
        # Pre-activation of the detector units.
        z = self.transformer.lmul(state_below) + self.b
        if self.layer_name is not None and iter_name is not None:
   = self.layer_name + '_' + iter_name + '_z'
        p, h = max_pool_channels(z, self.pool_size, msg) = self.layer_name + '_p_' + iter_name = self.layer_name + '_h_' + iter_name

        return p, h
class L2SquareHinge(Layer):
    """
    A layer that can apply an affine transformation
    and use an L2-regularized square hinge loss.

    Parameters
    ----------
    n_classes : int
        Number of target classes (the dimension of the output space).
    layer_name : string
        Name of this layer.
    irange : float
        If specified, initialize each weight randomly in
        U(-irange, irange).
    istdev : float
        If specified, initialize each weight randomly from
        N(0, istdev).
    sparse_init : int
        If specified, initialize sparse_init weights
        for each unit from N(0,1).
    W_lr_scale : float
        Scale for weight learning rate.
    b_lr_scale : float
        Scale for bias learning rate.
    max_row_norm : float
        Maximum norm for a row of the weight matrix.
    no_affine : boolean
        If True, no affine parameters (weights) are created when the
        input space is set.
    max_col_norm : float
        Maximum norm for a column of the weight matrix.
    init_bias_target_marginals : dataset
        Take the probability distribution of the targets into account to
        intelligently initialize biases.
    binary_target_dim : int, optional
        If your targets are class labels (i.e. a binary vector) then set the
        number of targets here so that an IndexSpace of the proper dimension
        can be used as the target space. This allows the softmax to compute
        the cost much more quickly than if it needs to convert the targets
        into a VectorSpace.
    """
    def __init__(self,
        # NOTE(review): the constructor is damaged in this copy of the file.
        # The parameter list after `self,` is missing, the IndexSpace(...)
        # call and the `np.max(y.shape) !=` comparison are cut off, and
        # several `else:` lines appear to be missing (before
        # `self._has_binary_target = False`, before the np.bincount line and
        # before the final assert). The statement that normally precedes
        # `del self.self` is also not visible here.

        super(L2SquareHinge, self).__init__()

        # Allow the learning-rate scale to be given as a string.
        if isinstance(W_lr_scale, str):
            W_lr_scale = float(W_lr_scale)

        del self.self
        del self.init_bias_target_marginals

        assert isinstance(n_classes, py_integer_types)

        if binary_target_dim is not None:
            assert isinstance(binary_target_dim, py_integer_types)
            self._has_binary_target = True
            self._target_space = IndexSpace(dim=binary_target_dim,
            self._has_binary_target = False

        self.output_space = VectorSpace(n_classes)

        # Bias vector, one zero-initialized entry per class.
        self.b = sharedX(np.zeros((n_classes, )), name='hinge_b')
        if init_bias_target_marginals:
            y = init_bias_target_marginals.y
            if init_bias_target_marginals.y_labels is None:
                marginals = y.mean(axis=0)
                # compute class frequencies
                if np.max(y.shape) !=
                    raise AssertionError("Use of "
                                         "`init_bias_target_marginals` "
                                         "requires that each example has "
                                         "a single label.")
            marginals = np.bincount(y.flat) / float(y.shape[0])

            assert marginals.ndim == 1
            # Bias values derived from the class marginals by the project's
            # pseudoinverse_softmax_numpy helper.
            # NOTE(review): the computed `b` is never stored into self.b in
            # the visible code -- presumably a set_value line is missing.
            b = pseudoinverse_softmax_numpy(marginals).astype(self.b.dtype)
            assert b.ndim == 1
            assert b.dtype == self.b.dtype
            assert init_bias_target_marginals is None

    def get_lr_scalers(self):
        """
        Return the per-parameter learning rate scales of this layer.

        Returns
        -------
        OrderedDict
            Maps `self.W` to `W_lr_scale` and `self.b` to `b_lr_scale`, each
            only when the corresponding scale is not None. If the instance
            has no `b_lr_scale` attribute, it is created and set to None.
        """
        scalers = OrderedDict()

        w_scale = self.W_lr_scale
        if w_scale is not None:
            assert isinstance(w_scale, float)
            scalers[self.W] = w_scale

        # Default the bias scale to None when the attribute is absent.
        if not hasattr(self, 'b_lr_scale'):
            self.b_lr_scale = None

        b_scale = self.b_lr_scale
        if b_scale is not None:
            assert isinstance(b_scale, float)
            scalers[self.b] = b_scale

        return scalers

    def get_monitoring_channels(self):
        """
        Return the weight-norm monitoring channels (deprecated).

        Emits a warning on every call; `get_layer_monitoring_channels` is
        the replacement.

        Returns
        -------
        OrderedDict
            The min, mean and max of the row norms and of the column norms
            of the 2-D weight matrix `self.W`.
        """
        warnings.warn("Layer.get_monitoring_channels is " + \
                    "deprecated. Use get_layer_monitoring_channels " + \
                    "instead. Layer.get_monitoring_channels " + \
                    "will be removed on or after september 24th 2014",
                    stacklevel=2)

        W = self.W
        assert W.ndim == 2
        sq_W = T.sqr(W)
        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))
        return OrderedDict([
            ('row_norms_min', row_norms.min()),
            ('row_norms_mean', row_norms.mean()),
            ('row_norms_max', row_norms.max()),
            ('col_norms_min', col_norms.min()),
            ('col_norms_mean', col_norms.mean()),
            ('col_norms_max', col_norms.max()),
        ])

    def get_monitoring_channels_from_state(self, state, target=None):
        """
        Return monitoring channels computed from the layer output (deprecated).

        Emits a warning on every call; `get_layer_monitoring_channels` is
        the replacement.

        Parameters
        ----------
        state : 2-D symbolic variable
            Output of this layer; reduced along axis 1 per example.
        target : 2-D symbolic variable, optional
            Targets; when given, 'misclass' and 'nll' channels are added.

        Returns
        -------
        OrderedDict
            Weight row/column norm statistics, statistics of the per-example
            maximum output, and optionally 'misclass' and 'nll'.
        """
        warnings.warn("Layer.get_monitoring_channels_from_state is " + \
                    "deprecated. Use get_layer_monitoring_channels " + \
                    "instead. Layer.get_monitoring_channels_from_state " + \
                    "will be removed on or after september 24th 2014",
                    stacklevel=2)
        # channels that do not require state information
        W = self.W
        assert W.ndim == 2
        sq_W = T.sqr(W)
        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))
        rval = OrderedDict([
            ('row_norms_min', row_norms.min()),
            ('row_norms_mean', row_norms.mean()),
            ('row_norms_max', row_norms.max()),
            ('col_norms_min', col_norms.min()),
            ('col_norms_mean', col_norms.mean()),
            ('col_norms_max', col_norms.max()),
        ])

        # Largest output per example.
        mx = state.max(axis=1)
        rval.update(
            OrderedDict([('mean_max_class', mx.mean()),
                         ('max_max_class', mx.max()),
                         ('min_max_class', mx.min())]))
        if target is not None:
            y_hat = T.argmax(state, axis=1)
            y = T.argmax(target, axis=1)
            misclass = T.neq(y, y_hat).mean()
            misclass = T.cast(misclass, config.floatX)
            rval['misclass'] = misclass
            rval['nll'] = self.cost(Y_hat=state, Y=target)

        return rval

    def get_layer_monitoring_channels(self,
        # Returns an OrderedDict of monitoring channels: weight row/column
        # norm statistics, plus output statistics when a state (or a
        # state_below to propagate) is available, plus 'misclass' and 'nll'
        # when targets are given.
        # NOTE(review): this method is damaged in this copy of the file. The
        # parameter list after `self,` is missing (the body uses state_below,
        # state and targets), the `rval = OrderedDict([` literal is never
        # closed, and the inner OrderedDict(...) carries an extra closing
        # parenthesis, presumably from a lost `rval.update(` line.

        # channels that does not require state information
        W = self.W
        assert W.ndim == 2
        sq_W = T.sqr(W)
        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))
        rval = OrderedDict([
            ('row_norms_min', row_norms.min()),
            ('row_norms_mean', row_norms.mean()),
            ('row_norms_max', row_norms.max()),
            ('col_norms_min', col_norms.min()),
            ('col_norms_mean', col_norms.mean()),
            ('col_norms_max', col_norms.max()),

        if (state_below is not None) or (state is not None):
            # Compute the layer output when only its input was supplied.
            if state is None:
                state = self.fprop(state_below)
            # Largest output per example.
            mx = state.max(axis=1)
                OrderedDict([('mean_max_class', mx.mean()),
                             ('max_max_class', mx.max()),
                             ('min_max_class', mx.min())]))

            if targets is not None:
                y_hat = T.argmax(state, axis=1)
                y = T.argmax(targets, axis=1)
                misclass = T.neq(y, y_hat).mean()
                misclass = T.cast(misclass, config.floatX)
                rval['misclass'] = misclass
                rval['nll'] = self.cost(Y_hat=state, Y=targets)
        return rval

    def set_input_space(self, space):
        """
        Record the input space and, unless `no_affine` is set, create weights.

        Sets `input_space`, `input_dim`, `needs_reformat`, `desired_space`
        and `_params`; when `no_affine` is false it also creates `self.W`
        with shape (input_dim, n_classes), initialized according to
        whichever of `irange`, `istdev` or `sparse_init` is set.

        Parameters
        ----------
        space : Space
            The space of the data fed to this layer.

        Raises
        ------
        TypeError
            If `space` is not a Space.
        """
        self.input_space = space

        if not isinstance(space, Space):
            raise TypeError("Expected Space, got " + str(space) + " of type " +
                            str(type(space)))
        self.input_dim = space.get_total_dimension()
        self.needs_reformat = not isinstance(space, VectorSpace)
        desired_dim = self.input_dim
        self.desired_space = VectorSpace(desired_dim)

        if not self.needs_reformat:
            assert self.desired_space == self.input_space

        rng = self.mlp.rng
        if self.no_affine:
            self._params = []
        else:
            print(self.input_dim, self.n_classes)
            if self.irange is not None:
                assert self.istdev is None
                assert self.sparse_init is None
                W = rng.uniform(-self.irange, self.irange,
                                (self.input_dim, self.n_classes))
            elif self.istdev is not None:
                assert self.sparse_init is None
                W = rng.randn(self.input_dim, self.n_classes) * self.istdev
            else:
                assert self.sparse_init is not None
                # Exactly sparse_init nonzero N(0, 1) weights per class.
                W = np.zeros((self.input_dim, self.n_classes))
                for i in range(self.n_classes):
                    for _ in range(self.sparse_init):
                        idx = rng.randint(0, self.input_dim)
                        # Redraw until an unused input position is found.
                        while W[idx, i] != 0.:
                            idx = rng.randint(0, self.input_dim)
                        W[idx, i] = rng.randn()

            self.W = sharedX(W, 'hinge_W')

            self._params = [self.b, self.W]

    def get_weights_topo(self):
        """
        Return the weights arranged topologically, in ('b', 0, 1, 'c') axes.

        The transposed value of `self.W` is converted from `desired_space`
        to the input space and then reordered from the input space's axes to
        ('b', 0, 1, 'c').

        Raises
        ------
        NotImplementedError
            If the input space is not a Conv2DSpace.
        """
        space = self.input_space
        if not isinstance(space, Conv2DSpace):
            raise NotImplementedError()
        flat_weights = self.W.get_value().T
        topo_weights = self.desired_space.np_format_as(flat_weights, space)
        return Conv2DSpace.convert_numpy(topo_weights, space.axes,
                                         ('b', 0, 1, 'c'))

    def get_weights(self):
        """
        Return the current value of the weight matrix `self.W`.

        Raises
        ------
        NotImplementedError
            If the input space is not a VectorSpace.
        """
        if isinstance(self.input_space, VectorSpace):
            return self.W.get_value()
        raise NotImplementedError()

    def set_weights(self, weights):
        """
        Overwrite the weight matrix `self.W` with `weights`.

        Counterpart of `get_weights`, which reads the same shared variable
        with `get_value`.
        """
        self.W.set_value(weights)

    def set_biases(self, biases):
        """
        Overwrite the bias vector `self.b` with `biases`.

        Counterpart of `get_biases`, which reads the same shared variable
        with `get_value`.
        """
        self.b.set_value(biases)

    def get_biases(self):
        """Return the current value of the bias variable `self.b`."""
        bias_values = self.b.get_value()
        return bias_values

    def get_weights_format(self):
        """
        Return the axis labels of the weight matrix, always ('v', 'h').

        NOTE(review): presumably 'v' labels the input axis and 'h' the
        output axis, matching the (input_dim, n_classes) weight shape --
        confirm against the callers.
        """
        axis_labels = ('v', 'h')
        return axis_labels

    def fprop(self, state_below):
        """
        Return the linear prediction `state_below . W + b`.

        Parameters
        ----------
        state_below : symbolic batch in `input_space`
            Converted to `desired_space` first when the input space is not a
            VectorSpace; must be 2-D afterwards.

        Returns
        -------
        2-D symbolic variable with one column per class.
        """
        ## Precondition
        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)
        assert state_below.ndim == 2
        assert self.W.ndim == 2

        ## Linear prediction
        rval = T.dot(state_below, self.W) + self.b
        return rval

    def hinge_cost(self, Y, Y_hat):
        """
        Return the per-example regularized squared hinge cost.

        For each element the term `max(0, 1 - (Y - Y_hat)) ** 2` is computed,
        the scalar penalty `self.C * ||W||_2` is added to every element, and
        the result is summed along axis 1.

        Parameters
        ----------
        Y : symbolic targets
        Y_hat : symbolic predictions, as returned by `fprop`
        """
        weight_penalty = self.C * self.W.norm(2)
        squared_hinge = T.maximum(0, 1 - (Y - Y_hat))**2.
        per_example = (weight_penalty + squared_hinge).sum(axis=1)
        return per_example

    def cost(self, Y, Y_hat):
        """Return the mean of `hinge_cost(Y, Y_hat)`."""
        per_example = self.hinge_cost(Y, Y_hat)
        return per_example.mean()

    # @wraps(Layer.cost_matrix)
    # def cost_matrix(self, Y, Y_hat):
    #     # cost = self._cost(Y, Y_hat)
    #     # if self._has_binary_target:
    #     #     flat_Y = Y.flatten()
    #     #     flat_matrix = T.alloc(0, (Y.shape[0]*cost.shape[1]))
    #     #     flat_indices = flat_Y + T.extra_ops.repeat(
    #     #         T.arange(Y.shape[0])*cost.shape[1], Y.shape[1]
    #     #     )
    #     #     cost = T.set_subtensor(flat_matrix[flat_indices], flat_Y)

    #     # return cost
    #     return None

    def get_weight_decay(self, coeff):
        """
        Return `coeff` times the sum of the squared entries of `self.W`.

        Parameters
        ----------
        coeff : float, str, or object with a `dtype` attribute
            Weight decay coefficient. A string is converted with `float()`.
        """
        scale = float(coeff) if isinstance(coeff, str) else coeff
        assert isinstance(scale, float) or hasattr(scale, 'dtype')
        squared_sum = T.sqr(self.W).sum()
        return scale * squared_sum

    def get_l1_weight_decay(self, coeff):
        """
        Return `coeff` times the sum of the absolute entries of `self.W`.

        Parameters
        ----------
        coeff : float, str, or object with a `dtype` attribute
            L1 decay coefficient. A string is converted with `float()`.
        """
        scale = float(coeff) if isinstance(coeff, str) else coeff
        assert isinstance(scale, float) or hasattr(scale, 'dtype')
        absolute_sum = abs(self.W).sum()
        return scale * absolute_sum

    def _modify_updates(self, updates):
        if self.no_affine:
        if self.max_row_norm is not None:
            W = self.W
            if W in updates:
                updated_W = updates[W]
                row_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=1))
                desired_norms = T.clip(row_norms, 0, self.max_row_norm)
                scales = desired_norms / (1e-7 + row_norms)
                updates[W] = updated_W * scales.dimshuffle(0, 'x')
        if self.max_col_norm is not None:
            assert self.max_row_norm is None
            W = self.W
            if W in updates:
                updated_W = updates[W]
                col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0))
                desired_norms = T.clip(col_norms, 0, self.max_col_norm)
                updates[W] = updated_W * (desired_norms / (1e-7 + col_norms))