Example No. 1
def get_adadelta_update(params, grads, rho, eps):
    # E[g^2]_{t-1}
    E_g_square = []
    for p in params:
        tmp = theano.shared(np.zeros(p.get_value().shape, dtype=theano.config.floatX), borrow=True)
        E_g_square.append(tmp)
    # E[g^2]_t = rho * E[g^2]_{t-1} + (1 - rho) * g_t^2
    E_g_square_next = []
    for e, g in izip(E_g_square, grads):
        tmp = rho * e + (1.0 - rho) * (g**2)
        E_g_square_next.append(tmp)
    # E[dW^2]_{t-1}
    E_dW_square = []
    for p in params:
        tmp = theano.shared(np.zeros(p.get_value().shape, dtype=theano.config.floatX), borrow=True)
        E_dW_square.append(tmp)
    # dW_t = - {sqrt(E[dW^2]_{t-1} + eps) / sqrt(E[g^2]_t + eps)} * g_t
    dW = []
    for ew, eg, g in izip(E_dW_square, E_g_square_next, grads):
        tmp = - (T.sqrt(ew + eps) / T.sqrt(eg + eps)) * g
        dW.append(tmp)
    # E[dW^2]_t = rho * E[dW^2]_{t-1} + (1 - rho) * dW_t^2
    E_dW_square_next = []
    for ew, d in izip(E_dW_square, dW):
        tmp = rho * ew + (1.0 - rho) * (d**2)
        E_dW_square_next.append(tmp)

    E_g_square_updates = zip(E_g_square, E_g_square_next)
    E_dW_square_updates = zip(E_dW_square, E_dW_square_next)
    params_updates = []
    for p, d in izip(params, dW):
        # W_t = W_{t-1} + dW
        params_updates.append((p, p + d))
    return E_g_square_updates + E_dW_square_updates + params_updates
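
The update list returned above plugs straight into theano.function. A minimal usage sketch, assuming the snippet's implicit imports (numpy as np, theano, theano.tensor as T, itertools.izip) plus a toy quadratic cost and illustrative rho and eps values:

import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
W = theano.shared(np.zeros((5, 3), dtype=theano.config.floatX), name='W')
b = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='b')
params = [W, b]

# Toy quadratic cost so the sketch is self-contained.
cost = T.sum((T.dot(x, W) + b) ** 2)
grads = T.grad(cost, params)

updates = get_adadelta_update(params, grads, rho=0.95, eps=1e-6)
train_fn = theano.function([x], cost, updates=updates)

# One AdaDelta step on a random batch; the accumulators are refreshed as well.
train_fn(np.random.randn(10, 5).astype(theano.config.floatX))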
Example No. 2
def test_adadelta():
    """
    Make sure that learning_rule.AdaDelta obtains the same parameter values as
    with a hand-crafted AdaDelta implementation, given a dummy model and
    learning rate scaler for each parameter.

    Reference:
    "AdaDelta: An Adaptive Learning Rate Method", Matthew D. Zeiler.
    """

    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    decay = 0.95

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=AdaDelta(decay),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    state = {}
    for param in model.get_params():
        param_shape = param.get_value().shape
        state[param] = {}
        state[param]['g2'] = np.zeros(param_shape)
        state[param]['dx2'] = np.zeros(param_shape)

    def adadelta_manual(model, state):
        inc = []
        rval = []
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            # begin adadelta
            pstate['g2'] = decay * pstate['g2'] + (1 - decay) * param_val ** 2
            rms_g_t = np.sqrt(pstate['g2'] + scale * learning_rate)
            rms_dx_tm1 = np.sqrt(pstate['dx2'] + scale * learning_rate)
            dx_t = -rms_dx_tm1 / rms_g_t * param_val
            pstate['dx2'] = decay * pstate['dx2'] + (1 - decay) * dx_t ** 2
            rval += [param_val + dx_t]
        return rval

    manual = adadelta_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))

    manual = adadelta_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param in
               izip(manual, model.get_params()))
Example No. 3
def test_adagrad():
    """
    Make sure that learning_rule.AdaGrad obtains the same parameter values as
    with a hand-crafted AdaGrad implementation, given a dummy model and
    learning rate scaler for each parameter.

    Reference:
    "Adaptive subgradient methods for online learning and
    stochastic optimization", Duchi J, Hazan E, Singer Y.
    """

    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=AdaGrad(),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    state = {}
    for param in model.get_params():
        param_shape = param.get_value().shape
        state[param] = {}
        state[param]['sg2'] = np.zeros(param_shape)

    def adagrad_manual(model, state):
        rval = []
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            # begin adagrad
            pstate['sg2'] += param_val ** 2
            dx_t = - (scale * learning_rate
                      / np.sqrt(pstate['sg2'])
                      * param_val)
            rval += [param_val + dx_t]
        return rval

    manual = adagrad_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))

    manual = adagrad_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param in
               izip(manual, model.get_params()))
Example No. 4
    def set_input_space(self, space):
        """ Note: this function will reset the parameters! """

        self.input_space = space

        if not isinstance(space, Conv2DSpace):
            raise BadInputSpaceError(self.__class__.__name__ +
                                     ".set_input_space "
                                     "expected a Conv2DSpace, got " +
                                     str(space) + " of type " +
                                     str(type(space)))

        rng = self.get_mlp().rng


        if self.pad != (0,0):
            output_shape = \
                [int(np.ceil((i_sh + 2. * k_pad - k_sh) / float(k_st))) + 1
                 for i_sh, k_sh, k_st, k_pad in izip(self.input_space.shape,
                                                     self.kernel_shape,
                                                     self.kernel_stride,
                                                     self.pad)]

        elif self.border_mode == 'valid':
            output_shape = [(self.input_space.shape[0] - self.kernel_shape[0])
                            / self.kernel_stride[0] + 1,
                            (self.input_space.shape[1] - self.kernel_shape[1])
                            / self.kernel_stride[1] + 1]
        elif self.border_mode == 'full':
            output_shape = [(self.input_space.shape[0] + self.kernel_shape[0])
                            / self.kernel_stride[0] - 1,
                            (self.input_space.shape[1] + self.kernel_shape[1])
                            / self.kernel_stride[1] - 1]

        print "In:", self.layer_name, self.input_space.shape, self.kernel_shape, self.kernel_stride, self.pad
        print "Out:", self.layer_name, output_shape


        self.detector_space = Conv2DSpace(shape=output_shape,
                                          num_channels=self.output_channels,
                                          axes=('b', 'c', 0, 1))

        self.initialize_transformer(rng)

        W, = self.transformer.get_params()
        W.name = self.layer_name + '_W'

        assert self.tied_b
        if self.tied_b:
            self.b = sharedX(np.zeros((self.detector_space.num_channels)) +
                             self.init_bias)
        else:
            self.b = sharedX(self.detector_space.get_origin() + self.init_bias)

        self.b.name = self.layer_name + '_b'

        logger.info('Input shape: {0}'.format(self.input_space.shape))
        logger.info('Detector space: {0}'.format(self.detector_space.shape))

        self.initialize_output_space()
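
The three branches above implement the usual convolution output-shape formulas. A quick arithmetic check with assumed numbers (32x32 input, 5x5 kernel, stride 1, padding 2), independent of the layer code:

import numpy as np

i_sh, k_sh, k_st, k_pad = 32, 5, 1, 2

# Padded case: ceil((i + 2*pad - k) / stride) + 1
padded = int(np.ceil((i_sh + 2. * k_pad - k_sh) / float(k_st))) + 1

# 'valid' border mode: (i - k) // stride + 1
valid = (i_sh - k_sh) // k_st + 1

# 'full' border mode: (i + k) // stride - 1
full = (i_sh + k_sh) // k_st - 1

assert (padded, valid, full) == (32, 28, 36)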
Example No. 5
    def get_gradients(self, model, data, ** kwargs):
        """
        Provides the gradients of the cost function with respect to the model
        parameters.

        These are not necessarily those obtained by theano.tensor.grad
        --you may wish to use approximate or even intentionally incorrect
        gradients in some cases.

        Parameters
        ----------
        model : a pylearn2 Model instance
        data : a batch in cost.get_data_specs() form
        kwargs : dict
            Optional extra arguments, not used by the base class.

        Returns
        -------
        gradients : OrderedDict
            a dictionary mapping from the model's parameters
            to their gradients
            The default implementation is to compute the gradients
            using T.grad applied to the value returned by expr.
            However, subclasses may return other values for the gradient.
            For example, an intractable cost may return a sampling-based
            approximation to its gradient.
        updates : OrderedDict
            a dictionary mapping shared variables to updates that must
            be applied to them each time these gradients are computed.
            This is to facilitate computation of sampling-based approximate
            gradients.
            The parameters should never appear in the updates dictionary.
            This would imply that computing their gradient changes
            their value, thus making the gradient value outdated.
        """

        try:
            cost = self.expr(model=model, data=data, **kwargs)
        except TypeError:
            # If anybody knows how to add type(self) to the exception message
            # but still preserve the stack trace, please do so
            # The current code does neither
            message = "Error while calling " + str(type(self)) + ".expr"
            reraise_as(TypeError(message))

        if cost is None:
            raise NotImplementedError(str(type(self)) +
                                      " represents an intractable cost and "
                                      "does not provide a gradient "
                                      "approximation scheme.")

        params = list(model.get_params())

        grads = T.grad(cost, params, disconnected_inputs='ignore')

        gradients = OrderedDict(izip(params, grads))

        updates = OrderedDict()

        return gradients, updates
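
The (gradients, updates) pair returned here is meant to be folded into a single theano.function by the training algorithm. A minimal sketch of that calling pattern, using toy stand-ins for the model parameter and cost expression rather than real pylearn2 objects:

from collections import OrderedDict

import numpy as np
import theano
import theano.tensor as T

# Toy stand-ins (assumptions, not pylearn2 objects).
W = theano.shared(np.zeros((4, 2), dtype=theano.config.floatX), name='W')
x = T.matrix('x')
cost_expr = T.sum(T.dot(x, W) ** 2)

# The shape of what get_gradients returns: params -> grads, plus any
# extra updates the cost needs (empty here).
gradients = OrderedDict([(W, T.grad(cost_expr, W))])
extra_updates = OrderedDict()

# The training algorithm folds both into one compiled function.
learning_rate = 0.01  # assumed value
sgd_updates = OrderedDict(extra_updates)
for param, grad in gradients.items():
    sgd_updates[param] = param - learning_rate * grad

train_fn = theano.function([x], cost_expr, updates=sgd_updates)
train_fn(np.ones((3, 4), dtype=theano.config.floatX))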
Example No. 6
    def get_gradients(self, model, data, ** kwargs):
        """
        Provides the gradients of the cost function with respect to the model
        parameters.

        These are not necessarily those obtained by theano.tensor.grad
        --you may wish to use approximate or even intentionally incorrect
        gradients in some cases.

        Parameters
        ----------
        model : a pylearn2 Model instance
        data : a batch in cost.get_data_specs() form
        kwargs : dict
            Optional extra arguments, not used by the base class.

        Returns
        -------
        gradients : OrderedDict
            a dictionary mapping from the model's parameters
            to their gradients
            The default implementation is to compute the gradients
            using T.grad applied to the value returned by expr.
            However, subclasses may return other values for the gradient.
            For example, an intractable cost may return a sampling-based
            approximation to its gradient.
        updates : OrderedDict
            a dictionary mapping shared variables to updates that must
            be applied to them each time these gradients are computed.
            This is to facilitate computation of sampling-based approximate
            gradients.
            The parameters should never appear in the updates dictionary.
            This would imply that computing their gradient changes
            their value, thus making the gradient value outdated.
        """

        try:
            cost, mask = self.expr(model=model, data=data, **kwargs)
        except TypeError:
            # If anybody knows how to add type(self) to the exception message
            # but still preserve the stack trace, please do so
            # The current code does neither
            message = "Error while calling " + str(type(self)) + ".expr"
            reraise_as(TypeError(message))

        if cost is None:
            raise NotImplementedError(str(type(self)) +
                                      " represents an intractable cost and "
                                      "does not provide a gradient "
                                      "approximation scheme.")

        params = list(model.get_params())

        grads = T.grad(cost, params, disconnected_inputs='ignore')

        gradients = OrderedDict(izip(params, grads))

        updates = OrderedDict()

        return gradients, updates
Example No. 7
    def set_input_space(self, space):
        """ Note: this function will reset the parameters! """

        self.input_space = space

        if not isinstance(space, Conv2DSpace):
            raise BadInputSpaceError(self.__class__.__name__ +
                                     ".set_input_space "
                                     "expected a Conv2DSpace, got " +
                                     str(space) + " of type " +
                                     str(type(space)))

        rng = self.get_mlp().rng


        if self.pad != (0,0):
            output_shape = \
                [int(np.ceil((i_sh + 2. * k_pad - k_sh) / float(k_st))) + 1
                 for i_sh, k_sh, k_st, k_pad in izip(self.input_space.shape,
                                                     self.kernel_shape,
                                                     self.kernel_stride,
                                                     self.pad)]

        elif self.border_mode == 'valid':
            output_shape = [(self.input_space.shape[0] - self.kernel_shape[0])
                            / self.kernel_stride[0] + 1,
                            (self.input_space.shape[1] - self.kernel_shape[1])
                            / self.kernel_stride[1] + 1]
        elif self.border_mode == 'full':
            output_shape = [(self.input_space.shape[0] + self.kernel_shape[0])
                            / self.kernel_stride[0] - 1,
                            (self.input_space.shape[1] + self.kernel_shape[1])
                            / self.kernel_stride[1] - 1]

        print "In:", self.layer_name, self.input_space.shape, self.kernel_shape, self.kernel_stride, self.pad
        print "Out:", self.layer_name, output_shape


        self.detector_space = Conv2DSpace(shape=output_shape,
                                          num_channels=self.output_channels,
                                          axes=('b', 'c', 0, 1))

        self.initialize_transformer(rng)

        W, = self.transformer.get_params()
        W.name = self.layer_name + '_W'

        #assert self.tied_b
        if self.tied_b:
            self.b = sharedX(np.zeros((self.detector_space.num_channels)) +
                             self.init_bias)
        else:
            self.b = sharedX(self.detector_space.get_origin() + self.init_bias)

        self.b.name = self.layer_name + '_b'

        logger.info('Input shape: {0}'.format(self.input_space.shape))
        logger.info('Detector space: {0}'.format(self.detector_space.shape))

        self.initialize_output_space()
Example No. 8
def build_stacked_ae(nvis,
                     nhids,
                     act_enc,
                     act_dec,
                     tied_weights=False,
                     irange=1e-3,
                     rng=None,
                     corruptor=None,
                     contracting=False):
    """
    .. todo::

        WRITEME properly

    Allocate a stack of autoencoders.
    """
    rng = make_np_rng(rng, which_method='randn')
    layers = []
    final = {}
    # "Broadcast" arguments if they are singular, or accept sequences if
    # they are the same length as nhids
    for c in [
            'corruptor', 'contracting', 'act_enc', 'act_dec', 'tied_weights',
            'irange'
    ]:
        if type(locals()[c]) is not str and hasattr(locals()[c], '__len__'):
            assert len(nhids) == len(locals()[c])
            final[c] = locals()[c]
        else:
            final[c] = [locals()[c]] * len(nhids)
    # The number of visible units in each layer is the initial input
    # size and the first k-1 hidden unit sizes.
    nviss = [nvis] + nhids[:-1]
    seq = izip(
        nhids,
        nviss,
        final['act_enc'],
        final['act_dec'],
        final['corruptor'],
        final['contracting'],
        final['tied_weights'],
        final['irange'],
    )
    # Create each layer.
    for (nhid, nvis, act_enc, act_dec, corr, cae, tied, ir) in seq:
        args = (nvis, nhid, act_enc, act_dec, tied, ir, rng)
        if cae and corr is not None:
            raise ValueError("Can't specify denoising and contracting "
                             "objectives simultaneously")
        elif cae:
            autoenc = ContractiveAutoencoder(*args)
        elif corr is not None:
            autoenc = DenoisingAutoencoder(corr, *args)
        else:
            autoenc = Autoencoder(*args)
        layers.append(autoenc)

    # Create the stack
    return StackedBlocks(layers)
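
A hedged usage sketch: the layer sizes, activation name, and irange below are illustrative values, and they assume the Autoencoder classes imported by this module accept 'sigmoid' as an activation spec. Scalars are broadcast to every layer by the loop above, while list arguments must match len(nhids):

# Two-layer stack: 784 -> 500 -> 200 with sigmoid encoders and decoders.
stack = build_stacked_ae(nvis=784,
                         nhids=[500, 200],
                         act_enc='sigmoid',
                         act_dec='sigmoid',
                         tied_weights=True,
                         irange=0.05)
# stack is a StackedBlocks with one Autoencoder per entry of nhids.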
Example No. 9
def test_momentum():
    """
    Make sure that learning_rule.Momentum obtains the same parameter values as
    with a hand-crafted sgd w/ momentum implementation, given a dummy model and
    learning rate scaler for each parameter.
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1,), (9,), (8, 7), (6, 5, 4), (3, 2, 2, 2)]

    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    learning_rate = .001
    momentum = 0.5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=Momentum(momentum),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    manual = [param.get_value() for param in model.get_params()]
    inc = [-learning_rate * scale for scale in scales]
    manual = [param + i for param, i in izip(manual, inc)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param in
               izip(manual, model.get_params()))

    manual = [param - learning_rate * scale + i * momentum
              for param, scale, i in izip(manual, scales, inc)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param in
               izip(manual, model.get_params()))
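
The manual bookkeeping above works because the effective cost is SumOfParams (the DummyCost term is weighted by 0.), so the gradient with respect to every parameter element is 1 and each step is just -learning_rate * lr_scaler folded into the momentum recurrence. A small NumPy check of that recurrence for a single element, using the same learning_rate and momentum and one of the scales:

import numpy as np

learning_rate, momentum, scale = .001, 0.5, .05
grad = 1.0        # d(sum of params)/d(element) == 1
param, vel = 0.0, 0.0

for step in range(2):
    vel = momentum * vel - learning_rate * scale * grad
    param += vel

# The total displacement after two steps is -lr * scale * (2 + momentum),
# matching the hand computation in the test above.
assert np.isclose(param, -learning_rate * scale * (2 + momentum))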
Example No. 10
    def __init__(self, autoencoders):
        super(DeepComposedAutoencoder, self).__init__()
        self.fn = None
        self.cpu_only = False

        assert all(pre.get_output_space().dim == post.get_input_space().dim
                   for pre, post in izip(autoencoders[:-1], autoencoders[1:]))

        self.autoencoders = list(autoencoders)
        self.input_space = autoencoders[0].get_input_space()
        self.output_space = autoencoders[-1].get_output_space()
Example No. 11
    def __init__(self, autoencoders):
        super(DeepComposedAutoencoder, self).__init__()
        self.fn = None
        self.cpu_only = False

        assert all(pre.get_output_space().dim == post.get_input_space().dim
                   for pre, post in izip(autoencoders[:-1], autoencoders[1:]))

        self.autoencoders = list(autoencoders)
        self.input_space = autoencoders[0].get_input_space()
        self.output_space = autoencoders[-1].get_output_space()
Example No. 12
 def adagrad_manual(model, state):
     rval = []
     for scale, param in izip(scales, model.get_params()):
         pstate = state[param]
         param_val = param.get_value()
         # begin adagrad
         pstate['sg2'] += param_val**2
         dx_t = -(scale * learning_rate / np.sqrt(pstate['sg2']) *
                  param_val)
         rval += [param_val + dx_t]
     return rval
Example No. 13
 def adagrad_manual(model, state):
     rval = []
     for scale, param in izip(scales, model.get_params()):
         pstate = state[param]
         param_val = param.get_value()
         # begin adagrad
         pstate['sg2'] += param_val ** 2
         dx_t = - (scale * learning_rate
                   / np.sqrt(pstate['sg2'])
                   * param_val)
         rval += [param_val + dx_t]
     return rval
Example No. 14
 def get_gradients(self, model, data, ** kwargs):
     """
     Overrides Cost.get_gradients so we can inject our theano.Op.
     This will do a separate callback for each model param;
     consider rewriting your model to have one param.
     """
     srng = RandomStreams(seed=232)
     params = list(model.get_params())
     grads = [OverwriteOp(self.grad, model)(
         srng.uniform(size=i.shape, dtype=theano.config.floatX), data)
         for i in params]
     gradients = OrderedDict(izip(params, grads))
     updates = OrderedDict()
     return gradients, updates
Example No. 15
def test_rmsprop():
    """
    Make sure that learning_rule.RMSProp obtains the same parameter values as
    with a hand-crafted RMSProp implementation, given a dummy model and
    learning rate scaler for each parameter.
    """

    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1,), (9,), (8, 7), (6, 5, 4), (3, 2, 2, 2)]

    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    learning_rate = .001
    decay = 0.90
    max_scaling = 1e5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=RMSProp(decay),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    state = {}
    for param in model.get_params():
        param_shape = param.get_value().shape
        state[param] = {}
        state[param]['g2'] = np.zeros(param_shape)

    def rmsprop_manual(model, state):
        inc = []
        rval = []
        epsilon = 1. / max_scaling
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            # begin rmsprop
            pstate['g2'] = decay * pstate['g2'] + (1 - decay) * param_val ** 2
            rms_g_t = np.maximum(np.sqrt(pstate['g2']), epsilon)
            dx_t = - scale * learning_rate / rms_g_t * param_val
            rval += [param_val + dx_t]
        return rval

    manual = rmsprop_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))
Example No. 16
def test_rmsprop():
    """
    Make sure that learning_rule.RMSProp obtains the same parameter values as
    with a hand-crafted RMSProp implementation, given a dummy model and
    learning rate scaler for each parameter.
    """

    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1, ), (9, ), (8, 7), (6, 5, 4), (3, 2, 2, 2)]

    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    learning_rate = .001
    decay = 0.90
    max_scaling = 1e5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=RMSProp(decay),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    state = {}
    for param in model.get_params():
        param_shape = param.get_value().shape
        state[param] = {}
        state[param]['g2'] = np.zeros(param_shape)

    def rmsprop_manual(model, state):
        inc = []
        rval = []
        epsilon = 1. / max_scaling
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            # begin rmsprop
            pstate['g2'] = decay * pstate['g2'] + (1 - decay) * param_val**2
            rms_g_t = np.maximum(np.sqrt(pstate['g2']), epsilon)
            dx_t = -scale * learning_rate / rms_g_t * param_val
            rval += [param_val + dx_t]
        return rval

    manual = rmsprop_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(
        np.allclose(manual_param, sgd_param.get_value())
        for manual_param, sgd_param in izip(manual, model.get_params()))
Example No. 17
 def rmsprop_manual(model, state):
     inc = []
     rval = []
     epsilon = 1. / max_scaling
     for scale, param in izip(scales, model.get_params()):
         pstate = state[param]
         param_val = param.get_value()
         # begin rmsprop
         pstate['g2'] = decay * pstate['g2'] + (1 - decay) * param_val ** 2
         rms_g_t = np.maximum(np.sqrt(pstate['g2']), epsilon)
         dx_t = - scale * learning_rate / rms_g_t * param_val
         rval += [param_val + dx_t]
     return rval
Example No. 18
    def get_gradients(self, model, data, **kwargs):
        cost, neg_v = self._cost(model, data, **kwargs)

        params = list(model.get_params())

        grads = T.grad(cost, params, disconnected_inputs='ignore',
                       consider_constant=[neg_v])

        gradients = OrderedDict(izip(params, grads))

        updates = OrderedDict()

        return gradients, updates
Example No. 19
 def adadelta_manual(model, state):
     rval = []
     for scale, param in izip(scales, model.get_params()):
         pstate = state[param]
         param_val = param.get_value()
         # begin adadelta
         pstate['g2'] = decay * pstate['g2'] + (1 - decay) * param_val ** 2
         rms_g_t = np.sqrt(pstate['g2'] + scale * learning_rate)
         rms_dx_tm1 = np.sqrt(pstate['dx2'] + scale * learning_rate)
         dx_t = -rms_dx_tm1 / rms_g_t * param_val
         pstate['dx2'] = decay * pstate['dx2'] + (1 - decay) * dx_t ** 2
         rval += [param_val + dx_t]
     return rval
Example No. 20
    def get_gradients(self, model, data, **kwargs):
        cost, neg_v = self._cost(model, data, **kwargs)

        params = list(model.get_params())

        grads = T.grad(cost, params, disconnected_inputs='ignore',
                       consider_constant=[neg_v])

        gradients = OrderedDict(izip(params, grads))

        updates = OrderedDict()

        return gradients, updates
Example No. 21
 def rmsprop_manual(model, state):
     inc = []
     rval = []
     epsilon = 1. / max_scaling
     for scale, param in izip(scales, model.get_params()):
         pstate = state[param]
         param_val = param.get_value()
         # begin rmsprop
         pstate['g2'] = decay * pstate['g2'] + (1 - decay) * param_val**2
         rms_g_t = np.maximum(np.sqrt(pstate['g2']), epsilon)
         dx_t = -scale * learning_rate / rms_g_t * param_val
         rval += [param_val + dx_t]
     return rval
Example No. 22
 def adadelta_manual(model, state):
     rval = []
     for scale, param in izip(scales, model.get_params()):
         pstate = state[param]
         param_val = param.get_value()
         # begin adadelta
         pstate['g2'] = decay * pstate['g2'] + (1 - decay) * param_val**2
         rms_g_t = np.sqrt(pstate['g2'] + scale * learning_rate)
         rms_dx_tm1 = np.sqrt(pstate['dx2'] + scale * learning_rate)
         dx_t = -rms_dx_tm1 / rms_g_t * param_val
         pstate['dx2'] = decay * pstate['dx2'] + (1 - decay) * dx_t**2
         rval += [param_val + dx_t]
     return rval
Example No. 23
def build_stacked_ae(nvis, nhids, act_enc, act_dec,
                     tied_weights=False, irange=1e-3, rng=None,
                     corruptor=None, contracting=False):
    """
    .. todo::

        WRITEME properly

    Allocate a stack of autoencoders.
    """
    rng = make_np_rng(rng, which_method='randn')
    layers = []
    final = {}
    # "Broadcast" arguments if they are singular, or accept sequences if
    # they are the same length as nhids
    for c in ['corruptor', 'contracting', 'act_enc', 'act_dec',
              'tied_weights', 'irange']:
        if type(locals()[c]) is not str and hasattr(locals()[c], '__len__'):
            assert len(nhids) == len(locals()[c])
            final[c] = locals()[c]
        else:
            final[c] = [locals()[c]] * len(nhids)
    # The number of visible units in each layer is the initial input
    # size and the first k-1 hidden unit sizes.
    nviss = [nvis] + nhids[:-1]
    seq = izip(nhids, nviss,
        final['act_enc'],
        final['act_dec'],
        final['corruptor'],
        final['contracting'],
        final['tied_weights'],
        final['irange'],
    )
    # Create each layer.
    for (nhid, nvis, act_enc, act_dec, corr, cae, tied, ir) in seq:
        args = (nvis, nhid, act_enc, act_dec, tied, ir, rng)
        if cae and corr is not None:
            raise ValueError("Can't specify denoising and contracting "
                             "objectives simultaneously")
        elif cae:
            autoenc = ContractiveAutoencoder(*args)
        elif corr is not None:
            autoenc = DenoisingAutoencoder(corr, *args)
        else:
            autoenc = Autoencoder(*args)
        layers.append(autoenc)

    # Create the stack
    return StackedBlocks(layers)
Example No. 24
    def get_gradients(self, model, data, **kwargs):
        cost = self._cost(model, data, **kwargs)

        params = list(model.get_params())

        grads = T.grad(cost, params, disconnected_inputs='ignore',
                       consider_constant=[self.sampler.particles])

        gradients = OrderedDict(izip(params, grads))

        updates = OrderedDict()

        sampler_updates = self.sampler.updates()
        updates.update(sampler_updates)
        return gradients, updates
Example No. 25
    def get_gradients(self, model, data, **kwargs):
        cost = self._cost(model, data, **kwargs)

        params = list(model.get_params())

        grads = T.grad(cost, params, disconnected_inputs='ignore',
                       consider_constant=[self.sampler.particles])

        gradients = OrderedDict(izip(params, grads))

        updates = OrderedDict()

        sampler_updates = self.sampler.updates()
        updates.update(sampler_updates)
        return gradients, updates
Example No. 26
def test_adagrad():
    """
    Make sure that learning_rule.AdaGrad obtains the same parameter values as
    with a hand-crafted AdaGrad implementation, given a dummy model and
    learning rate scaler for each parameter.
    Reference:
    "Adaptive subgradient methods for online learning and
    stochastic optimization", Duchi J, Hazan E, Singer Y.
    """

    cost, model, dataset, sgd, state = prepare_adagrad_test()

    def adagrad_manual(model, state):
        rval = []
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            # begin adagrad
            pstate['sg2'] += param_val ** 2
            dx_t = - (scale * learning_rate
                      / np.sqrt(pstate['sg2'])
                      * param_val)
            rval += [param_val + dx_t]
        return rval

    manual = adagrad_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))

    manual = adagrad_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param in
               izip(manual, model.get_params()))
Example No. 27
def test_nesterov_momentum():
    """
    Make sure that learning_rule.Momentum obtains the same parameter values as
    with a hand-crafted sgd w/ momentum implementation, given a dummy model and
    learning rate scaler for each parameter.
    """

    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    momentum = 0.5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=Momentum(momentum, nesterov_momentum=True),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    manual = [param.get_value() for param in model.get_params()]
    vel = [-learning_rate * scale for scale in scales]
    updates = [
        -learning_rate * scale + v * momentum
        for scale, v in izip(scales, vel)
    ]
    manual = [param + update for param, update in izip(manual, updates)]

    sgd.train(dataset=dataset)

    assert all(
        np.allclose(manual_param, sgd_param.get_value())
        for manual_param, sgd_param in izip(manual, model.get_params()))

    vel = [
        -learning_rate * scale + i * momentum
        for scale, i in izip(scales, vel)
    ]
    updates = [
        -learning_rate * scale + v * momentum
        for scale, v in izip(scales, vel)
    ]
    manual = [param + update for param, update in izip(manual, updates)]

    sgd.train(dataset=dataset)

    assert all(
        np.allclose(manual_param, sgd_param.get_value())
        for manual_param, sgd_param in izip(manual, model.get_params()))
Example No. 28
def test_nesterov_momentum():
    """
    Make sure that learning_rule.Momentum obtains the same parameter values as
    with a hand-crafted sgd w/ momentum implementation, given a dummy model and
    learning rate scaler for each parameter.
    """

    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    momentum = 0.5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=Momentum(momentum, nesterov_momentum=True),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    manual = [param.get_value() for param in model.get_params()]
    vel = [-learning_rate * scale for scale in scales]
    updates = [-learning_rate * scale + v * momentum
               for scale, v in izip(scales, vel)]
    manual = [param + update for param, update in izip(manual, updates)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))

    vel = [-learning_rate * scale + i * momentum
           for scale, i in izip(scales, vel)]
    updates = [-learning_rate * scale + v * momentum
               for scale, v in izip(scales, vel)]
    manual = [param + update for param, update in izip(manual, updates)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))
Example No. 29
def test_adadelta():
    """
    Make sure that learning_rule.AdaDelta obtains the same parameter values as
    with a hand-crafted AdaDelta implementation, given a dummy model and
    learning rate scaler for each parameter.

    Reference:
    "AdaDelta: An Adaptive Learning Rate Method", Matthew D. Zeiler.
    """

    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1,), (9,), (8, 7), (6, 5, 4), (3, 2, 2, 2)]

    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    learning_rate = .001
    decay = 0.95

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=AdaDelta(decay),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    state = {}
    for param in model.get_params():
        param_shape = param.get_value().shape
        state[param] = {}
        state[param]['g2'] = np.zeros(param_shape)
        state[param]['dx2'] = np.zeros(param_shape)

    def adadelta_manual(model, state):
        inc = []
        rval = []
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            # begin adadelta
            pstate['g2'] = decay * pstate['g2'] + (1 - decay) * param_val ** 2
            rms_g_t = np.sqrt(pstate['g2'] + scale * learning_rate)
            rms_dx_tm1 = np.sqrt(pstate['dx2'] + scale * learning_rate)
            dx_t = - rms_dx_tm1 / rms_g_t * param_val
            pstate['dx2'] = decay * pstate['dx2'] + (1 - decay) * dx_t ** 2
            rval += [param_val + dx_t]
        return rval

    manual = adadelta_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param in
               izip(manual, model.get_params()))

    manual = adadelta_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param in
               izip(manual, model.get_params()))
Example No. 30
def feature_sign_search(dictionary, signals, sparsity, max_iter=1000, solution=None):
    """
    Solve L1-penalized quadratic minimization problems with
    feature-sign search.

    Employs the feature sign search algorithm of Lee et al (2006)
    to solve an L1-penalized quadratic optimization problem as a
    sequence of unconstrained quadratic minimization problems over
    subsets of the variables, with candidates for non-zero elements
    chosen by means of a gradient-based criterion.

    Parameters
    ----------
    dictionary : array_like, 2-dimensional
        The dictionary of basis functions from which to form the
        sparse linear combination. Each column constitutes a basis
        vector for the sparse code. There should be as many rows as
        input dimensions in the signal.
    signals : array_like, 1- or 2-dimensional
        The signal(s) to be decomposed as a sparse linear combination
        of the columns of the dictionary. If 2-dimensional, each
        different signal (training case) should be a row of this matrix.
    sparsity : float
        The coefficient on the L1 penalty term of the cost function.
    max_iter : int, optional
        The maximum number of iterations to run, per code vector, if
        the optimization has still not converged. Default is 1000.
    solution : ndarray, 1- or 2-dimensional, optional
        Pre-allocated vector or matrix used to store the solution(s).
        If provided, it should have the same rank as `signals`. If
        2-dimensional, it should have as many rows as `signals`.

    Returns
    -------
    solution : ndarray, 1- or 2-dimensional
        Matrix where each row contains the solution corresponding to a
        row of `signals`. If an array was passed in as the argument
        `solution`, it  will be updated in place and the same object
        will be returned.

    Notes
    -----
    It might seem more natural, from a linear-algebraic point of
    view, to think of both `signals` and `solution` as matrices with
    training examples contained as column vectors; then the overall
    cost function being minimized is

    .. math::
        (Y - AX)^2 + \gamma \sum_{i,j} |X_{ij}|

    with :math:`A` representing the dictionary, :math:`Y` being
    `signals.T` and :math:`X` being `solutions.T`. However, in order
    to maintain the convention of training examples being indexed
    along the first dimension in the case of 2-dimensional `signals`
    input (as well as provide faster computation via memory locality
    in the case of C-contiguous inputs), this function expects and
    returns input with training examples as rows of a matrix.

    References
    ----------
    .. [1] H. Lee, A. Battle, R. Raina, and A. Y. Ng. "Efficient
       sparse coding algorithms". Advances in Neural Information
       Processing Systems 19, 2007.
    """
    dictionary = np.asarray(dictionary)
    _feature_sign_checkargs(dictionary, signals, sparsity, max_iter, solution)
    # Make the code a bit simpler by always forcing the
    # 2-dimensional case.
    signals_ndim = signals.ndim
    signals = np.atleast_2d(signals)
    if solution is None:
        solution = np.zeros((signals.shape[0], dictionary.shape[1]), dtype=signals.dtype)
        orig_sol = None
    else:
        orig_sol = solution
        solution = np.atleast_2d(solution)
    # Solve each minimization in sequence.
    for row, (signal, sol) in enumerate(izip(signals, solution)):
        _, iters = _feature_sign_search_single(dictionary, signal, sparsity, max_iter, sol)
        if iters >= max_iter:
            log.warning(
                "maximum number of iterations reached when "
                "optimizing code for training case %d; solution "
                "may not be optimal" % iters
            )
    # Attempt to return the exact same object reference.
    if orig_sol is not None and orig_sol.ndim == 1:
        solution = orig_sol
    # Return a vector with the same rank as the input `signals`.
    elif orig_sol is None and signals_ndim == 1:
        solution = solution.squeeze()
    return solution
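
A minimal usage sketch with random data; the dictionary size, number of signals, and sparsity value are illustrative assumptions:

import numpy as np

rng = np.random.RandomState(0)
dictionary = rng.randn(64, 128)   # 64-dimensional signals, 128 basis vectors
signals = rng.randn(10, 64)       # 10 training cases, one per row

codes = feature_sign_search(dictionary, signals, sparsity=0.5)
assert codes.shape == (10, 128)   # one sparse code per row of `signals`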
Example No. 31
def pooling_matrix(groups, per_group, strides=None, dtype=None, sparse=None):
    """
    Construct a pooling matrix, optionally with overlapping pools
    arranged in a 1 or 2D topology.

    Parameters
    ----------
    groups : int or tuple
        The grid dimensions of a 1- or 2-dimensional pooling grid.
    per_group : int or tuple
        The grid dimensions of a single 1- or 2-dimensional feature
        pool. Must be same length as `groups`.
    strides : int or tuple, optional
        The stride of the pools along each dimension. A value of `None`
        is equivalent to setting it equal to `per_group`, i.e. no overlap.
    dtype : dtype object or str, optional
        The dtype of the resulting pooling matrix.
    sparse : str, optional
        If `None`, the function will return a dense matrix (a rank-2
        `numpy.ndarray`). Specifying 'csc' or 'csr' in this argument will
        cause the function to return a `scipy.sparse.csc_matrix` or a
        `scipy.sparse.csr_matrix`, instead.

    Returns
    -------
    pools : ndarray or sparse matrix
        Either a dense 2-dimensional NumPy array or one of
        `scipy.sparse.csc_matrix` or `scipy.sparse.csr_matrix`, depending
        on the value of the `sparse` argument. In any case, the shape is
        `(n_pools, n_filters)` and the value of `pools[i, j]` is 1 if
        feature `j` is in pool `i`, and 0 otherwise.
    """
    # Error-check arguments and fill in row_stride and col_stride
    # if either argument is absent.
    def _validate_shape(shape, param_name):
        try:
            shape = tuple(shape)
            [int(val) for val in shape]
        except (ValueError, TypeError):
            try:
                shape = (int(shape),)
            except TypeError:
                reraise_as(TypeError("%s must be int or int tuple"
                                     % param_name))
        return shape

    groups = _validate_shape(groups, 'groups')
    per_group = _validate_shape(per_group, 'per_group')
    if strides is not None:
        strides = _validate_shape(strides, 'strides')
    else:
        strides = per_group
    if len(groups) != len(per_group):
        raise ValueError('groups and per_group must have the same length')
    elif len(per_group) != len(strides):
        raise ValueError('per_group and strides must have the same length')
    if len(groups) > 2 or len(per_group) > 2:
        raise ValueError('only <= 2-dimensional pooling grids are supported')
    if not all(stride <= dim for stride, dim in izip(strides, per_group)):
        raise ValueError('strides must each be <= per_group dimensions')
    try:
        group_rows, group_cols = groups
        rows_per_group, cols_per_group = per_group
        row_stride, col_stride = strides
    except ValueError:
        group_rows, group_cols = groups[0], 1
        rows_per_group, cols_per_group = per_group[0], 1
        row_stride, col_stride = strides[0], 1
    if sparse is not None and sparse not in ('csc', 'csr'):
        raise ValueError("sparse must be one of (None, 'csr', 'csc')")
    # The total number of filters along either dimension is the
    # number of groups times the stride, plus whatever dangles
    # off the last filter (the added term is zero if there are no
    # overlapping pools).
    filter_rows = group_rows * row_stride + (rows_per_group - row_stride)
    filter_cols = group_cols * col_stride + (cols_per_group - col_stride)
    if dtype is None:
        dtype = theano.config.floatX
    # If the return type is dense we can treat it as a 4-tensor and
    # then reshape. If not we'll need some index math, but it happens
    shape = (group_rows, group_cols, filter_rows, filter_cols)
    matrix_shape = group_rows * group_cols, filter_rows * filter_cols
    if sparse is not None:
        # Use a dictionary-of-keys matrix at construction time,
        # since they are efficient for arbitrary assignment.
        # TODO: I think CSC/CSR are fast to construct if you know the total
        # number of elements, which should be easy to calculate.
        pools = scipy.sparse.dok_matrix(matrix_shape, dtype=dtype)
    else:
        pools = np.zeros(shape, dtype=dtype)
    for g_row in xrange(group_rows):
        for g_col in xrange(group_cols):
            # The start and end points of the contiguous block of 1's.
            row_start = row_stride * g_row
            row_end = row_start + rows_per_group
            col_start = col_stride * g_col
            col_end = col_start + cols_per_group
            if sparse is not None:
                for f_row in xrange(row_start, row_end):
                    matrix_cols = slice(f_row * shape[3] + col_start,
                                        f_row * shape[3] + col_end)
                    # The group to which this belongs.
                    matrix_row = g_row * shape[1] + g_col
                    pools[matrix_row, matrix_cols] = 1.
            else:
                # If the matrix is a dense 4-tensor then we can get
                # away with doing an entire pool in one assignment.
                pools[g_row, g_col, row_start:row_end, col_start:col_end] = 1
    if sparse is not None:
        # Call either .tocsr() or .tocsc()
        pools = getattr(pools, 'to' + sparse)()
    else:
        pools = pools.reshape(matrix_shape)
    return pools
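
A quick usage sketch with assumed sizes: a 2x2 grid of 3x3 pools with stride 2 covers a 5x5 filter grid, so the result is a (4, 25) indicator matrix with nine ones per pool:

pools = pooling_matrix(groups=(2, 2), per_group=(3, 3), strides=(2, 2))
assert pools.shape == (4, 25)
assert (pools.sum(axis=1) == 9).all()   # nine filters per pool

# Same contents as a scipy.sparse CSR matrix instead of a dense array.
sparse_pools = pooling_matrix((2, 2), (3, 3), strides=(2, 2), sparse='csr')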
Example No. 32
def feature_sign_search(dictionary,
                        signals,
                        sparsity,
                        max_iter=1000,
                        solution=None):
    """
    Solve L1-penalized quadratic minimization problems with
    feature-sign search.

    Employs the feature sign search algorithm of Lee et al (2006)
    to solve an L1-penalized quadratic optimization problem as a
    sequence of unconstrained quadratic minimization problems over
    subsets of the variables, with candidates for non-zero elements
    chosen by means of a gradient-based criterion.

    Parameters
    ----------
    dictionary : array_like, 2-dimensional
        The dictionary of basis functions from which to form the
        sparse linear combination. Each column constitutes a basis
        vector for the sparse code. There should be as many rows as
        input dimensions in the signal.
    signals : array_like, 1- or 2-dimensional
        The signal(s) to be decomposed as a sparse linear combination
        of the columns of the dictionary. If 2-dimensional, each
        different signal (training case) should be a row of this matrix.
    sparsity : float
        The coefficient on the L1 penalty term of the cost function.
    max_iter : int, optional
        The maximum number of iterations to run, per code vector, if
        the optimization has still not converged. Default is 1000.
    solution : ndarray, 1- or 2-dimensional, optional
        Pre-allocated vector or matrix used to store the solution(s).
        If provided, it should have the same rank as `signals`. If
        2-dimensional, it should have as many rows as `signals`.

    Returns
    -------
    solution : ndarray, 1- or 2-dimensional
        Matrix where each row contains the solution corresponding to a
        row of `signals`. If an array was passed in as the argument
        `solution`, it  will be updated in place and the same object
        will be returned.

    Notes
    -----
    It might seem more natural, from a linear-algebraic point of
    view, to think of both `signals` and `solution` as matrices with
    training examples contained as column vectors; then the overall
    cost function being minimized is

    .. math::
        (Y - AX)^2 + \gamma \sum_{i,j} |X_{ij}|

    with :math:`A` representing the dictionary, :math:`Y` being
    `signals.T` and :math:`X` being `solutions.T`. However, in order
    to maintain the convention of training examples being indexed
    along the first dimension in the case of 2-dimensional `signals`
    input (as well as provide faster computation via memory locality
    in the case of C-contiguous inputs), this function expects and
    returns input with training examples as rows of a matrix.

    References
    ----------
    .. [1] H. Lee, A. Battle, R. Raina, and A. Y. Ng. "Efficient
       sparse coding algorithms". Advances in Neural Information
       Processing Systems 19, 2007.
    """
    dictionary = np.asarray(dictionary)
    _feature_sign_checkargs(dictionary, signals, sparsity, max_iter, solution)
    # Make the code a bit simpler by always forcing the
    # 2-dimensional case.
    signals_ndim = signals.ndim
    signals = np.atleast_2d(signals)
    if solution is None:
        solution = np.zeros((signals.shape[0], dictionary.shape[1]),
                            dtype=signals.dtype)
        orig_sol = None
    else:
        orig_sol = solution
        solution = np.atleast_2d(solution)
    # Solve each minimization in sequence.
    for row, (signal, sol) in enumerate(izip(signals, solution)):
        _, iters = _feature_sign_search_single(dictionary, signal, sparsity,
                                               max_iter, sol)
        if iters >= max_iter:
            log.warning("maximum number of iterations reached when "
                        "optimizing code for training case %d; solution "
                        "may not be optimal" % iters)
    # Attempt to return the exact same object reference.
    if orig_sol is not None and orig_sol.ndim == 1:
        solution = orig_sol
    # Return a vector with the same rank as the input `signals`.
    elif orig_sol is None and signals_ndim == 1:
        solution = solution.squeeze()
    return solution
Example No. 33
def safe_izip(*args):
    """Like izip, but ensures arguments are of same length"""
    assert all([len(arg) == len(args[0]) for arg in args])
    return izip(*args)
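
Usage matches izip, except that mismatched lengths fail immediately (toy values below):

# Matched lengths behave exactly like izip.
pairs = list(safe_izip([1, 2, 3], ['a', 'b', 'c']))

# Mismatched lengths fail up front instead of silently truncating.
try:
    safe_izip([1, 2, 3], [4, 5])
except AssertionError:
    print "length mismatch caught"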
Example No. 34
def pooling_matrix(groups, per_group, strides=None, dtype=None, sparse=None):
    """
    Construct a pooling matrix, optionally with overlapping pools
    arranged in a 1 or 2D topology.

    Parameters
    ----------
    groups : int or tuple
        The grid dimensions of a 1- or 2-dimensional pooling grid.
    per_group : int or tuple
        The grid dimensions of a single 1- or 2-dimensional feature
        pool. Must be same length as `groups`.
    strides : int or tuple, optional
        The stride of the pools along each dimension. A value of `None`
        is equivalent to setting it equal to `per_group`, i.e. no overlap.
    dtype : dtype object or str, optional
        The dtype of the resulting pooling matrix.
    sparse : str, optional
        If `None`, the function will return a dense matrix (a rank-2
        `numpy.ndarray`). Specifying 'csc' or 'csr' in this argument will
        cause the function to return a `scipy.sparse.csc_matrix` or a
        `scipy.sparse.csr_matrix`, instead.

    Returns
    -------
    pools : ndarray or sparse matrix
        Either a dense 2-dimensional NumPy array or one of
        `scipy.sparse.csc_matrix` or `scipy.sparse.csr_matrix`, depending
        on the value of the `sparse` argument. In any case, the shape is
        `(n_pools, n_filters)` and the value of `pools[i, j]` is 1 if
        feature `j` is in pool `i`, and 0 otherwise.
    """

    # Error-check arguments and fill in row_stride and col_stride
    # if either argument is absent.
    def _validate_shape(shape, param_name):
        try:
            shape = tuple(shape)
            [int(val) for val in shape]
        except (ValueError, TypeError):
            try:
                shape = (int(shape), )
            except TypeError:
                reraise_as(
                    TypeError("%s must be int or int tuple" % param_name))
        return shape

    groups = _validate_shape(groups, 'groups')
    per_group = _validate_shape(per_group, 'per_group')
    if strides is not None:
        strides = _validate_shape(strides, 'strides')
    else:
        strides = per_group
    if len(groups) != len(per_group):
        raise ValueError('groups and per_group must have the same length')
    elif len(per_group) != len(strides):
        raise ValueError('per_group and strides must have the same length')
    if len(groups) > 2 or len(per_group) > 2:
        raise ValueError('only <= 2-dimensional pooling grids are supported')
    if not all(stride <= dim for stride, dim in izip(strides, per_group)):
        raise ValueError('strides must each be <= per_group dimensions')
    try:
        group_rows, group_cols = groups
        rows_per_group, cols_per_group = per_group
        row_stride, col_stride = strides
    except ValueError:
        group_rows, group_cols = groups[0], 1
        rows_per_group, cols_per_group = per_group[0], 1
        row_stride, col_stride = strides[0], 1
    if sparse is not None and sparse not in ('csc', 'csr'):
        raise ValueError("sparse must be one of (None, 'csr', 'csc')")
    # The total number of filters along either dimension is the
    # number of groups times the stride, plus whatever dangles
    # off the last filter (the added term is zero if there are no
    # overlapping pools).
    filter_rows = group_rows * row_stride + (rows_per_group - row_stride)
    filter_cols = group_cols * col_stride + (cols_per_group - col_stride)
    if dtype is None:
        dtype = theano.config.floatX
    # If the return type is dense we can treat it as a 4-tensor and
    # then reshape. If not we'll need some index math, but it happens
    shape = (group_rows, group_cols, filter_rows, filter_cols)
    matrix_shape = group_rows * group_cols, filter_rows * filter_cols
    if sparse is not None:
        # Use a dictionary-of-keys matrix at construction time,
        # since they are efficient for arbitrary assignment.
        # TODO: I think CSC/CSR are fast to construct if you know the total
        # number of elements, which should be easy to calculate.
        pools = scipy.sparse.dok_matrix(matrix_shape, dtype=dtype)
    else:
        pools = np.zeros(shape, dtype=dtype)
    for g_row in xrange(group_rows):
        for g_col in xrange(group_cols):
            # The start and end points of the contiguous block of 1's.
            row_start = row_stride * g_row
            row_end = row_start + rows_per_group
            col_start = col_stride * g_col
            col_end = col_start + cols_per_group
            if sparse is not None:
                for f_row in xrange(row_start, row_end):
                    matrix_cols = slice(f_row * shape[3] + col_start,
                                        f_row * shape[3] + col_end)
                    # The group to which this belongs.
                    matrix_row = g_row * shape[1] + g_col
                    pools[matrix_row, matrix_cols] = 1.
            else:
                # If the matrix is a dense 4-tensor then we can get
                # away with doing an entire pool in one assignment.
                pools[g_row, g_col, row_start:row_end, col_start:col_end] = 1
    if sparse is not None:
        # Call either .tocsr() or .tocsc()
        pools = getattr(pools, 'to' + sparse)()
    else:
        pools = pools.reshape(matrix_shape)
    return pools
Example No. 35
def safe_izip(*args):
    """Like izip, but ensures arguments are of same length"""
    assert all([len(arg) == len(args[0]) for arg in args])
    return izip(*args)
Example No. 36
    def get_gradients(self, model, data, ** kwargs):


        try:
            cost = self.expr(model=model, data=data, **kwargs)
        except TypeError:
            # If anybody knows how to add type(self) to the exception message
            # but still preserve the stack trace, please do so
            # The current code does neither
            message = "Error while calling " + str(type(self)) + ".expr"
            reraise_as(TypeError(message))

        if cost is None:
            raise NotImplementedError(str(type(self)) +
                                      " represents an intractable cost and "
                                      "does not provide a gradient "
                                      "approximation scheme.")

        n_leafnodes = len(cost)
        n_model_layer = len(model.layers)
        params = []
        for i in xrange(n_leafnodes):
            params_branch = []
            layer_params = model.layers[0].get_params()
            for param in layer_params:
                params_branch.append(param)
            params.append(params_branch)

        for i in xrange(1, n_model_layer - constants.NUM_REGLAYER):
            composite_layer = model.layers[i]
            n_curnodes = len(composite_layer.layers)
            step = int(n_leafnodes / n_curnodes)
            for k in xrange(n_leafnodes):
                layer_params = composite_layer.layers[k / step].get_params()
                for param in layer_params:
                    params[k].append(param)

        for i in xrange(-constants.NUM_REGLAYER, 0):
            composite_layer = model.layers[i]
            for k in xrange(n_leafnodes):
                layer_params = composite_layer.layers[k].get_params()
                for param in layer_params:
                    params[k].append(param)

        grads = []
        for i in xrange(n_leafnodes):
            grads.append(theano.tensor.grad(cost[i], params[i],
                                            disconnected_inputs='ignore'))


        flat_grads = []
        flat_params = []
        for j in xrange(n_model_layer - constants.NUM_REGLAYER):
            weight = params[0][2 * j]
            bias = params[0][2 * j + 1]

            tmp_grad_weight = grads[0][2 * j]
            tmp_grad_bias = grads[0][2 * j + 1]

            for i in xrange(1, n_leafnodes):
                if weight.name == params[i][2 * j].name:
                    weight += params[i][2 * j]
                    weight.name = params[i][2 * j].name

                    bias += params[i][2 * j + 1]
                    bias.name = params[i][2 * j + 1].name

                    tmp_grad_weight += grads[i][2 * j]
                    tmp_grad_bias += grads[i][2 * j + 1]
                else:
                    flat_params.append(weight)
                    flat_params.append(bias)

                    flat_grads.append(tmp_grad_weight)
                    flat_grads.append(tmp_grad_bias)

                    weight = params[i][2 * j]
                    bias = params[i][2 * j + 1]

                    tmp_grad_weight = grads[i][2 * j]
                    tmp_grad_bias = grads[i][2 * j + 1]

            flat_params.append(weight)
            flat_params.append(bias)
            flat_grads.append(tmp_grad_weight)
            flat_grads.append(tmp_grad_bias)


        for j in xrange(-constants.NUM_REGLAYER, 0):
            for i in xrange(n_leafnodes):
                flat_params.append(params[i][2 * j])
                flat_params.append(params[i][2 * j + 1])
                flat_grads.append(grads[i][2 * j])
                flat_grads.append(grads[i][2 * j + 1])

        params_model = model.get_params()
        if len(flat_params) != len(params_model):
            raise ValueError("the length of flat_params of the tree CNN "
                             "does not match the list of model params")
        else:
            for flat_i, p_i in zip(flat_params, params_model):
                flat_i.name = p_i.name
        gradients = OrderedDict(izip(params_model, flat_grads))

        updates = OrderedDict()

        return gradients, updates