Example #1
    def __init__(self, decay=0.9, max_scaling=1e5):
        assert 0. <= decay < 1.
        assert max_scaling > 0
        self.decay = sharedX(decay, 'decay')
        self.epsilon = 1. / max_scaling
        self.mean_square_grads = OrderedDict()
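Here `epsilon = 1 / max_scaling` later serves as a floor on the RMS denominator in the `get_updates` method shown in Example #2 below, which caps how strongly a tiny gradient can be amplified. A minimal standalone numpy sketch of that clamp (all numbers are made up for illustration):

import numpy as np

decay, max_scaling = 0.9, 1e5
epsilon = 1. / max_scaling

grad = np.array([1e-2, 1e-9])              # one ordinary, one tiny gradient
mean_square_grad = np.zeros_like(grad)

# One step of the running average, then the clamped RMS denominator.
mean_square_grad = decay * mean_square_grad + (1 - decay) * grad ** 2
rms = np.maximum(np.sqrt(mean_square_grad), epsilon)

print(grad / rms)   # the tiny gradient is amplified by at most max_scaling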
Example #2
    def get_updates(self, learning_rate, grads, lr_scalers=None):
        """
        Provides the symbolic (theano) description of the updates needed to
        perform this learning rule. See Notes for side-effects.

        Parameters
        ----------
        learning_rate : float
            Learning rate coefficient.
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients.
        lr_scalers : dict
            A dictionary mapping from the model's parameters to a learning
            rate multiplier.

        Returns
        -------
        updates : OrderedDict
            A dictionary mapping from the old model parameters, to their new
            values after a single iteration of the learning rule.

        Notes
        -----
        This method has the side effect of storing the moving average
        of the square gradient in `self.mean_square_grads`. This is
        necessary in order for the monitoring channels to be able
        to track the value of these moving averages.
        Therefore, this method should only get called once for each
        instance of RMSProp.
        """

        if lr_scalers is None:
            lr_scalers = {}

        updates = OrderedDict()
        for param in grads:

            # mean_squared_grad := E[g^2]_{t-1}
            mean_square_grad = sharedX(param.get_value() * 0.)

            if param.name is None:
                raise ValueError("Model parameters must be named.")
            mean_square_grad.name = 'mean_square_grad_' + param.name

            if param.name in self.mean_square_grads:
                warnings.warn("Calling get_updates more than once on the "
                              "gradients of `%s` may make monitored values "
                              "incorrect." % param.name)
            # Store variable in self.mean_square_grads for monitoring.
            self.mean_square_grads[param.name] = mean_square_grad

            # Accumulate gradient
            new_mean_squared_grad = (self.decay * mean_square_grad +
                                     (1 - self.decay) * T.sqr(grads[param]))

            # Compute update
            scaled_lr = lr_scalers.get(param, 1.) * learning_rate
            rms_grad_t = T.sqrt(new_mean_squared_grad)
            rms_grad_t = T.maximum(rms_grad_t, self.epsilon)
            delta_x_t = -scaled_lr * grads[param] / rms_grad_t

            # Apply update
            updates[mean_square_grad] = new_mean_squared_grad
            updates[param] = param + delta_x_t

        return updates
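A minimal usage sketch, assuming a standard pylearn2/Theano installation where RMSProp lives in pylearn2.training_algorithms.learning_rule; the parameter `w`, the toy quadratic loss, and all constants are made up for illustration:

import numpy as np
import theano
import theano.tensor as T
from pylearn2.utils import sharedX
from pylearn2.training_algorithms.learning_rule import RMSProp

# Toy quadratic loss on a single named parameter (names and values made up).
w = sharedX(np.zeros(3), name='w')
x = T.vector('x')
loss = T.sum((T.dot(w, x) - 1.) ** 2)
grads = {w: T.grad(loss, w)}

rule = RMSProp(decay=0.9, max_scaling=1e5)
updates = rule.get_updates(learning_rate=0.01, grads=grads, lr_scalers={})

# Each call to `train` applies one RMSProp step to `w`.
train = theano.function([x], loss, updates=updates)
train(np.ones(3, dtype=theano.config.floatX))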
Example #3
def setup_detector_layer_c01b(layer, input_space, rng):
    """
    .. todo::

        WRITEME properly

    Takes steps to set up an object for use as a convolutional layer. This
    function sets up only the detector layer.

    Does the following:

    * raises a RuntimeError if cuda is not available
    * sets layer.input_space to input_space
    * sets up addition of dummy channels for compatibility with cuda-convnet:

      - layer.dummy_channels: # of dummy channels that need to be added
        (You might want to check this and raise an Exception if it's not 0)
      - layer.dummy_space: The Conv2DSpace representing the input with dummy
        channels added

    * sets layer.detector_space to the space for the detector layer
    * sets layer.transformer to be a Conv2D instance
    * sets layer.b to the right value

    Parameters
    ----------
    layer : object
        Any python object that allows the modifications described above and
        has the following attributes:

          * pad : int describing amount of zero padding to add
          * kernel_shape : 2-element tuple or list describing spatial shape of
            kernel
          * fix_kernel_shape : bool, if true, will shrink the kernel shape to
            make it feasible, as needed (useful for hyperparameter searchers)
          * detector_channels : The number of channels in the detector layer
          * init_bias : numeric constant added to a tensor of zeros to
            initialize the bias
          * tied_b : If true, biases are shared across all spatial locations
    input_space : Conv2DSpace
        A Conv2DSpace to be used as input to the layer
    rng : numpy.random.RandomState
        A numpy RandomState or equivalent
    """

    # Use "self" to refer to layer from now on, so we can pretend we're
    # just running in the set_input_space method of the layer
    self = layer

    # Make sure cuda is available
    check_cuda(str(type(self)))

    # Validate input
    if not isinstance(input_space, Conv2DSpace):
        raise TypeError("The input to a convolutional layer should be a "
                        "Conv2DSpace, but layer " + self.layer_name + " got " +
                        str(type(input_space)))

    if not hasattr(self, 'detector_channels'):
        raise ValueError("layer argument must have a 'detector_channels' "
                         "attribute specifying how many channels to put in "
                         "the convolution kernel stack.")

    # Store the input space
    self.input_space = input_space

    # Make sure number of channels is supported by cuda-convnet
    # (multiple of 4 or <= 3)
    # If not supported, pad the input with dummy channels
    ch = self.input_space.num_channels
    rem = ch % 4
    if ch > 3 and rem != 0:
        self.dummy_channels = 4 - rem
    else:
        self.dummy_channels = 0
    self.dummy_space = Conv2DSpace(
        shape=input_space.shape,
        channels=input_space.num_channels + self.dummy_channels,
        axes=('c', 0, 1, 'b')
    )

    if hasattr(self, 'kernel_stride'):
        kernel_stride = self.kernel_stride
    else:
        kernel_stride = [1, 1]

    output_shape = \
        [int(np.ceil((i_sh + 2. * self.pad - k_sh) / float(k_st))) + 1
         for i_sh, k_sh, k_st in zip(self.input_space.shape,
                                     self.kernel_shape, kernel_stride)]

    def handle_kernel_shape(idx):
        if self.kernel_shape[idx] < 1:
            raise ValueError("kernel must have strictly positive size on all "
                             "axes but has shape: " + str(self.kernel_shape))
        if output_shape[idx] <= 0:
            if self.fix_kernel_shape:
                self.kernel_shape[idx] = \
                    self.input_space.shape[idx] + 2 * self.pad
                assert self.kernel_shape[idx] != 0
                output_shape[idx] = 1
                warnings.warn("Had to change the kernel shape to make "
                              "network feasible")
            else:
                raise ValueError("kernel too big for input "
                                 "(even with zero padding)")

    for idx in [0, 1]:
        handle_kernel_shape(idx)

    if self.detector_channels < 16:
        raise ValueError("Cuda-convnet requires the detector layer to have "
                         "at least 16 channels.")

    self.detector_space = Conv2DSpace(shape=output_shape,
                                      num_channels=self.detector_channels,
                                      axes=('c', 0, 1, 'b'))

    if hasattr(self, 'partial_sum'):
        partial_sum = self.partial_sum
    else:
        partial_sum = 1

    if hasattr(self, 'sparse_init') and self.sparse_init is not None:
        self.transformer = \
            checked_call(make_sparse_random_conv2D,
                         OrderedDict([('num_nonzero', self.sparse_init),
                                      ('input_space', self.input_space),
                                      ('output_space', self.detector_space),
                                      ('kernel_shape', self.kernel_shape),
                                      ('pad', self.pad),
                                      ('partial_sum', partial_sum),
                                      ('kernel_stride', kernel_stride),
                                      ('rng', rng)]))
    else:
        self.transformer = make_random_conv2D(
            irange=self.irange,
            input_axes=self.input_space.axes,
            output_axes=self.detector_space.axes,
            input_channels=self.dummy_space.num_channels,
            output_channels=self.detector_space.num_channels,
            kernel_shape=self.kernel_shape,
            pad=self.pad,
            partial_sum=partial_sum,
            kernel_stride=kernel_stride,
            rng=rng
        )

    W, = self.transformer.get_params()
    W.name = self.layer_name + '_W'

    if self.tied_b:
        self.b = sharedX(np.zeros(self.detector_space.num_channels) +
                         self.init_bias)
    else:
        self.b = sharedX(self.detector_space.get_origin() + self.init_bias)
    self.b.name = self.layer_name + '_b'

    logger.info('Input shape: {0}'.format(self.input_space.shape))
    logger.info('Detector space: {0}'.format(self.detector_space.shape))
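For intuition about the dummy-channel padding and the output-shape formula above, here is a small standalone numpy check; the layer settings are hypothetical, chosen only to make the numbers concrete:

import numpy as np

input_shape = (32, 32)     # spatial shape of the input Conv2DSpace
num_channels = 6           # neither <= 3 nor a multiple of 4
kernel_shape = (5, 5)
pad = 2
kernel_stride = (1, 1)

# cuda-convnet needs <= 3 channels or a multiple of 4, so 6 -> 8 here.
rem = num_channels % 4
dummy_channels = (4 - rem) if (num_channels > 3 and rem != 0) else 0

# Same output-shape formula the function above uses.
output_shape = [int(np.ceil((i_sh + 2. * pad - k_sh) / float(k_st))) + 1
                for i_sh, k_sh, k_st in zip(input_shape, kernel_shape,
                                            kernel_stride)]

print(dummy_channels)      # 2
print(output_shape)        # [32, 32]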
Example #4
def estimate_likelihood(W_list,
                        b_list,
                        trainset,
                        testset,
                        free_energy_fn=None,
                        batch_size=100,
                        large_ais=False,
                        log_z=None,
                        pos_mf_steps=50,
                        pos_sample_steps=0):
    """
    Compute estimate of log-partition function and likelihood of trainset and
    testset

    Parameters
    ----------
    W_list : array-like object of theano shared variables
        Weights of the DBM
    b_list : array-like object of theano shared variables
        Biases of the DBM
    trainset : pylearn2.datasets.dataset.Dataset
        Training set
    testset : pylearn2.datasets.dataset.Dataset
        Test set
    free_energy_fn : theano.function
        Function which, given temperature beta_k, computes the free energy
        of the samples stored in model.samples. This function should return
        a symbolic vector.
    batch_size : integer
        Size of a batch of examples
    large_ais : boolean
        If True, will use 3e5 chains, instead of 3e4
    log_z : scalar, optional
        Log-partition function, if precomputed.
    pos_mf_steps : int
        Number of fixed-point (mean-field) iterations for approximate
        inference.
    pos_sample_steps : int
        Number of sampling steps used for approximate inference instead of
        mean-field; when both pos_mf_steps > 0 and pos_sample_steps > 0,
        pos_mf_steps takes priority.

    Returns
    -------
    nll : scalar
        Negative log-likelihood of data.X under `model`.
    logz : scalar
        Estimate of log-partition function of `model`.
    """

    warnings.warn("This is garanteed to work only for DBMs with a " +
                  "BinaryVector visible layer and BinaryVectorMaxPool " +
                  "hidden layers with pool sizes of 1.")

    # Add a dummy placeholder for visible layer's weights in W_list
    W_list = [None] + W_list

    # Depth of the DBM
    depth = len(b_list)

    # Initialize samples
    psamples = []
    nsamples = []
    for i, b in enumerate(b_list):
        psamples += [
            utils.sharedX(rng.rand(batch_size,
                                   b.get_value().shape[0]),
                          name='psamples%i' % i)
        ]
        nsamples += [
            utils.sharedX(rng.rand(batch_size,
                                   b.get_value().shape[0]),
                          name='nsamples%i' % i)
        ]
    psamples[0] = T.matrix('psamples0')

    ##########################
    ## BUILD THEANO FUNCTIONS
    ##########################
    beta = T.scalar()

    # For an even number of layers, we marginalize the odd layers
    # (and vice-versa)
    marginalize_odd = (depth % 2) == 0

    # Build function to retrieve energy.
    E = -T.dot(nsamples[0], b_list[0]) * beta
    for i in xrange(1, depth):
        E -= T.sum(T.dot(nsamples[i - 1], W_list[i] * beta) * nsamples[i],
                   axis=1)
        E -= T.dot(nsamples[i], b_list[i] * beta)
    energy_fn = theano.function([beta], E)

    # Build inference function.
    assert (pos_mf_steps or pos_sample_steps)
    pos_steps = pos_mf_steps if pos_mf_steps else pos_sample_steps
    new_psamples = _e_step(psamples, W_list, b_list, n_steps=pos_steps)
    ups = OrderedDict()
    for psample, new_psample in zip(psamples[1:], new_psamples[1:]):
        ups[psample] = new_psample
    temp = numpy.asarray(trainset.X, dtype=floatX)
    mean_train = numpy.mean(temp, axis=0)
    inference_fn = theano.function(inputs=[psamples[0]],
                                   outputs=[],
                                   updates=ups)

    # Configure baserate bias for (h0 if `marginalize_odd` else h1)
    inference_fn(numpy.tile(mean_train, (batch_size, 1)))
    numpy_psamples = [mean_train[None, :]] + \
                     [psample.get_value() for psample in psamples[1:]]
    mean_pos = numpy.minimum(numpy_psamples[not marginalize_odd], 1 - 1e-5)
    mean_pos = numpy.maximum(mean_pos, 1e-5)
    pa_bias = -numpy.log(1. / mean_pos[0] - 1.)

    # Build Theano function to sample from interpolating distributions.
    updates = OrderedDict()
    new_nsamples = neg_sampling(W_list,
                                b_list,
                                nsamples,
                                beta=beta,
                                pa_bias=pa_bias,
                                marginalize_odd=marginalize_odd,
                                theano_rng=theano_rng)
    for (nsample, new_nsample) in zip(nsamples, new_nsamples):
        updates[nsample] = new_nsample
    sample_fn = theano.function([beta], [],
                                updates=updates,
                                name='sample_func')

    # Build function to compute free-energy of p_k(h1).
    fe_bp_h1 = free_energy_at_beta(W_list,
                                   b_list,
                                   nsamples,
                                   beta,
                                   pa_bias,
                                   marginalize_odd=marginalize_odd)
    free_energy_fn = theano.function([beta], fe_bp_h1)

    ###########
    ## RUN AIS
    ###########

    # Generate exact sample for the base model.
    for i, nsample_i in enumerate(nsamples):
        bias = pa_bias if i == 1 else b_list[i].get_value()
        hi_mean_vec = 1. / (1. + numpy.exp(-bias))
        hi_mean = numpy.tile(hi_mean_vec, (batch_size, 1))
        r = rng.random_sample(hi_mean.shape)
        hi_sample = numpy.array(hi_mean > r, dtype=floatX)
        nsample_i.set_value(hi_sample)

    # Default configuration for interpolating distributions
    if large_ais:
        betas = numpy.cast[floatX](numpy.hstack((
            numpy.linspace(0, 0.5, int(1e5) + 1)[:-1],
            numpy.linspace(0.5, 0.9, int(1e5) + 1)[:-1],
            numpy.linspace(0.9, 1.0, int(1e5)))))
    else:
        betas = numpy.cast[floatX](numpy.hstack((
            numpy.linspace(0, 0.5, int(1e4) + 1)[:-1],
            numpy.linspace(0.5, 0.9, int(1e4) + 1)[:-1],
            numpy.linspace(0.9, 1.0, int(1e4)))))

    if log_z is None:
        log_ais_w = compute_log_ais_weights(batch_size, free_energy_fn,
                                            sample_fn, betas)
        dlogz, var_dlogz = estimate_from_weights(log_ais_w)
        log_za = compute_log_za(b_list, pa_bias, marginalize_odd)
        log_z = log_za + dlogz
        logging.info('log_z = %f' % log_z)
        logging.info('log_za = %f' % log_za)
        logging.info('dlogz = %f' % dlogz)
        logging.info('var_dlogz = %f' % var_dlogz)

    train_ll = compute_likelihood_given_logz(nsamples, psamples, batch_size,
                                             energy_fn, inference_fn, log_z,
                                             trainset.X)
    logging.info('Training likelihood = %f' % train_ll)
    test_ll = compute_likelihood_given_logz(nsamples, psamples, batch_size,
                                            energy_fn, inference_fn, log_z,
                                            testset.X)
    logging.info('Test likelihood = %f' % test_ll)

    return (train_ll, test_ll, log_z)
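The AIS bookkeeping above follows a common pattern: a schedule of betas, ratios of unnormalized probabilities at consecutive betas accumulated per chain, and a log-mean-exp over chains added to the exact log-partition function of the base model. The sketch below shows that pattern in isolation on a toy model of independent binary units, where the true log Z is known in closed form; it does not use pylearn2 and every name in it is illustrative only:

import numpy as np

rng = np.random.RandomState(0)
n_units, n_chains = 10, 5000
b0 = np.zeros(n_units)                    # base-rate biases (beta = 0)
b1 = rng.randn(n_units)                   # target biases (beta = 1)

def log_unnorm(h, beta):
    # log unnormalized probability at inverse temperature beta
    return h.dot((1. - beta) * b0 + beta * b1)

def sample(beta):
    # exact sample from the interpolating distribution (units independent)
    p = 1. / (1. + np.exp(-((1. - beta) * b0 + beta * b1)))
    return (rng.random_sample((n_chains, n_units)) < p).astype(float)

betas = np.linspace(0., 1., 1001)
log_w = np.zeros(n_chains)
h = sample(betas[0])
for beta_prev, beta_next in zip(betas[:-1], betas[1:]):
    log_w += log_unnorm(h, beta_next) - log_unnorm(h, beta_prev)
    h = sample(beta_next)                 # exact transition to the next rung

log_za = n_units * np.log(2.)             # exact log Z of the base model
dlogz = np.log(np.mean(np.exp(log_w - log_w.max()))) + log_w.max()
print(log_za + dlogz)                     # AIS estimate of log Z
print(np.sum(np.log1p(np.exp(b1))))       # exact log Z, for comparison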