Python OrderedDictの例、pylearn2.compat.OrderedDict Pythonの例

コード例 #1

0

ファイルを表示

    def get_gradients(self, model, data, **kwargs):
        """
        Return the gradients of this cost with respect to the model
        parameters, plus the updates required to compute them.

        The gradients need not be the ones theano.tensor.grad would give:
        a subclass may return approximate (for instance sampling-based)
        or even intentionally incorrect gradients.

        Parameters
        ----------
        model : a pylearn2 Model instance
        data : a batch in cost.get_data_specs() form
        kwargs : dict
            Optional extra arguments, forwarded unchanged to `self.expr`.

        Returns
        -------
        gradients : OrderedDict
            Maps each of the model's parameters to its gradient. This
            implementation applies T.grad to the value returned by
            `self.expr`; subclasses may return something else.
        updates : OrderedDict
            Maps shared variables to updates that must be applied every
            time the gradients are computed (useful for sampling-based
            approximations). Always empty in this implementation.
            Parameters should never appear in it: that would mean
            computing a gradient changes the parameter and so makes the
            gradient outdated.

        Raises
        ------
        NotImplementedError
            If `self.expr` returns None.
        """
        try:
            cost = self.expr(model=model, data=data, **kwargs)
        except TypeError:
            # Re-raise with a message naming the concrete cost class.
            reraise_as(TypeError(
                "Error while calling " + str(type(self)) + ".expr"))

        if cost is None:
            raise NotImplementedError(
                str(type(self)) + " represents an intractable cost and "
                "does not provide a gradient approximation scheme.")

        parameters = list(model.get_params())
        param_grads = T.grad(cost, parameters, disconnected_inputs='ignore')

        return OrderedDict(izip(parameters, param_grads)), OrderedDict()

コード例 #2

0

ファイルを表示

ファイル: cost.py プロジェクト: nitbix/pylearn2

    def get_gradients(self, model, data, **kwargs):
        """
        Combine the gradients and updates of all the summed costs.

        Each sub-cost's gradient is multiplied by the matching entry of
        `self.coeffs`, and the scaled gradients are added up per parameter.

        Parameters
        ----------
        model : object
            Passed to every sub-cost; its `get_params()` defines which
            variables may appear in the gradient dictionaries.
        data : object
            Batch that `mapping.nest` splits into one piece per sub-cost.
        kwargs : dict
            Forwarded unchanged to each sub-cost's `get_gradients`.

        Returns
        -------
        grads : OrderedDict
            Maps each parameter to the weighted sum of its gradients.
        updates : OrderedDict
            Union of the update dictionaries returned by the sub-costs.

        Raises
        ------
        ValueError
            If a sub-cost returns a gradient for a variable that is not in
            `model.get_params()`.
        """
        _, mapping = self.get_composite_specs_and_mapping(model)
        nested_data = mapping.nest(data)
        indiv_results = [cost.get_gradients(model, cost_data, **kwargs)
                         for cost, cost_data
                         in safe_zip(self.costs, nested_data)]

        grads = OrderedDict()
        updates = OrderedDict()
        params = model.get_params()

        for coeff, (g, u) in zip(self.coeffs, indiv_results):
            # Validate the whole dictionary before accumulating anything.
            for param in g:
                if param not in params:
                    raise ValueError("A shared variable (" + str(param) +
                                     ") that is not a parameter appeared "
                                     "in a cost gradient dictionary.")
            for param in g:
                assert param.ndim == g[param].ndim
                v = coeff * g[param]
                if param in grads:
                    grads[param] = grads[param] + v
                else:
                    grads[param] = v
                assert grads[param].ndim == param.ndim
            # Two sub-costs must not update the same state, and a
            # parameter must never be updated by a gradient computation.
            assert not any(state in updates for state in u)
            assert not any(state in params for state in u)
            updates.update(u)

        return grads, updates

コード例 #3

0

ファイルを表示

    def get_monitoring_channels(self, model, data, **kwargs):
        """
        Collect the monitoring channels of every sub-cost.

        After validating `data`, each sub-cost contributes its own
        channels, followed by a 'term_<index>[_<name>]' channel holding
        the value of its `expr` whenever that value is not None.

        Parameters
        ----------
        model : object
            Passed through to every sub-cost.
        data : object
            Batch that `mapping.nest` splits into one piece per sub-cost.
        kwargs : dict
            Forwarded unchanged to the sub-costs.

        Returns
        -------
        OrderedDict
            Maps channel names to channel values.
        """
        self.get_data_specs(model)[0].validate(data)
        _, mapping = self.get_composite_specs_and_mapping(model)
        nested_data = mapping.nest(data)

        collected = OrderedDict()
        for idx, sub_cost in enumerate(self.costs):
            sub_data = nested_data[idx]
            try:
                collected.update(sub_cost.get_monitoring_channels(
                    model, sub_data, **kwargs))
            except TypeError:
                template = ('SumOfCosts.get_monitoring_channels '
                            'encountered TypeError while calling {0}'
                            '.get_monitoring_channels')
                reraise_as(Exception(template.format(type(sub_cost))))

            term = sub_cost.expr(model, sub_data, **kwargs)
            if term is None:
                continue
            # A named expression gets its name appended to the channel key.
            term_name = getattr(term, 'name', None)
            suffix = '' if term_name is None else '_' + term_name
            collected['term_' + str(idx) + suffix] = term

        return collected

コード例 #4

0

ファイルを表示

ファイル: update_norm_monitor.py プロジェクト: Neuroglycerin/neukrill-net-tools

 def __init__(self, base_learning_rule, decay=0.9):
     """
     Wrap `base_learning_rule`, remembering `decay` and starting with an
     empty `mean_updates` dictionary.
     """
     self.base = base_learning_rule
     # Hack: expose the wrapped rule's momentum (when it has one) so that
     # MomentumAdjustor can reach it through this wrapper.
     if hasattr(base_learning_rule, 'momentum'):
         self.momentum = base_learning_rule.momentum
     self.decay = decay
     self.mean_updates = OrderedDict()

コード例 #5

0

ファイルを表示

ファイル: cost.py プロジェクト: nitbix/pylearn2

    def get_monitoring_channels(self, model, data, **kwargs):
        """
        Gather monitoring channels from each of the summed costs.

        `data` is validated first. Every sub-cost then adds its own
        channels, and a 'term_<index>[_<name>]' channel is added for the
        value of its `expr` unless that value is None.

        Parameters
        ----------
        model : object
            Handed to each sub-cost.
        data : object
            Batch; `mapping.nest` yields one piece per sub-cost.
        kwargs : dict
            Passed on unchanged to the sub-costs.

        Returns
        -------
        OrderedDict
            Channel names mapped to channel values.
        """
        self.get_data_specs(model)[0].validate(data)
        _, mapping = self.get_composite_specs_and_mapping(model)
        per_cost_data = mapping.nest(data)

        result = OrderedDict()
        for position, sub_cost in enumerate(self.costs):
            piece = per_cost_data[position]
            try:
                sub_channels = sub_cost.get_monitoring_channels(
                    model, piece, **kwargs)
                result.update(sub_channels)
            except TypeError:
                message = ('SumOfCosts.get_monitoring_channels '
                           'encountered TypeError while calling {0}'
                           '.get_monitoring_channels').format(
                               type(sub_cost))
                reraise_as(Exception(message))

            term_value = sub_cost.expr(model, piece, **kwargs)
            if term_value is not None:
                label = getattr(term_value, 'name', None)
                key = 'term_' + str(position)
                if label is not None:
                    key = key + '_' + label
                result[key] = term_value

        return result

コード例 #6

0

ファイルを表示

def test_spatiotemporal_cubes():
    """
    Check that patches drawn by spatiotemporal_cubes eventually cover
    every element of every clip, with and without a trailing colour
    channel dimension.
    """
    def check_patch_coverage(files):
        rng = numpy.random.RandomState(1)
        inputs = [(name, array.shape) for name, array in six.iteritems(files)]
        shape = (5, 7, 7)
        for fname, index in spatiotemporal_cubes(inputs, shape, 50000, rng):
            clip = files[fname]
            cube = clip[index]
            if len(clip.shape) == 3:
                assert cube.shape == shape
            else:
                assert cube.shape[:3] == shape[:3]
            # Mark the elements touched by this patch.
            cube[...] = True
        for fname, array in six.iteritems(files):
            assert array.all()

    def make_files(trailing):
        # `trailing` holds the extra dimensions appended to every clip.
        return OrderedDict(
            file1=numpy.zeros((10, 30, 21) + trailing, dtype=bool),
            file2=numpy.zeros((15, 25, 28) + trailing, dtype=bool),
            file3=numpy.zeros((7, 18, 22) + trailing, dtype=bool),
        )

    check_patch_coverage(make_files(()))

    # Check that stuff still works with an extra color channel dimension.
    check_patch_coverage(make_files((3,)))

コード例 #7

0

ファイルを表示

    def get_gradients(self, model, data, **kwargs):
        """
        Return the coefficient-weighted sum of the sub-costs' gradients
        together with the merged updates.

        Parameters
        ----------
        model : object
            Passed to every sub-cost; `model.get_params()` lists the
            variables allowed in the gradient dictionaries.
        data : object
            Batch; `mapping.nest` produces one piece per sub-cost.
        kwargs : dict
            Forwarded unchanged to each sub-cost's `get_gradients`.

        Returns
        -------
        grads : OrderedDict
            Parameter -> sum over sub-costs of coefficient * gradient.
        updates : OrderedDict
            All updates returned by the sub-costs.

        Raises
        ------
        ValueError
            If a gradient dictionary contains a variable that is not in
            `model.get_params()`.
        """
        _, mapping = self.get_composite_specs_and_mapping(model)
        nested_data = mapping.nest(data)
        indiv_results = [cost.get_gradients(model, cost_data, **kwargs)
                         for cost, cost_data
                         in safe_zip(self.costs, nested_data)]

        grads = OrderedDict()
        updates = OrderedDict()
        params = model.get_params()

        for coeff, (g, u) in zip(self.coeffs, indiv_results):
            # Reject foreign variables before touching the accumulators.
            for param in g:
                if param not in params:
                    raise ValueError("A shared variable (" + str(param) +
                                     ") that is not a parameter appeared "
                                     "in a cost gradient dictionary.")
            for param in g:
                assert param.ndim == g[param].ndim
                v = coeff * g[param]
                if param in grads:
                    grads[param] = grads[param] + v
                else:
                    grads[param] = v
                assert grads[param].ndim == param.ndim
            # No state may be updated by two sub-costs, and no parameter
            # may be updated as a side effect of computing gradients.
            assert not any(state in updates for state in u)
            assert not any(state in params for state in u)
            updates.update(u)

        return grads, updates

コード例 #8

0

ファイルを表示

ファイル: rnn.py プロジェクト: wanasit/pylearn2

 def __init__(self, dim, layer_name, irange, indices=None,
              init_bias=0., nonlinearity=tensor.tanh,
              weight_noise=False, **kwargs):
     """
     Store the constructor arguments on the instance.

     Every argument is copied onto `self` under its own name through
     `locals()`, so no local variable may be introduced before that
     call without also becoming an attribute.
     """
     # Optional 'noise_std_dev' keyword: removed from kwargs, default .075.
     self._std_dev = kwargs.pop('noise_std_dev', .075)
     self.rnn_friendly = True
     self._scan_updates = OrderedDict()
     # Copies all arguments, the remaining `kwargs` dict and `self`
     # itself into the instance dictionary.
     self.__dict__.update(locals())
     # Drop the self-reference that locals() brought along.
     del self.self
     super(Recurrent, self).__init__()
     # The noise standard deviation is only kept when weight noise is on.
     if not self.weight_noise:
         self._std_dev = None

コード例 #9

0

ファイルを表示

 def __init__(self, model):
     """
     Create one shared running mean per model parameter and compile
     `self.avg`, a function that applies one averaging step.

     Each call to `self.avg` moves every mean by (param - mean) / t and
     then increments the step counter t, which starts at 1.
     """
     step = sharedX(1.)
     running_updates = OrderedDict()
     self.param_to_mean = OrderedDict()
     for parameter in model.get_params():
         running_mean = sharedX(parameter.get_value())
         assert type(running_mean) == type(parameter)
         self.param_to_mean[parameter] = running_mean
         running_updates[running_mean] = \
             running_mean - (running_mean - parameter) / step
         running_updates[step] = step + 1.
     self.avg = function([], updates=running_updates)

コード例 #10

0

ファイルを表示

ファイル: ebm_estimation.py プロジェクト: yo-ga/TextDetector

    def get_gradients(self, model, data, **kwargs):
        """
        Differentiate the cost returned by `self._cost` with respect to
        the model parameters, holding the second value returned by
        `self._cost` constant.

        Returns
        -------
        gradients : OrderedDict
            Maps each parameter to its gradient.
        updates : OrderedDict
            Always empty.
        """
        cost, neg_v = self._cost(model, data, **kwargs)

        parameters = list(model.get_params())
        param_grads = T.grad(cost, parameters,
                             disconnected_inputs='ignore',
                             consider_constant=[neg_v])

        return OrderedDict(izip(parameters, param_grads)), OrderedDict()

コード例 #11

0

ファイルを表示

ファイル: ebm_estimation.py プロジェクト: ASAPPinc/pylearn2

    def get_gradients(self, model, data, **kwargs):
        """
        Differentiate `self._cost` with respect to the model parameters,
        treating the sampler's particles as constants.

        Returns
        -------
        gradients : OrderedDict
            Maps each parameter to its gradient.
        updates : OrderedDict
            The updates reported by `self.sampler.updates()`.
        """
        cost = self._cost(model, data, **kwargs)

        parameters = list(model.get_params())
        param_grads = T.grad(cost, parameters,
                             disconnected_inputs='ignore',
                             consider_constant=[self.sampler.particles])
        gradients = OrderedDict(izip(parameters, param_grads))

        # The sampler's updates have to run whenever gradients are computed.
        updates = OrderedDict()
        updates.update(self.sampler.updates())
        return gradients, updates

コード例 #12

0

ファイルを表示

ファイル: ebm_estimation.py プロジェクト: yo-ga/TextDetector

    def get_gradients(self, model, data, **kwargs):
        """
        Compute parameter gradients of `self._cost`, with the sampler's
        particles held constant, and return them along with the
        sampler's updates.

        Returns
        -------
        gradients : OrderedDict
            Parameter -> gradient.
        updates : OrderedDict
            Whatever `self.sampler.updates()` returns, copied into a
            fresh dictionary.
        """
        cost = self._cost(model, data, **kwargs)
        model_params = list(model.get_params())

        grad_list = T.grad(cost, model_params,
                           disconnected_inputs='ignore',
                           consider_constant=[self.sampler.particles])
        gradients = OrderedDict(izip(model_params, grad_list))

        updates = OrderedDict()
        sampler_updates = self.sampler.updates()
        updates.update(sampler_updates)

        return gradients, updates

コード例 #13

0

ファイルを表示

 def __init__(self, inputs, outputs=None, updates=None):
     """
     Compile a function whose outputs and updates are scaled by
     batch_size / total_examples.

     Parameters
     ----------
     inputs : sequence
         Input variables; the batch size is read from the first axis of
         `inputs[0]`.
     outputs : variable or list of variables, optional
         Each output is multiplied by batch_size / total_examples.
     updates : dict-like, optional
         Maps variables to update terms. Each variable is incremented by
         (batch_size / total_examples) * term rather than overwritten.

     NOTE(review): the scaling looks designed so that accumulating over
     all batches gives an example-weighted average -- confirm against
     the callers.
     """
     # Batch size is taken from the leading axis of the first input.
     batch_size = T.cast(inputs[0].shape[0], 'float32')
     # Extra symbolic scalar, appended to the compiled function's inputs.
     total_examples = T.scalar()
     transformed_updates = OrderedDict()
     self.has_updates = updates is not None
     if self.has_updates:
         # `_clear` resets every updated variable to 0. * var.
         self._clear = function([],
                                updates=[(var, 0. * var)
                                         for var in updates])
         for var in updates:
             update = updates[var]
             transformed_updates[var] = var + \
                 (batch_size / total_examples) * update
     # True for every input that exposes `get_value`.
     self._shared_mask = [hasattr(elem, 'get_value') for elem in inputs]
     true_inputs = self._true_inputs(inputs)
     self._shared = self._shared_inputs(inputs)
     if outputs is not None:
         # Accept a single output as well as a list of outputs.
         if not isinstance(outputs, list):
             outputs = [outputs]
         outputs = [
             output * (batch_size / total_examples) for output in outputs
         ]
     self._func = function(true_inputs + [total_examples],
                           outputs=outputs,
                           updates=transformed_updates)

コード例 #14

0

ファイルを表示

    def get_monitoring_channels(self, data):
        """
        Return the monitoring channels of this model.

        This implementation only validates `data` and reports no
        channels.

        Parameters
        ----------
        data : tensor_like, or (possibly nested) tuple of tensor_likes,
            The data the monitored quantities are computed on (e.g. a
            validation set). See `self.get_monitoring_data_specs()`.

        Returns
        -------
        channels : OrderedDict
            Maps channel names (strings) to symbolic values that depend
            on the variables in `data`.

        Notes
        -----
        Channel names are free-form, but should avoid colliding with the
        names used by the training Cost and others. Anything worth
        watching during training can be added here, ideally controlled
        by a configuration option of the model.
        """
        data_space, _ = self.get_monitoring_data_specs()
        data_space.validate(data)
        channels = OrderedDict()
        return channels

コード例 #15

0

ファイルを表示

ファイル: cascade.py プロジェクト: baucheng/facedet

    def get_monitoring_channels(self, data):
        """
        Collect the monitoring channels of every model in the cascade.

        Model `i` receives `(data[i], Y)`, where `Y` is `data[-1]` when
        `self.monitor_targets` is set and None otherwise. Its channels
        are stored under 'cascade_<i>_<key>'. When targets are
        monitored, a 'misclass' channel is added as well.

        Parameters
        ----------
        data : sequence
            One entry per model, followed by the targets when
            `self.monitor_targets` is set.

        Returns
        -------
        OrderedDict
            Maps channel names to channel values.
        """
        rval = OrderedDict()
        for i, model in enumerate(self.models):
            Y = data[-1] if self.monitor_targets else None

            ch = model.get_monitoring_channels((data[i], Y))
            for key in ch:
                rval["cascade_" + str(i) + '_' + key] = ch[key]

            # The misclassification rate does not depend on `i`; build
            # the expression once instead of once per model.
            if Y is not None and 'misclass' not in rval:
                state = self.fprop(data[0:-1])
                # Threshold Y_hat at 0.5.
                prediction = T.gt(state, 0.5)
                # If even one feature is wrong for a given training example,
                # it's considered incorrect, so we max over columns.
                incorrect = T.neq(Y, prediction).max(axis=1)
                rval['misclass'] = T.cast(incorrect, config.floatX).mean()
        return rval

コード例 #16

0

ファイルを表示

    def on_monitor(self, model, dataset, algorithm):
        """
        Start Polyak averaging when the start count is reached, and
        afterwards periodically save the model with averaged parameters.

        When `self._count` equals `self.start`, a `_PolyakWorker` is
        created and registered as an update callback of `algorithm`.
        On later calls, every `self.save_freq` counts and only if
        `self.save_path` is set, the parameters are temporarily replaced
        by their averages, the model is saved, and the original values
        are restored. The count is incremented on every call.

        Parameters
        ----------
        model : a Model instance
        dataset : Dataset
            Not used by this method.
        algorithm : object
            Must provide `update_callbacks`; `monitoring_dataset` is
            read when the model supports `add_polyak_channels`.
        """
        if self._count == self.start:
            worker = _PolyakWorker(model)
            self._worker = worker
            algorithm.update_callbacks.append(worker)
            # HACK: models without add_polyak_channels are skipped.
            try:
                model.add_polyak_channels(worker.param_to_mean,
                                          algorithm.monitoring_dataset)
            except AttributeError:
                pass
        elif (self.save_path is not None
              and self._count > self.start
              and self._count % self.save_freq == 0):
            # Swap in the averaged values, save, then swap back.
            backup = OrderedDict()
            for param in model.get_params():
                backup[param] = param.get_value()
                averaged = self._worker.param_to_mean[param]
                param.set_value(averaged.get_value())
            serial.save(self.save_path, model)
            for param in model.get_params():
                param.set_value(backup[param])
        self._count += 1

コード例 #17

0

ファイルを表示

ファイル: multilinear_mlp.py プロジェクト: se4u/grafl_public

 def get_layer_monitoring_channels(self, state_below=None,
                                   state=None, targets=None):
     """
     Return an empty channel dictionary after checking that the
     transformer has exactly one parameter and that it is 3-dimensional.

     The arguments are accepted but not used.
     """
     (weights,) = self.transformer.get_params()
     assert weights.ndim == 3
     # No quantities are monitored for this layer at the moment.
     return OrderedDict()

コード例 #18

0

ファイルを表示

ファイル: __init__.py プロジェクト: jamessergeant/pylearn2

    def __call__(self, inputs):
        """
        Run one step of the DBM's sampling procedure with the visible
        layer clamped to `inputs`, and return the upward state of the
        last layer.

        Parameters
        ----------
        inputs : batch
            A batch in the DBM's input space.

        Returns
        -------
        The value of `upward_state` of the last layer, applied to that
        layer's sampled state.
        """
        dbm = self.dbm
        num_examples = dbm.get_input_space().batch_size(inputs)
        top_layer = dbm.get_all_layers()[-1]

        chains = dbm.make_layer_to_symbolic_state(
            num_examples, self.theano_rng)
        # The examples are used to initialize the visible layer's chains
        chains[dbm.visible_layer] = inputs

        chains = dbm.sampling_procedure.sample(
            layer_to_state=chains,
            theano_rng=self.theano_rng,
            layer_to_clamp=OrderedDict([(dbm.visible_layer, True)]),
            num_steps=1)

        return top_layer.upward_state(chains[top_layer])

コード例 #19

0

ファイルを表示

ファイル: rnn.py プロジェクト: MarCnu/pylearn2

 def __init__(self, dim, layer_name, irange, indices=None,
              init_bias=0., svd=True, nonlinearity=tensor.tanh):
     """
     Store the constructor arguments on the instance.

     Every argument is copied onto `self` under its own name through
     `locals()`, so no local variable may be introduced before that
     call without also becoming an attribute.
     """
     self.rnn_friendly = True
     self._scan_updates = OrderedDict()
     # Copies all arguments and `self` itself into the instance dict.
     self.__dict__.update(locals())
     # Drop the self-reference that locals() brought along.
     del self.self
     super(Recurrent, self).__init__()

コード例 #20

0

ファイルを表示

ファイル: average_agent.py プロジェクト: yo-ga/TextDetector

    def get_learn_func(self):
        """
        Build a theano function taking an action and a reward that
        updates the agent from that single experience.

        The observation count of the action is incremented and its
        estimated reward is moved towards the observed reward by
        (reward - estimate) / new_count.
        """
        action = T.iscalar()
        reward = T.scalar()

        previous_estimate = self.estimated_rewards[action]
        new_count = self.observation_counts[action] + 1.
        new_estimate = \
            previous_estimate + (reward - previous_estimate) / new_count

        # Write the new entries back into the full shared vectors.
        updates = OrderedDict()
        updates[self.estimated_rewards] = T.set_subtensor(
            self.estimated_rewards[action], new_estimate)
        updates[self.observation_counts] = T.set_subtensor(
            self.observation_counts[action], new_count)

        return function([action, reward], updates=updates)

コード例 #21

0

ファイルを表示

    def get_monitoring_channels(self, model, data, **kwargs):
        """
        Return a dictionary mapping channel names to expressions for
        channel values.

        This implementation validates `data` against the cost's data
        space and reports no channels.

        .. todo::

            how do you do prereqs in this setup? (I think PL changed
            it, not sure if there still is a way in this context)

        Parameters
        ----------
        model : Model
            the model to use to compute the monitoring channels
        data : batch
            (a member of self.get_data_specs()[0])
            symbolic expressions for the monitoring data
        kwargs : dict
            used so that custom algorithms can use extra variables
            for monitoring.

        Returns
        -------
        rval : dict
            Maps channels names to expressions for channel values.
        """
        data_space = self.get_data_specs(model)[0]
        data_space.validate(data)
        rval = OrderedDict()
        return rval

コード例 #22

0

ファイルを表示

    def monitoring_channels_from_prior_params(self):
        """
        Return monitoring channels derived from the parameters of the
        prior distribution.

        This implementation computes no channel and returns an empty
        dictionary.
        """
        channels = OrderedDict()
        return channels

コード例 #23

0

ファイルを表示

ファイル: __init__.py プロジェクト: yo-ga/TextDetector

    def get_monitoring_channels(self, data):
        """
        Return the reconstruction error plus the channels reported by
        the posterior, conditional and prior objects.

        Notes
        -----
        Monitors quantities related to the approximate posterior
        parameters phi and the conditional and prior parameters theta.
        """
        space, source = self.get_monitoring_data_specs()
        space.validate(data)

        channels = OrderedDict()

        # Encode, sample a latent code and decode it again.
        noise = self.sample_from_epsilon(shape=(data.shape[0], self.nhid))
        phi = self.encode_phi(data)
        latent = self.sample_from_q_z_given_x(epsilon=noise, phi=phi)
        theta = self.decode_theta(latent)

        reconstruction = self.means_from_theta(theta)
        channels["reconstruction_mse"] = T.sqr(data - reconstruction).mean()

        safe_update(
            channels,
            self.posterior.monitoring_channels_from_conditional_params(phi))
        safe_update(
            channels,
            self.conditional.monitoring_channels_from_conditional_params(
                theta))
        safe_update(channels,
                    self.prior.monitoring_channels_from_prior_params())

        return channels

コード例 #24

0

ファイルを表示

ファイル: gsn.py プロジェクト: yusuke0519/pylearn2

    def get_monitoring_channels(self, model, data, **kwargs):
        """
        Provide monitoring of the individual costs that are being added
        together.

        With more than one cost, the totals of the first two costs are
        reported as 'reconstruction_cost' and 'classification_cost'.
        With a single cost nothing is reported, since there is nothing
        to split up.

        This is a useful method to override when more quantities about
        the model need to be monitored.
        """
        self.get_data_specs(model)[0].validate(data)

        channels = OrderedDict()
        if len(self.costs) <= 1:
            return channels

        output = self._get_samples_from_model(model, data)
        channel_names = ('reconstruction_cost', 'classification_cost')
        for idx, channel_name in enumerate(channel_names):
            channels[channel_name] = self._get_total_for_cost(
                idx, self.costs[idx][2], data, output)

        return channels

コード例 #25

0

ファイルを表示

ファイル: video.py プロジェクト: robbaran/pylearn2

def spatiotemporal_cubes(file_tuples, shape, n_patches=numpy.inf, rng=None):
    """
    Generator function that yields a stream of (filename, slicetuple)
    representing a spatiotemporal patch of that file.

    All random draws (frame, row and column) come from `rng`, so a
    seeded generator produces a reproducible stream of patches.

    Parameters
    ----------
    file_tuples : list of tuples
        Each element should be a 2-tuple consisting of a filename
        (or arbitrary identifier) and a (length, height, width)
        shape tuple of the dimensions (number of frames in the video,
        height and width of each frame). Only the first three entries
        of the shape tuple are used.

    shape : tuple
        A shape tuple consisting of the desired (length, height, width)
        of each spatiotemporal patch.

    n_patches : int, optional
        The number of patches to generate. By default, generates patches
        infinitely.

    rng : RandomState object or seed, optional
        The random number generator (or seed) to use; it is passed
        through `make_np_rng`.

    Returns
    -------
    generator : generator object
        A generator that yields a stream of (filename, slicetuple) tuples.
        The slice tuple is such that it indexes into a 3D array containing
        the entire clip with frames indexed along the first axis, rows
        along the second and columns along the third.
    """
    frame_lookup = FrameLookup([(a, b[0]) for a, b in file_tuples])
    file_lookup = OrderedDict(file_tuples)
    patch_length, patch_height, patch_width = shape
    done = 0
    rng = make_np_rng(rng, which_method="random_integers")
    while done < n_patches:
        frame = rng.random_integers(0, len(frame_lookup) - 1)
        filename, file_length, frame_no = frame_lookup[frame]
        # Check that there is a contiguous block of frames starting at
        # frame_no that is at least as long as our desired cube length.
        if file_length - frame_no < patch_length:
            continue
        _, video_height, video_width = file_lookup[filename][:3]
        # The last row and column in which a patch could "start" to still
        # fall within frame.
        last_row = video_height - patch_height
        last_col = video_width - patch_width
        # Draw from `rng`, not the global numpy.random state, so that
        # the caller's seed controls the spatial offsets too.
        row = rng.random_integers(0, last_row)
        col = rng.random_integers(0, last_col)
        patch_slice = (slice(frame_no, frame_no + patch_length),
                       slice(row, row + patch_height),
                       slice(col, col + patch_width))
        done += 1
        yield filename, patch_slice

コード例 #26

0

ファイルを表示

 def enforce_constraints(self):
     """
     Enforces all constraints encoded by self.modify_updates.

     An update dictionary mapping every parameter to itself is passed
     to `self.modify_updates`, and the resulting updates are compiled
     into a function that is run once.
     """
     parameters = self.get_params()
     identity_updates = OrderedDict(
         izip_no_length_check(parameters, parameters))
     self.modify_updates(identity_updates)
     function([], updates=identity_updates)()

コード例 #27

0

ファイルを表示

ファイル: dbm.py プロジェクト: zuiwufenghua/pylearn2

    def get_monitoring_channels(self, data):
        """
        Build the monitoring channels of the model.

        The data is validated, mean field inference is run with its
        history recorded, and channels are gathered from the visible
        layer (prefix 'vis_'), from every hidden layer (prefix
        '<layer_name>_') and from every hidden layer's final mean field
        state (prefix 'mf_<layer_name>_'). When the history has more
        than one entry, channels measuring the change between the last
        two entries are added as well.

        Parameters
        ----------
        data : batch
            A batch valid for the space returned by
            `self.get_monitoring_data_specs()`.

        Returns
        -------
        rval : OrderedDict
            Maps channel names to channel values.
        """
        space, source = self.get_monitoring_data_specs()
        space.validate(data)
        X = data
        history = self.mf(X, return_history=True)
        # The last history entry is the final mean field state.
        q = history[-1]

        rval = OrderedDict()

        ch = self.visible_layer.get_monitoring_channels()
        for key in ch:
            rval['vis_' + key] = ch[key]

        for state, layer in safe_zip(q, self.hidden_layers):
            ch = layer.get_monitoring_channels()
            for key in ch:
                rval[layer.layer_name + '_' + key] = ch[key]
            ch = layer.get_monitoring_channels_from_state(state)
            for key in ch:
                rval['mf_' + layer.layer_name + '_' + key] = ch[key]

        if len(history) > 1:
            prev_q = history[-2]

            flat_q = flatten(q)
            flat_prev_q = flatten(prev_q)

            # Largest absolute element-wise change between the last two
            # history entries, over all flattened state variables.
            mx = None
            for new, old in safe_zip(flat_q, flat_prev_q):
                cur_mx = abs(new - old).max()
                # Two consecutive entries must never be the same object.
                if new is old:
                    logger.error('{0} is {1}'.format(new, old))
                    assert False
                if mx is None:
                    mx = cur_mx
                else:
                    mx = T.maximum(mx, cur_mx)

            rval['max_var_param_diff'] = mx

            # Per layer: summed absolute change divided by
            # batch_size * total dimension of the layer's state space.
            for layer, new, old in safe_zip(self.hidden_layers,
                                            q, prev_q):
                sum_diff = 0.
                for sub_new, sub_old in safe_zip(flatten(new), flatten(old)):
                    sum_diff += abs(sub_new - sub_old).sum()
                denom = self.batch_size * \
                    layer.get_total_state_space().get_total_dimension()
                denom = np.cast[config.floatX](denom)
                rval['mean_'+layer.layer_name+'_var_param_diff'] = \
                    sum_diff / denom

        return rval

コード例 #28

0

ファイルを表示

ファイル: conditional.py プロジェクト: yo-ga/TextDetector

    def monitoring_channels_from_conditional_params(self, conditional_params):
        """
        Return the minimum, maximum, mean and standard deviation of
        sigma = exp(log_sigma) as monitoring channels.

        Parameters
        ----------
        conditional_params : pair
            Unpacked as (mu, log_sigma); only log_sigma is used.

        Returns
        -------
        OrderedDict
            Keys are `self.name` followed by '_sigma_min', '_sigma_max',
            '_sigma_mean' and '_sigma_std', in that order.
        """
        channels = OrderedDict()

        mu, log_sigma = conditional_params
        for statistic in ('min', 'max', 'mean', 'std'):
            reduce_sigma = getattr(T.exp(log_sigma), statistic)
            channels[self.name + '_sigma_' + statistic] = reduce_sigma()

        return channels

コード例 #29

0

ファイルを表示

    def get_layer_monitoring_channels(self,
                                      state_below=None,
                                      state=None,
                                      targets=None):
        """
        Collect the monitoring channels of the inner layers, unless
        monitoring is switched off.

        When `self.use_monitoring_channels` is false, an empty
        dictionary is returned. Otherwise `state_below` is propagated
        through `self.layers` and every layer's channels are stored
        under '<layer_name>_<key>'.

        Parameters
        ----------
        state_below : batch, optional
            Input of the first layer.
        state : batch, optional
            Ignored on input; the name is reused for the running
            activation.
        targets : batch, optional
            Passed to the last layer's channel method when not None.

        Returns
        -------
        rval : OrderedDict
            Maps channel names to channel values. Each value's
            `__doc__` is overwritten with a note naming the layer it
            came from.

        Raises
        ------
        TypeError
            If a layer's `get_layer_monitoring_channels` does not return
            an OrderedDict.
        """

        rval = OrderedDict()
        if self.use_monitoring_channels:
            state = state_below
            # Keep the original input for the x shortcut.
            x = state
            state_conc = None

            for layer in self.layers:
                # We don't go through all the inner layers recursively
                state_below = state
                # x shortcut: every layer except the first and the last
                # gets its input combined with the original input.
                # NOTE(review): the meaning of the trailing (2, 1)
                # arguments of create_shortcut_batch is not visible here.
                if ((self.x_shortcut and layer is not self.layers[0]
                     and layer is not self.layers[-1])):
                    state = self.create_shortcut_batch(state, x, 2, 1)
                # y shortcut: the last layer reads the accumulated
                # outputs of the earlier layers instead of the running
                # state.
                if self.y_shortcut and layer is self.layers[-1]:
                    state = layer.fprop(state_conc)
                else:
                    state = layer.fprop(state)
                # Accumulate the outputs of all layers but the last.
                if self.y_shortcut and layer is not self.layers[-1]:
                    if layer is self.layers[0]:
                        state_conc = state
                    else:
                        state_conc = self.create_shortcut_batch(
                            state_conc, state, 2)
                args = [state_below, state]
                if layer is self.layers[-1] and targets is not None:
                    args.append(targets)
                ch = layer.get_layer_monitoring_channels(*args)
                if not isinstance(ch, OrderedDict):
                    raise TypeError(str((type(ch), layer.layer_name)))
                for key in ch:
                    value = ch[key]
                    doc = get_monitor_doc(value)
                    # Fall back to a generic note when the channel has
                    # no documentation of its own.
                    if doc is None:
                        doc = str(type(layer)) + \
                            ".get_monitoring_channels_from_state did" + \
                            " not provide any further documentation for" + \
                            " this channel."
                    doc = 'This channel came from a layer called "' + \
                        layer.layer_name + '" of an MLP.\n' + doc
                    value.__doc__ = doc
                    rval[layer.layer_name + '_' + key] = value

        return rval

コード例 #30

0

ファイルを表示

    def get_layer_monitoring_channels(self,
                                      state_below=None,
                                      state=None,
                                      targets=None):
        """
        Return channels describing the weight norms and, when an
        activation is available, statistics of the layer's output.

        Parameters
        ----------
        state_below : batch, optional
            Layer input; used to compute `state` through `self.fprop`
            when `state` is not given.
        state : batch, optional
            Layer output. With a `SequenceSpace` input space it is
            unpacked as a pair and only the first element is used.
        targets : batch, optional
            Not used by this method.

        Returns
        -------
        rval : OrderedDict
            The min/mean/max of the row and column norms of W and U.
            When `state` or `state_below` is given, also the max, mean
            and min over units of the per-unit range, max, mean and min
            of the state along axis 0.
        """
        W, U, _ = self._params
        sq_W = tensor.sqr(W)
        sq_U = tensor.sqr(U)
        norms = [('W_row_norms', tensor.sqrt(sq_W.sum(axis=1))),
                 ('W_col_norms', tensor.sqrt(sq_W.sum(axis=0))),
                 ('U_row_norms', tensor.sqrt(sq_U.sum(axis=1))),
                 ('U_col_norms', tensor.sqrt(sq_U.sum(axis=0)))]

        rval = OrderedDict()
        for prefix, values in norms:
            for stat in ('min', 'mean', 'max'):
                rval[prefix + '_' + stat] = getattr(values, stat)()

        if (state is not None) or (state_below is not None):
            if state is None:
                state = self.fprop(state_below)
            if isinstance(self.input_space, SequenceSpace):
                # Only the first element of the pair is needed.
                # `state_below` is not unpacked: it may legitimately be
                # None here and is not used below.
                state, _ = state

            mx = state.max(axis=0)
            mean = state.mean(axis=0)
            mn = state.min(axis=0)
            rg = mx - mn

            per_unit = [('range', rg), ('max', mx),
                        ('mean', mean), ('min', mn)]
            for label, values in per_unit:
                for stat in ('max', 'mean', 'min'):
                    key = label + '_x_' + stat + '_u'
                    rval[key] = getattr(values, stat)()

        return rval

コード例 #31

0

ファイルを表示

ファイル: rnn.py プロジェクト: dwf/pylearn2

 def __init__(self, dim, layer_name, irange, indices=None,
              init_bias=0., nonlinearity=tensor.tanh,
              weight_noise=False, **kwargs):
     """
     Store the layer configuration on the instance.

     Every constructor argument (including the leftover `kwargs` dict)
     is copied verbatim into `self.__dict__`.

     Parameters
     ----------
     dim, layer_name, irange, indices, init_bias, nonlinearity
         Stored as attributes of the same name; not interpreted here.
     weight_noise : bool, optional
         When falsy, `self._std_dev` is reset to None.
     kwargs : dict
         `noise_std_dev` is popped (default .075) into `self._std_dev`;
         whatever remains is stored as `self.kwargs`.
     """
     # Must be popped before locals() is captured below so the key does
     # not linger inside the stored `kwargs` dict.
     self._std_dev = kwargs.pop('noise_std_dev', .075)
     self.rnn_friendly = True
     self._scan_updates = OrderedDict()
     # Copies all local names (the arguments, plus `self` and `kwargs`)
     # onto the instance; adding a local above this line would leak it
     # into the instance attributes as well.
     self.__dict__.update(locals())
     # locals() contained `self`; drop the self-reference again.
     del self.self
     super(Recurrent, self).__init__()
     if not self.weight_noise:
         self._std_dev = None

コード例 #32

0

ファイルを表示

ファイル: vae.py プロジェクト: yo-ga/TextDetector

    def get_monitoring_channels(self, model, data, **kwargs):
        """
        Report the two terms of the model's log-likelihood lower bound.

        Parameters
        ----------
        model : object
            Must provide `log_likelihood_lower_bound`.
        data : object
            A batch validated against the space of `get_data_specs(model)`.
        kwargs : dict
            Accepted for interface compatibility; not used.

        Returns
        -------
        channels : OrderedDict
            `kl_divergence_term` is the mean of the first returned term and
            `expectation_term` the negated mean of the second.
        """
        data_space, _ = self.get_data_specs(model)
        data_space.validate(data)

        terms = model.log_likelihood_lower_bound(
            data, self.num_samples, return_individual_terms=True)

        channels = OrderedDict()
        channels['kl_divergence_term'] = terms[0].mean()
        channels['expectation_term'] = -terms[1].mean()
        return channels

コード例 #33

0

ファイルを表示

    def __init__(self, model):
        """
        Set up an empty monitor for `model`.

        Parameters
        ----------
        model : object
            The model to monitor; stored as `self.model`.
        """
        self.model = model
        self.training_succeeded = False
        self.on_channel_conflict = 'error'

        # No channels and no progress recorded yet.
        self.channels = OrderedDict()
        self._num_batches_seen = self._examples_seen = self._epochs_seen = 0

        # Parallel per-dataset lists.
        self._datasets = []
        self._iteration_mode = []
        self._batch_size = []
        self._num_batches = []
        self._rng_seed = []

        # Theano function state; dirty until functions are compiled.
        self._dirty = True
        self.theano_function_mode = None
        self.names_to_del = ['theano_function_mode']

        self.t0 = time.time()

        # Fill in self._nested_data_specs, self._data_specs_mapping
        # and self._flat_data_specs.
        self._build_data_specs()

コード例 #34

0

ファイルを表示

    def get_lr_scalers(self):
        """
        Describe the per-parameter rescaling of the learning rate.

        Returns
        -------
        lr_scalers : OrderedDict
            Maps model parameters to floats by which the learning rate is
            multiplied. Parameters missing from the dictionary use the
            global learning rate unchanged. This implementation returns
            an empty dictionary, i.e. no parameter is rescaled.
        """
        no_scaling = OrderedDict()
        return no_scaling

コード例 #35

0

ファイルを表示

ファイル: dbm.py プロジェクト: HBadertscher/pylearn2

    def get_lr_scalers(self):
        """
        Merge the learning rate scalers of all layers into one dictionary.

        The hidden layers are visited first, then the visible layer.

        Returns
        -------
        rval : OrderedDict
            Maps parameters to float learning rate multipliers.

        Raises
        ------
        AssertionError
            If two layers scale the same key, if a key is not one of
            `self.get_params()`, or if a scaler is not a float.
        """
        model_params = self.get_params()
        merged = OrderedDict()

        all_layers = self.hidden_layers + [self.visible_layer]
        for layer in all_layers:
            layer_scalers = layer.get_lr_scalers()

            # No two layers can contend to scale a parameter
            assert not any(key in merged for key in layer_scalers)
            # Don't try to scale anything that's not a parameter
            assert all(key in model_params for key in layer_scalers)

            merged.update(layer_scalers)

        assert all(isinstance(scale, float) for scale in merged.values())
        return merged

コード例 #36

0

ファイルを表示

ファイル: dbm.py プロジェクト: zuiwufenghua/pylearn2

    def get_lr_scalers(self):
        """
        Gather every layer's learning rate scalers.

        Returns
        -------
        rval : OrderedDict
            Union of the dictionaries returned by `get_lr_scalers` of each
            hidden layer followed by the visible layer.

        Raises
        ------
        AssertionError
            When a key is claimed by more than one layer, is not among
            `self.get_params()`, or maps to a non-float value.
        """
        combined = OrderedDict()
        known_params = self.get_params()

        for current in self.hidden_layers + [self.visible_layer]:
            contribution = current.get_lr_scalers()

            # No two layers can contend to scale a parameter
            clashes = [key in combined for key in contribution]
            assert not any(clashes)
            # Don't try to scale anything that's not a parameter
            recognised = [key in known_params for key in contribution]
            assert all(recognised)

            combined.update(contribution)

        are_floats = [isinstance(val, float) for val in combined.values()]
        assert all(are_floats)

        return combined

コード例 #37

0

ファイルを表示

ファイル: learning_rule.py プロジェクト: nitbix/pylearn2

 def __init__(self, decrease_rate=0.5, increase_rate=1.2,
              min_rate=1e-6, max_rate=50):
     """
     Store the step-size adaptation settings of the rule.

     Parameters
     ----------
     decrease_rate : float, optional
         Must be below 1; wrapped with `sharedX`.
     increase_rate : float, optional
         Must be above 1; wrapped with `sharedX`.
     min_rate : float, optional
         Stored as `self.min_rate`.
     max_rate : float, optional
         Stored as `self.max_rate`.
     """
     assert increase_rate > 1.
     assert decrease_rate < 1.

     # Plain Python settings.
     self.min_rate, self.max_rate = min_rate, max_rate
     self.zeros = OrderedDict()

     # Rates are wrapped with sharedX under fixed names.
     self.decrease_rate = sharedX(decrease_rate, 'decrease_rate')
     self.increase_rate = sharedX(increase_rate, 'increase_rate')

コード例 #38

0

ファイルを表示

ファイル: learning_rule.py プロジェクト: nitbix/pylearn2

 def __init__(self, decrease_rate=0.5, increase_rate=1.2,
              min_rate=1e-6, max_rate=50, switching_threshold=1e-6):
     """
     Store the step-size adaptation settings of the rule.

     Parameters
     ----------
     decrease_rate : float, optional
         Must be below 1; wrapped with `sharedX`.
     increase_rate : float, optional
         Must be above 1; wrapped with `sharedX`.
     min_rate : float, optional
         Stored as `self.min_rate`.
     max_rate : float, optional
         Stored as `self.max_rate`.
     switching_threshold : float, optional
         Stored as `self.switching_threshold`.
     """
     assert increase_rate > 1.
     assert decrease_rate < 1.

     # Plain Python settings.
     self.min_rate, self.max_rate = min_rate, max_rate
     self.switching_threshold = switching_threshold

     # Four independent, initially empty bookkeeping dictionaries.
     self.epsilons = OrderedDict()
     self.gt_epsilons = OrderedDict()
     self.lt_epsilons = OrderedDict()
     self.eq_epsilons = OrderedDict()

     # Rates are wrapped with sharedX under fixed names.
     self.decrease_rate = sharedX(decrease_rate, 'decrease_rate')
     self.increase_rate = sharedX(increase_rate, 'increase_rate')

コード例 #39

0

ファイルを表示

ファイル: monitor.py プロジェクト: MarCnu/pylearn2

    def __init__(self, model):
        """
        Set up an empty monitor for `model`.

        Parameters
        ----------
        model : object
            The model to monitor; stored as `self.model`.
        """
        self.model = model
        self.training_succeeded = False

        # No channels and no progress recorded yet.
        self.channels = OrderedDict()
        self._num_batches_seen = self._examples_seen = self._epochs_seen = 0

        # Parallel per-dataset lists.
        self._datasets = []
        self._iteration_mode = []
        self._batch_size = []
        self._num_batches = []
        self._rng_seed = []

        # Theano function state; dirty until functions are compiled.
        self._dirty = True
        self.theano_function_mode = None
        self.names_to_del = ['theano_function_mode']

        self.t0 = time.time()

        # Fill in self._nested_data_specs, self._data_specs_mapping
        # and self._flat_data_specs.
        self._build_data_specs()

コード例 #40

0

ファイルを表示

ファイル: learning_rule.py プロジェクト: nitbix/pylearn2

class RMSProp(LearningRule):
    """
    Implements the RMSProp learning rule.

    The RMSProp learning rule is described by Hinton in `lecture 6
    <http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf>`
    of the Coursera Neural Networks for Machine Learning course.

    In short, Hinton suggests "[the] magnitude of the gradient can be very
    different for different weights and can change during learning.  This
    makes it hard to choose a global learning rate." RMSProp solves this
    problem by "[dividing] the learning rate for a weight by a running
    average of the magnitudes of recent gradients for that weight."


    Parameters
    ----------
    decay : float, optional
        Decay constant similar to that used in AdaDelta and Momentum methods.
    max_scaling: float, optional
        Restrict the RMSProp gradient scaling coefficient to values
        below `max_scaling`.

    Notes
    -----
    An instance of this LearningRule should only be used with one
    TrainingAlgorithm, and its get_updates method should be called
    only once. This is required in order to make the monitoring
    channels correctly report the moving averages.
    """

    def __init__(self, decay=0.9, max_scaling=1e5):
        assert 0. <= decay < 1.
        assert max_scaling > 0
        self.decay = sharedX(decay, 'decay')
        # Lower bound on the RMS denominator; caps the scaling of a
        # gradient at `max_scaling`.
        self.epsilon = 1. / max_scaling
        self.mean_square_grads = OrderedDict()

    @wraps(LearningRule.add_channels_to_monitor)
    def add_channels_to_monitor(self, monitor, monitoring_dataset):
        """
        The channels added are the min, mean, and max of the
        mean_square_grad of each parameter.
        """

        channel_mapping = {
            '_min': T.min,
            '_max': T.max,
            '_mean': T.mean
        }

        for mean_square_grad in self.mean_square_grads.values():
            for suffix, op in channel_mapping.items():
                monitor.add_channel(
                    name=(mean_square_grad.name + suffix),
                    ipt=None,
                    val=op(mean_square_grad),
                    data_specs=(NullSpace(), ''),
                    dataset=monitoring_dataset)
        return

    def get_updates(self, learning_rate, grads, lr_scalers=None):
        """
        Provides the symbolic (theano) description of the updates needed to
        perform this learning rule. See Notes for side-effects.

        Parameters
        ----------
        learning_rate : float
            Learning rate coefficient.
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients.
        lr_scalers : dict, optional
            A dictionary mapping from the model's parameters to a learning
            rate multiplier. Parameters that are absent, or all parameters
            when `lr_scalers` is None, use a multiplier of 1.

        Returns
        -------
        updates : OrderedDict
            A dictionary mapping from the old model parameters, to their new
            values after a single iteration of the learning rule.

        Raises
        ------
        ValueError
            If a parameter has no name.

        Notes
        -----
        This method has the side effect of storing the moving average
        of the square gradient in `self.mean_square_grads`. This is
        necessary in order for the monitoring channels to be able
        to track the value of these moving averages.
        Therefore, this method should only get called once for each
        instance of RMSProp.
        """
        # The signature advertises None as the default, so it must be
        # usable: treat it as "no parameter is rescaled".
        if lr_scalers is None:
            lr_scalers = {}

        updates = OrderedDict()
        for param in grads:

            # Fail before allocating anything for an unnamed parameter.
            if param.name is None:
                raise ValueError("Model parameters must be named.")

            # mean_squared_grad := E[g^2]_{t-1}
            mean_square_grad = sharedX(param.get_value() * 0.)
            mean_square_grad.name = 'mean_square_grad_' + param.name

            if param.name in self.mean_square_grads:
                warnings.warn("Calling get_updates more than once on the "
                              "gradients of `%s` may make monitored values "
                              "incorrect." % param.name)
            # Store variable in self.mean_square_grads for monitoring.
            self.mean_square_grads[param.name] = mean_square_grad

            # Accumulate gradient
            new_mean_squared_grad = (self.decay * mean_square_grad +
                                     (1 - self.decay) * T.sqr(grads[param]))

            # Compute update
            scaled_lr = lr_scalers.get(param, 1.) * learning_rate
            rms_grad_t = T.sqrt(new_mean_squared_grad)
            rms_grad_t = T.maximum(rms_grad_t, self.epsilon)
            delta_x_t = - scaled_lr * grads[param] / rms_grad_t

            # Apply update
            updates[mean_square_grad] = new_mean_squared_grad
            updates[param] = param + delta_x_t

        return updates

コード例 #41

0

ファイルを表示

ファイル: update_norm_monitor.py プロジェクト: Neuroglycerin/neukrill-net-tools

class UpdateNormMonitorLearningRule(LearningRule):

    """ Wraps an existing pylearn2 learning rule and adds monitor channels
        for the norms of the gradient based updates calculated during
        learning.
    """

    def __init__(self, base_learning_rule, decay=0.9):
        """
        Parameters
        ----------
        base_learning_rule : LearningRule
            The rule whose updates are computed and monitored.
        decay : float, optional
            Weight of the previous value in the moving average of updates.
        """
        self.base = base_learning_rule
        self.decay = decay
        self.mean_updates = OrderedDict()
        # hack to allow MomentumAdjustor to access momentum value
        if hasattr(self.base, 'momentum'):
            self.momentum = self.base.momentum

    def add_channels_to_monitor(self, monitor, monitoring_dataset):
        """
        Add norm channels for every tracked mean update, then let the
        wrapped rule add its own channels.

        Raises
        ------
        ValueError
            If a mean update has more than four dimensions.
        """
        channel_mapping = {
            '_min': T.min,
            '_max': T.max,
            '_mean': T.mean
        }

        def register(channel_name, channel_val):
            # Every channel shares the same input / data specs / dataset.
            monitor.add_channel(
                name=channel_name,
                ipt=None,
                val=channel_val,
                data_specs=(NullSpace(), ''),
                dataset=monitoring_dataset)

        def l2_norm(variable, axes):
            return T.sqrt(T.sum(T.sqr(variable), axis=axes))

        for mean_update in self.mean_updates.values():
            rank = mean_update.ndim
            base_name = mean_update.name

            if rank == 4:
                # rank-4 tensor (assuming stack of rank-3 convolutional
                # kernels)
                kernel_norms = l2_norm(mean_update, (1, 2, 3))
                for suffix, op in channel_mapping.items():
                    register(base_name + "_kernel_norm" + suffix,
                             op(kernel_norms))
            elif rank == 3:
                # rank-3 tensor (assuming stack of rank-2 conv layer biases)
                bias_norms = l2_norm(mean_update, (1, 2))
                for suffix, op in channel_mapping.items():
                    register(base_name + "_norm" + suffix, op(bias_norms))
            elif rank == 2:
                # rank-2 tensor (matrix)
                col_norms = l2_norm(mean_update, 0)
                row_norms = l2_norm(mean_update, 1)
                whole_norm = T.sqrt(T.sum(T.sqr(mean_update)))
                for suffix, op in channel_mapping.items():
                    register(base_name + "_col_norm" + suffix, op(col_norms))
                    register(base_name + "_row_norm" + suffix, op(row_norms))
                register(base_name + "_norm", whole_norm)
            elif rank == 1:
                # rank-1 tensor (vector)
                register(base_name + "_norm", l2_norm(mean_update, 0))
            elif rank == 0:
                # rank-0 tensor (scalar)
                register(base_name + "_norm", mean_update)
            else:
                # not sure which axes to sum over in this case
                raise ValueError(
                    'Mean update {0} has unexpected number of dimensions {1} ({2})'
                    .format(mean_update, mean_update.ndim, mean_update.shape))

        self.base.add_channels_to_monitor(monitor, monitoring_dataset)

        return

    def get_updates(self, learning_rate, grads, lr_scalers=None):
        """
        Return the wrapped rule's updates extended with one moving-average
        update per parameter.

        Raises
        ------
        ValueError
            If a parameter has no name.
        """
        updates = self.base.get_updates(learning_rate, grads, lr_scalers)
        keep = self.decay

        for param, _grad in six.iteritems(grads):

            mean_update = sharedX(param.get_value() * 0.)

            param_name = param.name
            if param_name is None:
                raise ValueError("Model parameters must be named.")
            mean_update.name = 'mean_update_' + param_name

            if param_name in self.mean_updates:
                warnings.warn(
                    "Calling get_updates more than once on the gradients "
                    "of `%s` may make monitored values incorrect."
                    % param_name)

            # Store variable in self.mean_updates for monitoring.
            self.mean_updates[param_name] = mean_update

            # Moving average of the step the wrapped rule takes.
            step = updates[param] - param
            updates[mean_update] = keep * mean_update + (1 - keep) * step

        return updates

コード例 #42

0

ファイルを表示

ファイル: cascade.py プロジェクト: ballasn/facedet

 def get_lr_scalers(self, model_idx=-1):
     """
     Merge the learning rate scalers of every model in `self.models`.

     Parameters
     ----------
     model_idx : int, optional
         Not used by this method.

     Returns
     -------
     scaler : OrderedDict
         Union of all models' scalers; later models overwrite earlier
         ones on key clashes.
     """
     merged = OrderedDict()
     per_model = (member.get_lr_scalers() for member in self.models)
     for contribution in per_model:
         merged.update(contribution)
     return merged

コード例 #43

0

ファイルを表示

ファイル: learning_rule.py プロジェクト: nitbix/pylearn2

class DROP_RPROP(LearningRule):
    """
    RPROP-style learning rule with optional per-layer masks.

    Every parameter element keeps its own step size, which is multiplied
    by `increase_rate` or `decrease_rate` depending on the sign of the
    product of the current and the previous gradient, and is clipped to
    `[min_rate, max_rate]`.

    Parameters
    ----------
    decrease_rate : float, optional
        Step-size multiplier applied when the gradient changes sign.
        Must be below 1.
    increase_rate : float, optional
        Step-size multiplier applied when the gradient keeps its sign.
        Must be above 1.
    min_rate : float, optional
        Lower clipping bound for the step size.
    max_rate : float, optional
        Upper clipping bound for the step size.
    """

    def __init__(
        self,
        decrease_rate=0.5,
        increase_rate=1.2,
        min_rate=1e-6,
        max_rate=50
    ):
        assert increase_rate > 1.
        assert decrease_rate < 1.
        self.decrease_rate = sharedX(decrease_rate, 'decrease_rate')
        self.increase_rate = sharedX(increase_rate, 'increase_rate')
        self.min_rate = min_rate
        self.max_rate = max_rate
        # Filled by get_updates: maps each parameter to its zero counter.
        self.zeros = OrderedDict()

    def add_channels_to_monitor(self, monitor, monitoring_dataset):
        """
        Add channels for both rates and for the sum of every zero-gradient
        counter created by `get_updates`.
        """
        monitor.add_channel(
            'rprop_decrease_rate',
            ipt=None,
            val=self.decrease_rate,
            dataset=monitoring_dataset,
            data_specs=(NullSpace(), '')
        )
        monitor.add_channel(
            'rprop_increase_rate',
            ipt=None,
            val=self.increase_rate,
            dataset=monitoring_dataset,
            data_specs=(NullSpace(), '')
        )
        for zero in self.zeros.values():
            monitor.add_channel(
                zero.name,
                ipt=None,
                val=T.sum(zero),
                dataset=monitoring_dataset,
                data_specs=(NullSpace(), '')
            )

    def get_updates(self, learning_rate, grads, lr_scalers=None,
                    global_error=None, masks=None):
        """
        Build the symbolic updates for one step of the rule.

        Parameters
        ----------
        learning_rate : shared variable
            Its current value (`get_value()`) is the initial step size of
            parameters that have no entry in `lr_scalers`.
        grads : dict
            Maps parameters to their gradients.
        lr_scalers : dict, optional
            Maps parameters to their initial step size. None is treated
            as an empty dictionary.
        global_error : optional
            Not used by this method.
        masks : dict, optional
            Maps layer names to masks. A parameter named `<layer>_W` whose
            layer has a mask only refreshes its stored previous gradient
            where the masked gradient is positive. None is treated as an
            empty dictionary.

        Returns
        -------
        updates : OrderedDict
            Updates for each parameter and for its step size, previous
            gradient and zero-gradient counter.
        """
        # Both arguments default to None in the signature, so None has to
        # be a usable value rather than crash on `.get` / `in`.
        if lr_scalers is None:
            lr_scalers = {}
        if masks is None:
            masks = {}

        updates = OrderedDict()

        # Plain key iteration works on Python 2 and 3 alike.
        for param in grads:
            grad = grads[param]

            # Create required shared variables
            lr = lr_scalers.get(param, learning_rate.get_value())
            delta = sharedX(
                np.zeros_like(param.get_value()) + lr,
                borrow=True
            )
            previous_grad = sharedX(
                np.zeros_like(param.get_value()),
                borrow=True
            )
            zeros = sharedX(
                np.zeros_like(param.get_value()),
                borrow=True
            )

            # Unnamed parameters cannot belong to a masked layer; the
            # regular expressions below would raise TypeError on None.
            masked_grad = 1.
            if param.name is not None and re.match(r'.*_W$', param.name):
                layer_name = re.sub('_W$', '', param.name)
                if layer_name in masks:
                    mask = masks[layer_name]
                    masked_grad = T.gt(
                        T.dot(mask.T, T.dot(mask, grad)), 0.)

            # Name variables according to the parameter name
            if param.name is not None:
                delta.name = 'delta_' + param.name
                zeros.name = 'zeros_' + param.name
                previous_grad.name = 'previous_grad_' + param.name

            self.zeros[param] = zeros

            # Sign of this product tells whether the gradient flipped.
            temp = grad * previous_grad

            # New step size: unchanged where the gradient is zero or there
            # is no previous gradient, shrunk on a sign flip, grown
            # otherwise; clipped to [min_rate, max_rate].
            delta_inc = T.switch(
                T.neq(grad, 0.),
                T.clip(
                    T.switch(
                        T.eq(temp, 0.),
                        delta,
                        T.switch(
                            T.lt(temp, 0.),
                            delta * self.decrease_rate,
                            delta * self.increase_rate
                        )
                    ),
                    self.min_rate,
                    self.max_rate
                ),
                delta
            )

            # Stored gradient: refreshed only where the mask is active,
            # and only kept non-zero where the sign was preserved.
            previous_grad_inc = T.switch(
                T.gt(masked_grad, 0.),
                T.switch(
                    T.gt(temp, 0.),
                    grad,
                    0.
                ),
                previous_grad
            )

            # Calculate updates of parameters
            updated_inc = T.switch(
                T.neq(grad, 0.),
                - delta_inc * T.sgn(grad),
                0.
            )

            # Count, per element, how often the gradient was exactly zero.
            new_zeros = zeros + T.switch(T.neq(grad, 0.), 0, 1)

            # Compile the updates
            updates[param] = param + updated_inc
            updates[delta] = delta_inc
            updates[previous_grad] = previous_grad_inc
            updates[zeros] = new_zeros

        return updates

コード例 #44

0

ファイルを表示

ファイル: monitor.py プロジェクト: 123fengye741/pylearn2

class Monitor(object):
    """
    A class for monitoring Models while they are being trained.

    A monitor object records the number of minibatches and number of
    examples the model has trained, as well as any number of "channels"
    that track quantities of interest (examples: the objective
    function, measures of hidden unit activity, reconstruction error,
    sum of squared second derivatives, average norm of the weight
    vectors, etc.)

    Parameters
    ----------
    model : `pylearn2.models.model.Model`

    Attributes
    ----------
    on_channel_conflict : string
        `error` : this is a behavior when there is conlfict
            on creating a channel twice
        `copy_history` : this is a behavior when creating a
            new channel and transfering history of old_monitor
        `overwrite` : this is a behavior when creating a
            new channel without taking an account of old_monitor
    """

    def __init__(self, model):
        self.training_succeeded = False
        self.model = model
        self.channels = OrderedDict()
        self._num_batches_seen = 0
        self._examples_seen = 0
        self._epochs_seen = 0
        self._datasets = []
        self._iteration_mode = []
        self._batch_size = []
        self._num_batches = []
        self._dirty = True
        self._rng_seed = []
        self.names_to_del = ['theano_function_mode']
        self.t0 = time.time()
        self.theano_function_mode = None
        self.on_channel_conflict = 'error'

        # Initialize self._nested_data_specs, self._data_specs_mapping,
        # and self._flat_data_specs
        self._build_data_specs()

    def _build_data_specs(self):
        """
        Recompute the data specs describing everything the monitor needs.

        The model's monitoring data specs come first, followed by the data
        specs of every channel. Stores the nested specs, the mapping that
        flattens them, and the flattened specs on the instance.
        """
        # Ask the model what it needs
        model_space, model_source = self.model.get_monitoring_data_specs()
        spaces = [model_space]
        sources = [model_source]

        # Then append what each channel needs.
        for channel in self.channels.values():
            channel_space = channel.data_specs[0]
            assert isinstance(channel_space, Space)
            spaces.append(channel_space)
            sources.append(channel.data_specs[1])

        nested_space = CompositeSpace(spaces)
        nested_source = tuple(sources)
        self._nested_data_specs = (nested_space, nested_source)

        mapping = DataSpecsMapping(self._nested_data_specs)
        self._data_specs_mapping = mapping

        flat_space = mapping.flatten(nested_space, return_tuple=True)
        flat_source = mapping.flatten(nested_source, return_tuple=True)
        self._flat_data_specs = (CompositeSpace(flat_space), flat_source)

    def set_theano_function_mode(self, mode):
        """
        .. todo::

            WRITEME

        Parameters
        ----------
        mode : theano.compile.Mode
            Theano functions for the monitoring channels will be
            compiled and run using this mode.
        """
        if self.theano_function_mode != mode:
            self._dirty = True
            self.theano_function_mode = mode

    def add_dataset(self, dataset, mode='sequential', batch_size=None,
                    num_batches=None, seed=None):
        """
        Determines the data used to calculate the values of each channel.

        Parameters
        ----------
        dataset : object
            A `pylearn2.datasets.Dataset` object.
        mode : str or object, optional
            Iteration mode; see the docstring of the `iterator` method
            on `pylearn2.datasets.Dataset` for details.
        batch_size : int, optional
            The size of an individual batch. Optional if `mode` is
            'sequential' and `num_batches` is specified (batch size
            will be calculated based on full dataset size).
        num_batches : int, optional
            The total number of batches. Unnecessary if `mode` is
            'sequential' and `batch_size` is specified (number of
            batches will be calculated based on full dataset size).
        seed : int, optional
            Optional. The seed to be used for random iteration modes.
        """
        # The user can ommit using lists if only one dataset is set
        if not isinstance(dataset, list):
            dataset = [dataset]
        if not isinstance(mode, list):
            mode = [mode]
        if not isinstance(batch_size, list):
            batch_size = [batch_size]
        if not isinstance(num_batches, list):
            num_batches = [num_batches]
        if seed is None:
            seed = [None] * len(dataset)
        if not isinstance(seed, list):
            seed = [seed]
        if len(mode) != len(dataset):
            raise ValueError("Received " + str(len(dataset)) +
                             " dataset but " + str(len(mode)) + " modes.")
        if any([len(l) != len(dataset) for l in [batch_size, seed]]):
            raise ValueError("make sure each dataset has its iteration " +
                             "batch size and number of batches.")
        for (d, m, b, n, sd) in safe_izip(dataset, mode, batch_size,
                                          num_batches, seed):
            try:
                it = d.iterator(mode=m,
                                batch_size=b,
                                num_batches=n,
                                data_specs=self._flat_data_specs,
                                return_tuple=True,
                                rng=sd)
            except ValueError as exc:
                reraise_as(ValueError("invalid iteration parameters in " +
                                      "Monitor.add_dataset: " + str(exc)))
            if it.stochastic:
                # Must be a seed, not a random number generator. If it were a
                # random number generator, different iterators using it would
                # update its state, so we would not get the same iterator
                # each time. Also, must not be None, because this makes the
                # iterator pick a seed based on the clock
                if sd is None:
                    raise TypeError("Monitor requires a seed when using " +
                                    "stochastic iteration modes.")
                if not isinstance(sd, (list, tuple, int)):
                    raise TypeError("Monitor requires a seed (not a random " +
                                    "number generator) when using " +
                                    "stochastic iteration modes.")
            else:
                # The iterator should catch this, but let's double-check
                assert sd is None

            if d not in self._datasets:
                self._datasets.append(d)
                self._iteration_mode.append(m)
                self._batch_size.append(b)
                self._num_batches.append(n)
                self._rng_seed.append(sd)

    def __call__(self):
        """
        Runs the model on the monitoring dataset in order to add one
        data point to each of the channels.

        Recompiles the theano functions first when the monitor is dirty,
        feeds every registered dataset through its accumulation function,
        then appends the current time, progress counters and value to the
        records of each channel and logs them.

        Raises
        ------
        NotImplementedError
            If a registered dataset is a string.
        RuntimeError
            If a dataset yields a different number of examples than the
            count stored in `self.num_examples` for it.
        """

        # If the channels have changed at all, we need to recompile the theano
        # functions used to compute them
        if self._dirty:
            self.redo_theano()

        datasets = self._datasets

        # Set all channels' val_shared to 0
        self.begin_record_entry()
        # One entry per dataset: iteration settings, the accumulation
        # function `a`, the seed and the expected example count `ne`.
        for d, i, b, n, a, sd, ne in safe_izip(datasets,
                                               self._iteration_mode,
                                               self._batch_size,
                                               self._num_batches,
                                               self.accum,
                                               self._rng_seed,
                                               self.num_examples):
            # Datasets given as strings are not supported: the load below
            # is immediately followed by an unconditional raise.
            if isinstance(d, six.string_types):
                d = yaml_parse.load(d)
                raise NotImplementedError()

            # need to put d back into self._datasets
            myiterator = d.iterator(mode=i,
                                    batch_size=b,
                                    num_batches=n,
                                    data_specs=self._flat_data_specs,
                                    return_tuple=True,
                                    rng=sd)

            # If self._flat_data_specs is empty, no channel needs data,
            # so we do not need to call the iterator in order to average
            # the monitored values across different batches, we only
            # have to call them once.
            if len(self._flat_data_specs[1]) == 0:
                X = ()
                self.run_prereqs(X, d)
                a(*X)

            else:
                # Count the examples actually served so the total can be
                # checked against the expected count.
                actual_ne = 0
                for X in myiterator:
                    # X is a flat (not nested) tuple
                    self.run_prereqs(X, d)
                    a(*X)
                    actual_ne += self._flat_data_specs[0].np_batch_size(X)
                # end for X
                if actual_ne != ne:
                    raise RuntimeError("At compile time, your iterator said "
                                       "it had %d examples total, but at "
                                       "runtime it gave us %d." %
                                       (ne, actual_ne))
        # end for d

        log.info("Monitoring step:")
        log.info("\tEpochs seen: %d" % self._epochs_seen)
        log.info("\tBatches seen: %d" % self._num_batches_seen)
        log.info("\tExamples seen: %d" % self._examples_seen)
        # Seconds elapsed since the monitor was constructed.
        t = time.time() - self.t0
        for channel_name in sorted(self.channels.keys(),
                                   key=number_aware_alphabetical_key):
            channel = self.channels[channel_name]
            channel.time_record.append(t)
            channel.batch_record.append(self._num_batches_seen)
            channel.example_record.append(self._examples_seen)
            channel.epoch_record.append(self._epochs_seen)
            val = channel.val_shared.get_value()
            channel.val_record.append(val)
            # TODO: use logging infrastructure so that user can configure
            # formatting
            # Large magnitudes are logged in scientific notation.
            if abs(val) < 1e4:
                val_str = str(val)
            else:
                val_str = '%.3e' % val

            log.info("\t%s: %s" % (channel_name, val_str))

    def run_prereqs(self, data, dataset):
        """
        Runs all "prerequistie functions" on a batch of data. Always
        called right before computing the monitoring channels on that
        batch.

        Parameters
        ----------
        data : tuple or Variable
            a member of the Space used as input to the monitoring
            functions
        dataset : Dataset
            the Dataset the data was drawn from
        """
        if dataset not in self.prereqs:
            return
        for prereq in self.prereqs[dataset]:
            prereq(*data)

    def get_batches_seen(self):
        """
        Reports how many batches the model has learned on so far.

        Returns
        -------
        batches_seen : int
            The running counter that `Monitor.report_batch` increments by
            one on every call; it is only meaningful if the learning code
            calls `report_batch` once per batch.
        """
        return self._num_batches_seen

    def get_epochs_seen(self):
        """
        Reports how many epochs have been recorded through
        `Monitor.report_epoch`.

        Returns
        -------
        epochs_seen : int
            The number of epochs the model has been trained on.
            One "epoch" is one pass through Dataset.iterator.
        """
        return self._epochs_seen

    def get_examples_seen(self):
        """
        Reports how many examples have been recorded through
        `Monitor.report_batch`.

        Returns
        -------
        examples_seen : int
            The number of examples the model has learned on (assuming
            that the learning code has been calling Monitor.report_batch
            correctly)
        """
        return self._examples_seen

    def report_batch(self, num_examples):
        """
        Records that the model has learned on one more batch.

        Must be called by the learning code after every minibatch so that
        the example and batch counters stay accurate.

        Parameters
        ----------
        num_examples : int
            The number of examples learned on in this minibatch.
        """
        # The example counter grows by the batch size, the batch counter by
        # exactly one.
        self._examples_seen += num_examples
        self._num_batches_seen += 1

    def report_epoch(self):
        """
        Records that the model has completed one more "epoch" of
        learning.

        One pass through Dataset.iterator is regarded as one epoch.
        """
        self._epochs_seen += 1

    def redo_theano(self):
        """
        Recompiles Theano functions used by this monitor.

        This is called any time we need to evaluate the channels and
        the channel definitions have changed since last we called it,
        or if the theano functions are unavailable for any other reason
        (first time they are needed after construction or
        deserialization, etc.)

        All channels attached to a given dataset are compiled as part of
        the same theano function so that the theano optimizations can
        eliminate subexpressions that are shared between multiple
        channels.

        The attributes `prereqs`, `begin_record_entry`, `num_examples`
        and `accum` are (re)built. Every attribute name that appears on
        the monitor during this call is passed to
        `register_names_to_del` so that it is dropped before pickling.

        Raises
        ------
        ValueError
            If a channel needs data but the iterator over its dataset
            reports zero examples, since averaging over the dataset
            would divide by zero.
        TypeError
            If the dtype of a channel's accumulator update differs from
            the dtype of the channel's shared variable.
        """
        self._dirty = False

        # Recompute the data specs, since the channels may have changed.
        self._build_data_specs()

        # Snapshot of the attribute names, compared against at the end to
        # find out which attributes this call created.
        init_names = dir(self)

        mode = self.theano_function_mode
        # Some modes carry a `record` object to which compilation steps are
        # written.
        recording = mode is not None and hasattr(mode, 'record')

        # Collect, per dataset, the prerequisites of all its channels,
        # keeping a single copy of each prerequisite.
        self.prereqs = OrderedDict()
        for channel in self.channels.values():
            if channel.prereqs is not None:
                dataset = channel.dataset
                if dataset not in self.prereqs:
                    self.prereqs[dataset] = []
                prereqs = self.prereqs[dataset]
                for prereq in channel.prereqs:
                    if prereq not in prereqs:
                        prereqs.append(prereq)

        # Function that resets every channel accumulator to zero.
        updates = OrderedDict()
        for channel in self.channels.values():
            updates[channel.val_shared] = np.cast[config.floatX](0.0)
        with log_timing(log, "compiling begin_record_entry"):
            self.begin_record_entry = function(
                inputs=[],
                updates=updates,
                mode=mode,
                name='Monitor.begin_record_entry'
            )

        # Get the appropriate kind of theano variable to represent the data
        # the model acts on
        batch_names = ['monitoring_%s' % s for s in self._flat_data_specs[1]]
        theano_args = self._flat_data_specs[0].make_theano_batch(batch_names)

        # Get a symbolic expression of the batch size
        # We do it here, rather than for each channel, because channels with an
        # empty data_specs do not use data, and are unable to extract the batch
        # size. The case where the whole data specs is empty is not supported.
        batch_size = self._flat_data_specs[0].batch_size(theano_args)

        # Also get a nested representation, for joint iteration
        # with each of channel.graph_input
        nested_theano_args = self._data_specs_mapping.nest(theano_args)
        if not isinstance(nested_theano_args, tuple):
            nested_theano_args = (nested_theano_args,)
        assert len(nested_theano_args) == (len(self.channels) + 1)

        log.info('Monitored channels: ')
        for key in sorted(self.channels.keys()):
            if recording:
                mode.record.handle_line('compiling monitor including ' +
                                        'channel ' + key + '\n')
            log.info('\t%s' % key)

        # Build one iterator per dataset only to learn how many examples it
        # will serve; that count is the denominator of the channel averages.
        iterators = []
        for cur_dataset, iter_mode, cur_num_batches, cur_batch_size in \
                safe_izip(self._datasets, self._iteration_mode,
                          self._num_batches, self._batch_size):
            iterators.append(
                cur_dataset.iterator(mode=iter_mode,
                                     num_batches=cur_num_batches,
                                     batch_size=cur_batch_size,
                                     data_specs=self._flat_data_specs,
                                     return_tuple=True))
        self.num_examples = [iterator.num_examples for iterator in iterators]

        # One givens dictionary and one updates dictionary per dataset.
        givens = [OrderedDict() for _ in self._datasets]
        updates = [OrderedDict() for _ in self._datasets]
        for i, channel in enumerate(self.channels.values()):
            index = self._datasets.index(channel.dataset)
            g = givens[index]
            cur_num_examples = self.num_examples[index]
            u = updates[index]

            # Flatten channel.graph_input and the appropriate part of
            # nested_theano_args, to iterate jointly over them.
            c_mapping = DataSpecsMapping(channel.data_specs)
            channel_inputs = c_mapping.flatten(channel.graph_input,
                                               return_tuple=True)
            inputs = c_mapping.flatten(nested_theano_args[i + 1],
                                       return_tuple=True)

            for (channel_X, X) in safe_izip(channel_inputs, inputs):
                assert channel_X not in g or g[channel_X] is X
                assert channel_X.type == X.type, (channel_X.type, X.type)
                g[channel_X] = X

            if batch_size == 0:
                # No channel does need any data, so there is not need to
                # average results, and we will call the accum functions only
                # once.
                # TODO: better handling of channels not needing data when
                # some other channels need data.
                assert len(self._flat_data_specs[1]) == 0
                val = channel.val
            else:
                # Guard the division below using the example count of the
                # dataset this channel is attached to.
                if cur_num_examples == 0:
                    raise ValueError("Iterating over 0 examples results in " +
                                     "divide by 0")
                # Each batch contributes its value weighted by the fraction
                # of the dataset it represents.
                val = T.cast(channel.val * T.cast(batch_size, 'float64')
                             / cur_num_examples, config.floatX)
            u[channel.val_shared] = channel.val_shared + val

        with log_timing(log, "Compiling accum"):
            # Check type of update expressions
            for up in updates:
                for key in up:
                    if key.dtype != up[key].dtype:
                        raise TypeError('Monitoring channel shared variable ' +
                                        key.name + ' has dtype ' + key.dtype +
                                        ' but is driven by an expression ' +
                                        'with type ' + up[key].dtype)

            self.accum = []
            for idx, packed in enumerate(safe_izip(givens, updates)):
                g, u = packed
                if recording:
                    for elem in g:
                        mode.record.handle_line('g key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('g val ' +
                                                var_descriptor(g[elem]) + '\n')
                    for elem in u:
                        mode.record.handle_line('u key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('u val ' +
                                                var_descriptor(u[elem]) + '\n')
                function_name = 'Monitor.accum[%d]' % idx
                if recording:
                    mode.record.handle_line('compiling supervised accum\n')
                # Some channels may not depend on the data, ie, they might just
                # monitor the model parameters, or some shared variable updated
                # by the training algorithm, so we need to ignore the unused
                # input error
                self.accum.append(function(theano_args,
                                           givens=g,
                                           updates=u,
                                           mode=mode,
                                           name=function_name))
            for a in self.accum:
                if recording:
                    for elem in a.maker.fgraph.outputs:
                        mode.record.handle_line('accum output ' +
                                                var_descriptor(elem) + '\n')
                log.info("graph size: %d" % len(a.maker.fgraph.toposort()))

        # Everything created above can be regenerated, so it must not be
        # pickled.
        final_names = dir(self)
        self.register_names_to_del([name for name in final_names
                                    if name not in init_names])

    def register_names_to_del(self, names):
        """
        Adds attribute names to the list of fields that must be removed
        from the monitor's state before pickling.

        Names that are already registered are ignored, so each name is
        stored at most once.

        Parameters
        ----------
        names : list
            A list of attribute names as strings.
        """
        for attr_name in names:
            if attr_name in self.names_to_del:
                continue
            self.names_to_del.append(attr_name)

    def __getstate__(self):
        """
        Builds the state dictionary used to pickle the monitor.

        In order to avoid pickling a copy of the dataset whenever a
        monitor is saved, the datasets are replaced in the returned
        state by their yaml source. This is not a perfect solution
        because it won't work with job resuming, which would require
        saving the state of the dataset's random number generator.

        Like in the Model class, we also need to avoid saving any
        Theano functions, so we delete everything that can be
        regenerated with `redo_theano` by deleting the fields in
        `self.names_to_del`

        The live monitor keeps its dataset objects: only the returned
        dictionary holds the yaml sources, so an error raised while
        building the state cannot leave the monitor half-converted.

        Returns
        -------
        d : dict
            A shallow copy of the instance dictionary in which
            `_datasets` holds yaml strings and the fields listed in
            `self.names_to_del` are removed. A dataset that is neither a
            string nor has a `yaml_src` attribute is left out, with a
            warning.
        """

        # Patch old pickled monitors
        if not hasattr(self, '_datasets'):
            self._datasets = [self._dataset]
            del self._dataset

        state = copy.copy(self.__dict__)

        if self._datasets:
            serialized = []
            for dataset in self._datasets:
                if isinstance(dataset, six.string_types):
                    # Already a yaml source.
                    serialized.append(dataset)
                else:
                    try:
                        serialized.append(dataset.yaml_src)
                    except AttributeError:
                        warnings.warn('Trained model saved without ' +
                                      'indicating yaml_src')
            # Only the copy is modified; `self._datasets` is left untouched.
            state['_datasets'] = serialized

        for name in self.names_to_del:
            if name in state:
                del state[name]

        return state

    def __setstate__(self, d):
        """
        Restores the monitor from the state dictionary `d`.

        Parameters
        ----------
        d : dict
            A dictionary mapping string names of fields to values for
            these fields. If it contains the legacy `_dataset` key, the
            dictionary is modified in place: the key is replaced by
            `_datasets`, a one-element list.
        """
        # Old pkl files stored a single dataset under '_dataset'.
        if '_dataset' in d:
            d['_datasets'] = [d.pop('_dataset')]

        self.__dict__.update(d)

    def add_channel(self, name, ipt, val, dataset=None, prereqs=None,
                    data_specs=None):
        """
        Asks the monitor to start tracking a new value.  Can be called
        even after the monitor is already in use.

        Parameters
        ----------
        name : str
            The display name in the monitor.
        ipt : tensor_like
            The symbolic tensor which should be clamped to the data.
            (or a list/tuple containing symbolic tensors, following the
            data_specs)
        val : tensor_like
            The value (function of `ipt`) to be tracked. Plain Python
            numbers are cast to `theano.config.floatX` first.
        dataset : pylearn2.datasets.Dataset
            Which dataset to compute this channel on. May be omitted when
            the monitor has exactly one dataset.
        prereqs : list of callables that take a list of numpy tensors
            Each prereq must be called exactly once per each new batch
            of data drawn *from dataset* before the channel value is
            computed if two channels provide a prereq with exactly the
            same id, that prereq will only be called once
        data_specs : (space, source) pair
            Identifies the order, format and semantics of ipt. When
            omitted, a warning is issued and a default is inferred from
            `ipt` and `dataset`.

        Raises
        ------
        ValueError
            If default data_specs cannot be inferred, if `val` depends on
            a symbolic input that is not part of `ipt`, if the dataset is
            missing, ambiguous or unknown to the monitor, if
            `self.on_channel_conflict` has an unsupported value, or if
            the channel already exists and `self.on_channel_conflict` is
            'error'.
        """
        if six.PY3:
            numeric = (float, int)
        else:
            numeric = (float, int, long)  # noqa

        if isinstance(val, numeric):
            val = np.cast[theano.config.floatX](val)

        val = T.as_tensor_variable(val)

        if data_specs is None:
            warnings.warn("parameter 'data_specs' should be provided when " +
                          "calling add_channel. We will build a default one.",
                          stacklevel=2)
            if isinstance(ipt, list):
                ipt = tuple(ipt)
            if ipt is not None and not isinstance(ipt, tuple):
                ipt = (ipt,)

            if ipt is None:
                data_specs = (NullSpace(), '')
            elif len(ipt) == 0:
                data_specs = (CompositeSpace([]), ())
            elif hasattr(dataset, 'get_data_specs'):
                dataset_space, dataset_source = dataset.get_data_specs()
                if (len(ipt) == 1 and
                        dataset_source is not None and
                        (not isinstance(dataset_source, tuple) or
                            len(dataset_source) == 1) and
                        'features' in dataset_source):
                    data_specs = (dataset_space, dataset_source)
                elif (len(ipt) == 2 and
                        dataset_source == ('features', 'targets')):
                    data_specs = (dataset_space, dataset_source)
                else:
                    raise ValueError("Cannot infer default data_specs for " +
                                     "the following input points and " +
                                     "dataset: ipt = %s, dataset = %s"
                                     % (ipt, dataset))
            else:
                # The dataset (possibly None at this point) cannot describe
                # its own data, so nothing can be inferred. Fail with an
                # explicit message instead of indexing into None below.
                raise ValueError("Cannot infer default data_specs for " +
                                 "the following input points and " +
                                 "dataset: ipt = %s, dataset = %s"
                                 % (ipt, dataset))

        data_specs[0].validate(ipt)

        mapping = DataSpecsMapping(data_specs)
        flat_ipt = mapping.flatten(ipt)
        if not isinstance(flat_ipt, tuple):
            flat_ipt = (flat_ipt,)

        # Every non-shared, non-constant input of the graph computing `val`
        # must be one of the declared inputs.
        inputs = theano.gof.graph.inputs([val])
        for elem in inputs:
            if not hasattr(elem, 'get_value') and \
               not isinstance(elem, theano.gof.graph.Constant):
                if elem not in flat_ipt:
                    raise ValueError("Unspecified input: " + str(elem) +
                                     ". This may be due to an incorrect " +
                                     "implementation of a cost's " +
                                     "get_data_specs() method, or of a " +
                                     "model's get_monitoring_data_specs() " +
                                     "method.")

        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            mode.record.handle_line('Adding monitor channel '+name+'\n')
            assert isinstance(flat_ipt, tuple)
            if len(flat_ipt) != 1:
                for elem in flat_ipt:
                    mode.record.handle_line('Includes input var ' +
                                            var_descriptor(elem) + '\n')
            else:
                mode.record.handle_line(name + ' input var is ' +
                                        var_descriptor(flat_ipt[0]) + '\n')
            mode.record.handle_line('channel ' + name + ' is ' +
                                    var_descriptor(val) + '\n')

        # Default to the monitor's only dataset when none is given.
        if dataset is None:
            if len(self._datasets) == 1:
                dataset = self._datasets[0]
            elif len(self._datasets) == 0:
                raise ValueError(_err_no_data)
            else:
                raise ValueError(_err_ambig_data)

        try:
            self._datasets.index(dataset)
        except ValueError:
            reraise_as(ValueError("The dataset specified is not one of the " +
                                  "monitor's datasets"))

        if ((self.on_channel_conflict not in
             ('error', 'copy_history', 'overwrite'))):
            raise ValueError("on_channel_conflict should be either 'error', " +
                             "'copy_history', or 'overwrite'")

        if name in self.channels and self.on_channel_conflict == 'error':
            raise ValueError("Tried to create the same channel twice (%s)" %
                             name)
        elif ((name in self.channels and
               self.on_channel_conflict == 'copy_history')):
            # The previous channel is handed to the new one so that its
            # history can be carried over.
            self.channels[name] = MonitorChannel(ipt, val, name, data_specs,
                                                 dataset, prereqs,
                                                 self.channels[name])
        elif ((name not in self.channels or
               self.on_channel_conflict == 'overwrite')):
            self.channels[name] = MonitorChannel(ipt, val, name, data_specs,
                                                 dataset, prereqs)
        # Force a recompilation of the theano functions before next use.
        self._dirty = True

    def _sanity_check(self):
        """
        Asserts that every channel of the monitor has a `prereqs`
        attribute.

        Models are sometimes serialized and loaded somewhere else while
        their Monitor is still used, and the Monitor then ends up in a
        mangled state. Calls to this method try to catch that situation.
        There is no long term fix yet; it probably requires making theano
        graphs serializable first.
        """
        for channel in self.channels.values():
            assert hasattr(channel, 'prereqs')

    @classmethod
    def get_monitor(cls, model):
        """
        Fetches the monitor attached to `model`, creating and attaching
        a new one when the model has none yet.

        An already attached monitor is sanity-checked before being
        returned.

        Parameters
        ----------
        model : object
            An object that implements the `Model` interface specified
            in `pylearn2.models`.

        Returns
        -------
        rval : Monitor
            The monitor stored in `model.monitor`.
        """
        if not hasattr(model, 'monitor'):
            fresh = Monitor(model)
            model.monitor = fresh
            return fresh

        existing = model.monitor
        existing._sanity_check()
        return existing

    # TODO: find out if this method is used anywhere, remove if not.
    @property
    def batch_size(self):
        """
        Read-only access to the batch size setting used for monitoring.

        Returns
        -------
        batch_size : object
            The value stored in `self._batch_size`, returned as is.
            NOTE(review): presumably a sequence with one entry per
            monitoring dataset rather than a single int -- confirm.
        """
        return self._batch_size

    # TODO: find out if this method is used anywhere, remove if not.
    @property
    def num_batches(self):
        """
        Read-only access to the number-of-batches setting used for
        monitoring.

        Returns
        -------
        num_batches : object
            The value stored in `self._num_batches`, returned as is.
            NOTE(review): presumably a sequence with one entry per
            monitoring dataset rather than a single int -- confirm.
        """
        return self._num_batches

    def setup(self, dataset, cost, batch_size, num_batches=None,
              extra_costs=None, mode='sequential', obj_prereqs=None,
              cost_monitoring_args=None):
        """
        Sets up the monitor for a cost minimization problem.
        Adds channels defined by both the model and the cost for
        the specified dataset(s), as well as a channel called
        'objective' defined by the costs' __call__ method.

        Parameters
        ----------
        dataset : pylearn2.datasets.Dataset
            Dataset or dictionary mapping string names to Datasets.
            If string names are used, then for every dataset, each
            channel defined by the model or cost will be replicated
            with that dataset's name followed by an underscore as the
            prefix. For example, if your cost defines a channel called
            'misclass', and datasets is
            {'train' : train_dataset, 'valid' : valid_dataset},
            you will get channels called 'train_misclass' and
            'valid_misclass'.
        cost : pylearn2.costs.Cost
            The cost being optimized by training. The value of the cost
            will appear as the `objective` channel. Its
            `get_monitoring_channels` method will also be used to
            supply other channels.
        extra_costs : OrderedDict, optional
            A dictionary mapping channel names to Cost objects.
            Their value will appear as the specified channel name.
            They will also provide more monitoring channels via their
            `get_monitoring_channels` method.
        obj_prereqs : None, or list of functions
            Functions to pass as prerequisites to the `objective` channel.
        cost_monitoring_args : dict
            Dictionary of kwargs that will be passed to
            `cost.get_monitoring_channels()`
            (but not for the extra_costs).
        """

        if dataset is None:
            return
        if isinstance(dataset, Dataset):
            dataset = {'': dataset}
        else:
            assert isinstance(dataset, dict)
            assert all(isinstance(key, str) for key in dataset)
            assert all(isinstance(dataset[key], Dataset) for key in dataset)

        if extra_costs is None:
            costs = {}
        else:
            assert isinstance(extra_costs, (OrderedDict, dict))
            costs = extra_costs
        assert '' not in costs
        costs[''] = cost

        if cost_monitoring_args is None:
            cost_monitoring_args = {}

        model = self.model

        # Build a composite data_specs containing the specs for all costs,
        # then the specs of the model
        cost_names = sorted(costs.keys())
        spaces = []
        sources = []
        for c in cost_names:
            c_space, c_source = costs[c].get_data_specs(model)
            spaces.append(c_space)
            sources.append(c_source)

        # Ask the model for the data_specs needed
        m_space, m_source = model.get_monitoring_data_specs()
        spaces.append(m_space)
        sources.append(m_source)

        nested_space = CompositeSpace(spaces)
        nested_sources = tuple(sources)

        # Flatten this data_specs, so we build only one symbolic Theano
        # variable for each of the unique (space, source) pairs.
        mapping = DataSpecsMapping((nested_space, nested_sources))
        space_tuple = mapping.flatten(nested_space, return_tuple=True)
        source_tuple = mapping.flatten(nested_sources, return_tuple=True)
        ipt = tuple(space.make_theano_batch(name='monitor_%s' % source,
                                            batch_size=None)
                    for (space, source) in safe_zip(space_tuple, source_tuple))

        # Build a nested tuple from ipt, to dispatch the appropriate parts
        # of the ipt batch to each cost
        nested_ipt = mapping.nest(ipt)

        custom_channels = {}
        for i, cost_name in enumerate(cost_names):
            if cost_name == '':
                prefix = ''
            else:
                prefix = cost_name + '_'
            cost = costs[cost_name]
            cost_ipt = nested_ipt[i]
            raw_channels = cost.get_monitoring_channels(model, cost_ipt)
            channels = {}
            for name in raw_channels:
                # We need three things: the value itself (raw_channels[name]),
                # the input variables (cost_ipt), and the data_specs for
                # these input variables ((spaces[i], sources[i]))
                channels[prefix + name] = (raw_channels[name],
                                           cost_ipt,
                                           (spaces[i], sources[i]))
            custom_channels.update(channels)

        # Use the last inputs from nested_ipt for the model
        model_channels = model.get_monitoring_channels(nested_ipt[-1])
        channels = {}
        for name in model_channels:
            # Note: some code used to consider that model_channels[name]
            # could be a a (channel, prereqs) pair, this is not supported.
            channels[name] = (model_channels[name],
                              nested_ipt[-1],
                              (spaces[-1], sources[-1]))
        custom_channels.update(channels)

        if is_stochastic(mode):
            seed = [[2013, 2, 22]]
        else:
            seed = None

        for dataset_name in dataset:
            cur_dataset = dataset[dataset_name]
            self.add_dataset(dataset=cur_dataset,
                             mode=mode,
                             batch_size=batch_size,
                             num_batches=num_batches,
                             seed=seed)
            if dataset_name == '':
                dprefix = ''
            else:
                dprefix = dataset_name + '_'
            # These channel name 'objective' must not vary, since callbacks
            # that respond to the values in the monitor use the name to find
            # it.
            for i, cost_name in enumerate(cost_names):
                cost = costs[cost_name]
                cost_ipt = nested_ipt[i]
                cost_value = cost.expr(model, cost_ipt)
                if cost_value is not None:
                    if cost_name == '':
                        name = dprefix + 'objective'
                        prereqs = obj_prereqs
                    else:
                        name = dprefix + cost_name
                        prereqs = None

                    cost.get_data_specs(model)[0].validate(cost_ipt)
                    self.add_channel(name=name,
                                     ipt=cost_ipt,
                                     val=cost_value,
                                     data_specs=cost.get_data_specs(model),
                                     dataset=cur_dataset,
                                     prereqs=prereqs)

            for key in custom_channels:
                val, ipt, data_specs = custom_channels[key]
                data_specs[0].validate(ipt)
                self.add_channel(name=dprefix + key,
                                 ipt=ipt,
                                 val=val,
                                 data_specs=data_specs,
                                 dataset=cur_dataset)

コード例 #45

0

ファイルを表示

ファイル: learning_rule.py プロジェクト: nitbix/pylearn2

 def __init__(self, decay=0.9, max_scaling=1e5):
     """
     Validates the hyper-parameters and initializes the rule's state.

     Parameters
     ----------
     decay : float, optional
         Must satisfy 0 <= decay < 1. Stored as a shared variable
         named 'decay'.
     max_scaling : float, optional
         Must be strictly positive. Its inverse is stored as
         `self.epsilon`.
     """
     assert 0. <= decay < 1.
     assert max_scaling > 0
     self.decay = sharedX(decay, 'decay')
     self.epsilon = 1. / max_scaling
     # Starts empty; NOTE(review): presumably filled per parameter when the
     # updates are built -- confirm against the rest of the class.
     self.mean_square_grads = OrderedDict()

コード例 #46

0

ファイルを表示

ファイル: learning_rule.py プロジェクト: nitbix/pylearn2

class DRPROP(LearningRule):
    """
    Sign-based (RPROP-style) learning rule with one step size per
    parameter element.

    For every parameter the rule keeps three shared variables of the same
    shape as the parameter:

    * ``delta``: the current step size of each element,
    * ``previous_grad``: the gradient stored by the previous update,
    * ``epsilons``: a counter of consecutive updates in which the
      absolute gradient of the element stayed below
      `switching_threshold` (clipped to the range [0, 10]).

    Parameters
    ----------
    decrease_rate : float, optional
        Factor (< 1) applied to a step size when the current gradient and
        the stored previous gradient have opposite signs.
    increase_rate : float, optional
        Factor (> 1) applied to a step size when they have the same sign.
    min_rate : float, optional
        Lower clipping bound for the step sizes.
    max_rate : float, optional
        Upper clipping bound for the step sizes.
    switching_threshold : float, optional
        Elements whose absolute gradient is below this value increment
        their `epsilons` counter; all other elements reset it to zero.

    Notes
    -----
    `gt_epsilons`, `lt_epsilons` and `eq_epsilons` are created empty and
    are never filled by this class.
    """

    def __init__(
        self,
        decrease_rate=0.5,
        increase_rate=1.2,
        min_rate=1e-6,
        max_rate=50,
        switching_threshold=1e-6
    ):
        assert increase_rate > 1.
        assert decrease_rate < 1.
        self.decrease_rate = sharedX(decrease_rate, 'decrease_rate')
        self.increase_rate = sharedX(increase_rate, 'increase_rate')
        self.min_rate = min_rate
        self.max_rate = max_rate
        self.switching_threshold = switching_threshold
        # Maps each parameter to its counter variable; filled by
        # get_updates.
        self.epsilons = OrderedDict()
        self.gt_epsilons = OrderedDict()
        self.lt_epsilons = OrderedDict()
        self.eq_epsilons = OrderedDict()

    def add_channels_to_monitor(self, monitor, monitoring_dataset):
        """
        Register monitoring channels for the two rates and, for every
        counter variable currently held in `self.epsilons`, its sum,
        minimum and maximum.

        Parameters
        ----------
        monitor : object
            Must provide `add_channel(name, ipt=, val=, dataset=,
            data_specs=)`.
        monitoring_dataset : object
            Passed through unchanged as the `dataset` of every channel.

        Notes
        -----
        `self.epsilons` is only filled by `get_updates`, so counters
        created after this call get no channel.
        """
        monitor.add_channel(
            'rprop_decrease_rate',
            ipt=None,
            val=self.decrease_rate,
            dataset=monitoring_dataset,
            data_specs=(NullSpace(), '')
        )
        monitor.add_channel(
            'rprop_increase_rate',
            ipt=None,
            val=self.increase_rate,
            dataset=monitoring_dataset,
            data_specs=(NullSpace(), '')
        )

        # Same three channels, in the same order, for every counter.
        reductions = (('_sum', T.sum), ('_min', T.min), ('_max', T.max))
        for epsilon in self.epsilons.values():
            # NOTE(review): epsilon.name is only assigned in get_updates
            # when the parameter itself has a name; an unnamed parameter
            # would presumably make this concatenation fail -- confirm.
            for suffix, reduce_fn in reductions:
                monitor.add_channel(
                    epsilon.name + suffix,
                    ipt=None,
                    val=reduce_fn(epsilon),
                    dataset=monitoring_dataset,
                    data_specs=(NullSpace(), '')
                )

    def get_updates(self, learning_rate, grads, lr_scalers=None,
                    global_error=None, dropout_mask=None):
        """
        Build the update expressions for every parameter in `grads`.

        Parameters
        ----------
        learning_rate : shared variable
            Only its current value (`get_value()`) is used, as the
            initial step size of parameters without an entry in
            `lr_scalers`.
        grads : OrderedDict
            Maps each parameter to its gradient expression.
        lr_scalers : dict, optional
            Maps a parameter to the value used as the *initial* step size
            of all its elements (it replaces the learning rate, it does
            not multiply it). None is treated as an empty mapping.
        global_error : object, optional
            Accepted but not used.
        dropout_mask : object, optional
            Accepted but not used.

        Returns
        -------
        updates : OrderedDict
            For every parameter: the new parameter value plus the new
            values of its step-size, previous-gradient and counter
            variables.
        """
        if lr_scalers is None:
            # The signature advertises None as the default; treat it as
            # "no per-parameter overrides" instead of failing on .get().
            lr_scalers = {}

        updates = OrderedDict()

        # .items() works on both Python 2 and 3, unlike .iteritems().
        for param, grad in grads.items():
            # Create the required shared variables. Each one gets its own
            # freshly allocated array because they are borrowed.
            initial_step = lr_scalers.get(param, learning_rate.get_value())
            delta = sharedX(
                np.zeros_like(param.get_value()) + initial_step,
                borrow=True
            )
            previous_grad = sharedX(
                np.zeros_like(param.get_value()),
                borrow=True
            )
            epsilons = sharedX(
                np.zeros_like(param.get_value()),
                borrow=True
            )

            # Name variables according to the parameter name
            if param.name is not None:
                delta.name = 'delta_' + param.name
                epsilons.name = 'epsilons_' + param.name
                previous_grad.name = 'previous_grad_' + param.name

            self.epsilons[param] = epsilons

            # Positive where the gradient kept its sign, negative where it
            # flipped, zero where either gradient is zero.
            temp = grad * previous_grad

            # Count consecutive below-threshold gradients, capped at 10;
            # any gradient at or above the threshold resets the count.
            new_epsilons = T.clip(
                T.switch(
                    T.lt(T.abs_(grad), self.switching_threshold),
                    epsilons + 1.,
                    0.
                ),
                0.,
                10
            )

            # Step sizes: untouched where the gradient is exactly zero;
            # otherwise kept, shrunk or grown according to the sign of
            # `temp` and clipped to [min_rate, max_rate].
            delta_inc = T.switch(
                T.neq(grad, 0.),
                T.clip(
                    T.switch(
                        T.eq(temp, 0.),
                        delta,
                        T.switch(
                            T.lt(temp, 0.),
                            delta * self.decrease_rate,
                            delta * self.increase_rate
                        )
                    ),
                    self.min_rate,
                    self.max_rate
                ),
                delta
            )

            # Stored gradient: kept where the gradient is zero; otherwise
            # the current gradient where the sign was preserved and zero
            # everywhere else.
            previous_grad_inc = T.switch(
                T.neq(grad, 0.),
                T.switch(
                    T.gt(temp, 0.),
                    grad,
                    T.zeros_like(grad)
                ),
                previous_grad
            )

            unscaled_update = - delta_inc * T.sgn(grad)

            # Calculate updates of parameters. While the counter is zero
            # the plain signed step is used. Otherwise the step is divided
            # by 2 ** (counter + 1), and its direction is reversed where
            # the gradient magnitude grew compared to the stored one.
            updated_inc = T.switch(
                T.lt(new_epsilons, 0.1),
                unscaled_update,
                T.switch(
                    T.gt(T.abs_(grad), T.abs_(previous_grad)),
                    - unscaled_update / (2 ** (new_epsilons + 1.)),
                    unscaled_update / (2 ** (new_epsilons + 1.))
                )
            )

            # Compile the updates
            updates[param] = param + updated_inc
            updates[delta] = delta_inc
            updates[previous_grad] = previous_grad_inc
            updates[epsilons] = new_epsilons

        return updates

コード例 #47

0

ファイルを表示

ファイル: batch_gradient_descent.py プロジェクト: 123fengye741/pylearn2

    def __init__(self, objective, params, inputs=None,
                 param_constrainers=None, max_iter=-1,
                 lr_scalers=None, verbose=0, tol=None,
                 init_alpha=None, min_init_alpha=1e-3,
                 reset_alpha=True, conjugate=False,
                 reset_conjugate=True, gradients=None,
                 gradient_updates=None, line_search_mode=None,
                 accumulate=False, theano_function_mode=None):
        """
        Build every compiled function the optimizer needs.

        All constructor arguments are first copied onto the instance as
        attributes of the same name. Functions are compiled with
        `function` (presumably theano.function -- confirm against the
        file's imports) or, when `accumulate` is set, with `Accumulator`.

        Parameters
        ----------
        objective : symbolic expression
            The quantity to minimize. It is differentiated with `grad`
            and its `dtype` selects the default `tol`.
        params : iterable of shared variables
            The variables to optimize. Each must provide `get_value()`
            and `name`.
        inputs : list, optional
            Inputs of the compiled gradient and objective functions.
            Defaults to an empty list.
        param_constrainers : list or tuple of callables, optional
            Each one is called with the dictionary of parameter updates
            used by the line-search step, before that step is compiled.
        max_iter : int, optional
            Only stored as an attribute here.
        lr_scalers : dict, optional
            Maps a parameter to a multiplier applied to the step size
            `alpha` for that parameter.
        verbose : int or bool, optional
            If true, progress messages are logged.
        tol : float, optional
            If None, `self.tol` is 1e-6 for a float32 objective and 3e-7
            otherwise.
        init_alpha : sequence of float, optional
            Stored as a tuple of floats. If None, the default depends on
            `line_search_mode`.
        min_init_alpha, reset_alpha, reset_conjugate : optional
            Only stored as attributes here.
        conjugate : bool, optional
            If true, the functions supporting conjugate directions
            (`_store_old_grad`, `_make_conjugate`) are compiled as well.
        gradients : dict, optional
            Maps a parameter to the gradient expression to use for it
            instead of `grad(objective, param)`.
        gradient_updates : dict, optional
            Extra updates applied whenever the gradient is computed.
        line_search_mode : str, optional
            Either None or 'exhaustive'.
        accumulate : bool, optional
            If true, `Accumulator` is used for the gradient and objective
            functions.
        theano_function_mode : object, optional
            Passed as `mode` to every `function` call.
        """

        # Snapshot of the arguments: every name local at this point
        # becomes an attribute. Later rebinding of a local (e.g. `inputs`
        # below) does not change the attribute of the same name.
        self.__dict__.update(locals())
        del self.self

        # Default candidate step sizes depend on the line-search mode.
        if line_search_mode is None:
            if init_alpha is None:
                init_alpha = (.001, .005, .01, .05, .1)
        else:
            assert line_search_mode == 'exhaustive'
            if init_alpha is None:
                init_alpha = (.5, 1.)

        self.init_alpha = tuple([float(elem) for elem in init_alpha])

        # Only the local is replaced; self.inputs keeps the value that
        # was passed in.
        if inputs is None:
            inputs = []

        # Same here: self.param_constrainers keeps the passed value.
        if param_constrainers is None:
            param_constrainers = []

        obj = objective

        # Already set by the locals() snapshot above.
        self.verbose = verbose

        param_to_grad_sym = OrderedDict()
        param_to_grad_shared = OrderedDict()
        updates = OrderedDict()
        if self.gradient_updates is not None:
            updates.update(self.gradient_updates)

        self.params = [param for param in params]

        # For every parameter: pick its gradient expression and create a
        # zero-initialised shared variable of the same shape that the
        # compiled gradient function writes the gradient into.
        for param in params:
            if self.gradients is not None and param in self.gradients:
                g = self.gradients[param]
            else:
                g = grad(objective, param)
            param_to_grad_sym[param] = g
            if param.name is not None:
                param_name = param.name
            else:
                param_name = 'anon_param'
            grad_name = 'BatchGradientDescent.grad_' + param_name
            grad_shared = sharedX(param.get_value() * 0., name=grad_name)
            param_to_grad_shared[param] = grad_shared
            updates[grad_shared] = g

        self.param_to_grad_shared = param_to_grad_shared

        # Compile the function that stores the gradients in the shared
        # variables (it has no outputs, only updates).
        if self.verbose:
            logger.info('batch gradient class compiling gradient function')
        t1 = time.time()
        if self.accumulate:
            self._compute_grad = Accumulator(inputs, updates=updates)
        else:
            self._compute_grad = function(
                inputs,
                updates=updates,
                mode=self.theano_function_mode,
                name='BatchGradientDescent._compute_grad')
        if self.verbose:
            t2 = time.time()
            logger.info('done. Took {0}'.format(t2-t1))

        # Compile the function that evaluates the objective.
        if self.verbose:
            logger.info('batch gradient class compiling objective function')
        if self.accumulate:
            self.obj = Accumulator(inputs, obj)
        else:
            self.obj = function(inputs, obj, mode=self.theano_function_mode,
                                name='BatchGradientDescent.obj')

        if self.verbose:
            logger.info('done')

        # Line-search support: `_cache_values` copies the parameters into
        # per-parameter cache variables, and `_goto_alpha` sets every
        # parameter to (cached value - alpha * stored gradient).
        self.param_to_cache = OrderedDict()
        alpha = T.scalar(name='alpha')
        alpha.tag.test_value = np.cast[alpha.dtype](.01)
        cache_updates = OrderedDict()
        goto_updates = OrderedDict()
        for param in params:
            if param.name is None:
                param_name = 'anon_param'
            else:
                param_name = param.name
            cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name
            self.param_to_cache[param] = sharedX(param.get_value(borrow=False),
                                                 name=cache_name)
            cache_updates[self.param_to_cache[param]] = param
            cached = self.param_to_cache[param]
            g = self.param_to_grad_shared[param]
            # Optional per-parameter multiplier on the step size.
            if lr_scalers is not None and param in lr_scalers:
                scaled_alpha = alpha * lr_scalers[param]
            else:
                scaled_alpha = alpha
            mul = scaled_alpha * g
            diff = cached - mul
            goto_updates[param] = diff
        self._cache_values = function(
            [],
            updates=cache_updates,
            mode=self.theano_function_mode,
            name='BatchGradientDescent._cache_values')
        assert isinstance(param_constrainers, (list, tuple))
        # Each constrainer receives the update dictionary before the step
        # function is compiled from it.
        for param_constrainer in param_constrainers:
            param_constrainer(goto_updates)
        self._goto_alpha = function(
            [alpha],
            updates=goto_updates,
            mode=self.theano_function_mode,
            name='BatchGradientDescent._goto_alpha')

        # Euclidean norm of all stored gradients taken together.
        norm = T.sqrt(sum([T.sqr(elem).sum() for elem in
                           self.param_to_grad_shared.values()]))
        norm.name = 'BatchGradientDescent.norm'
        # `_normalize_grad` divides every stored gradient by that norm and
        # returns the norm.
        normalize_grad_updates = OrderedDict()
        for grad_shared in self.param_to_grad_shared.values():
            normalize_grad_updates[grad_shared] = grad_shared / norm

        # useful for monitoring
        # ave_grad_size is updated as a weighted average of the norm and
        # its previous value, with weight new_weight on the norm.
        self.ave_grad_size = sharedX(0.)
        self.new_weight = sharedX(1.)
        normalize_grad_updates[self.ave_grad_size] = \
            self.new_weight * norm + (1.-self.new_weight) * self.ave_grad_size

        self._normalize_grad = \
            function([],
                     norm,
                     updates=normalize_grad_updates,
                     mode=self.theano_function_mode,
                     name='BatchGradientDescent._normalize_grad')

        if self.conjugate:
            grad_shared = self.param_to_grad_shared.values()

            # One "old gradient" shared variable per stored gradient.
            grad_to_old_grad = OrderedDict()
            for elem in grad_shared:
                grad_to_old_grad[elem] = \
                    sharedX(elem.get_value(), 'old_'+elem.name)

            # Stores (stored gradient * norm) as the old gradient. The
            # norm expression itself is used as the function's input, so
            # the caller supplies its value.
            self._store_old_grad = \
                function([norm],
                         updates=OrderedDict([(grad_to_old_grad[g_], g_ * norm)
                                             for g_ in grad_to_old_grad]),
                         mode=self.theano_function_mode,
                         name='BatchGradientDescent._store_old_grad')

            grad_ordered = list(grad_to_old_grad.keys())
            old_grad_ordered = [grad_to_old_grad[g_] for g_ in grad_ordered]

            # Inner product over two equally long lists of tensors.
            def dot_product(x, y):
                return sum([(x_elem * y_elem).sum()
                           for x_elem, y_elem in safe_zip(x, y)])

            # (g.g - g.g_old) / (1e-7 + g_old.g_old); the 1e-7 keeps the
            # denominator away from zero.
            beta_pr = (dot_product(grad_ordered, grad_ordered) - dot_product(grad_ordered, old_grad_ordered)) / \
                (1e-7+dot_product(old_grad_ordered, old_grad_ordered))
            assert beta_pr.ndim == 0

            beta = T.maximum(beta_pr, 0.)

            # beta_pr is the Polak-Ribiere formula for beta.
            # According to wikipedia, the beta to use for NCG is "a matter of
            # heuristics or taste" but max(0, beta_pr) is "a popular choice...
            # which provides direction reset automatically." (ie, it is meant
            # to revert to steepest descent when you have traveled far enough
            # that the objective function is behaving non-quadratically enough
            # that the conjugate gradient formulas aren't working anymore)

            # http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method

            # NOTE(review): no local named `grad` is assigned in this
            # method, so this presumably tests the module-level gradient
            # function and can never fail -- confirm the intent.
            assert grad not in grad_to_old_grad

            # New direction: stored gradient + beta * old gradient.
            make_conjugate_updates = \
                [(g_, g_ + beta * grad_to_old_grad[g_]) for g_ in grad_ordered]

            # If the mode object exposes a `record`, describe every
            # variable and update expression in it before compiling.
            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                for v, u in make_conjugate_updates:
                    mode.record.handle_line(
                        'BatchGradientDescent._make_conjugate var '
                        + var_descriptor(v) + '\n')
                    mode.record.handle_line(
                        'BatchGradientDescent._make_conjugate update '
                        + var_descriptor(u) + '\n')

            self._make_conjugate = \
                function([], updates=make_conjugate_updates,
                         mode=self.theano_function_mode,
                         name='BatchGradientDescent._make_conjugate')

            # Likewise describe the outputs of the compiled graph.
            if mode is not None and hasattr(mode, 'record'):
                for output in self._make_conjugate.maker.fgraph.outputs:
                    mode.record.handle_line(
                        'BatchGradientDescent._make_conjugate output '
                        + var_descriptor(output) + '\n')

        # Default tolerance depends on the objective's dtype.
        if tol is None:
            if objective.dtype == "float32":
                self.tol = 1e-6
            else:
                self.tol = 3e-7
        else:
            self.tol = tol

        # Created here with value 0; not used elsewhere in this method.
        self.ave_step_size = sharedX(0.)
        self.ave_grad_mult = sharedX(0.)

コード例 #48

0

ファイルを表示

ファイル: rnn.py プロジェクト: dwf/pylearn2

class Recurrent(Layer):
    """
    A recurrent neural network layer using the hyperbolic tangent
    activation function, passing on all hidden states or a selection
    of them to the next layer.

    The hidden state is initialized to zeros.

    Parameters
    ----------
    dim : int
        The number of elements in the hidden layer
    layer_name : str
        The name of the layer. All layers in an MLP must have a unique name.
    irange : float
        The input-to-hidden weight matrix is initialized with weights
        sampled uniformly from the interval (-irange, irange). The
        hidden-to-hidden matrix does not use irange: it is set to the
        orthogonal factor of a QR decomposition of a Gaussian random
        matrix (see set_input_space).
    indices : slice, list of integers or integer, optional
        If specified this layer will return only the given hidden
        states. If an integer is given, it will not return a
        SequenceSpace. Otherwise, it will return a SequenceSpace of
        fixed length. Note that a SequenceSpace of fixed length
        can be flattened by using the FlattenerLayer.
        Note: For now only [-1] is supported.
    init_bias : float, optional
        Set an initial bias to be added at each time step. Defaults to 0.
    nonlinearity : theano.function, optional
    weight_noise : bool, optional
        Additive Gaussian noise applied to parameters
    """
    def __init__(self, dim, layer_name, irange, indices=None,
                 init_bias=0., nonlinearity=tensor.tanh,
                 weight_noise=False, **kwargs):
        # The only keyword argument consumed here is 'noise_std_dev'
        # (default .075); it is removed from kwargs before the snapshot
        # below.
        self._std_dev = kwargs.pop('noise_std_dev', .075)
        self.rnn_friendly = True
        # Filled by fprop with the updates returned by scan.
        self._scan_updates = OrderedDict()
        # Copy every local (all named arguments plus the remaining
        # `kwargs` dict) onto the instance as attributes of the same name.
        self.__dict__.update(locals())
        del self.self
        # Note that the remaining kwargs are not forwarded to the parent.
        super(Recurrent, self).__init__()
        # Without weight noise the standard deviation is cleared.
        if not self.weight_noise:
            self._std_dev = None

    @wraps(Layer.set_input_space)
    def set_input_space(self, space):
        # Validates the input space, derives the output space from it and
        # from self.indices, then creates the layer's three parameters
        # (W, U, b) in that order.
        #
        # Raises ValueError if `space` is not a SequenceSpace or
        # SequenceDataSpace wrapping a VectorSpace, if more than one index
        # is requested, or if irange is None.
        is_sequence = isinstance(space, (SequenceSpace, SequenceDataSpace))
        if not is_sequence or not isinstance(space.space, VectorSpace):
            # The message is built from adjacent literals so that no
            # source indentation leaks into it, and formatted with a
            # one-element tuple so any object is rendered safely.
            raise ValueError("Recurrent layer needs a SequenceSpace("
                             "VectorSpace) or SequenceDataSpace(VectorSpace) "
                             "as input but received %s instead"
                             % (space,))

        self.input_space = space

        if self.indices is not None:
            if len(self.indices) > 1:
                raise ValueError("Only indices = [-1] is supported right now")
            assert self.indices == [-1], "Only indices = [-1] works now"
            # A single selected hidden state is a plain vector.
            self.output_space = VectorSpace(dim=self.dim)
        elif isinstance(self.input_space, SequenceSpace):
            self.output_space = SequenceSpace(VectorSpace(dim=self.dim))
        elif isinstance(self.input_space, SequenceDataSpace):
            self.output_space = \
                SequenceDataSpace(VectorSpace(dim=self.dim))

        # Initialize the parameters
        rng = self.mlp.rng
        if self.irange is None:
            raise ValueError("Recurrent layer requires an irange value in "
                             "order to initialize its weight matrices")

        input_dim = self.input_space.dim

        # W is the input-to-hidden matrix
        W = rng.uniform(-self.irange, self.irange, (input_dim, self.dim))

        # U is the hidden-to-hidden transition matrix: the orthogonal
        # factor of the QR decomposition of a Gaussian random matrix.
        U = rng.randn(self.dim, self.dim)
        U, _ = scipy.linalg.qr(U)

        # b is the bias
        b = np.zeros((self.dim,))

        self._params = [
            sharedX(W, name=(self.layer_name + '_W')),
            sharedX(U, name=(self.layer_name + '_U')),
            sharedX(b + self.init_bias,
                    name=(self.layer_name + '_b'))
        ]

    @wraps(Layer.get_layer_monitoring_channels)
    def get_layer_monitoring_channels(self, state_below=None, state=None,
                                      targets=None):
        # Returns an OrderedDict of monitoring expressions: min/mean/max
        # of the row and column norms of W and U and, when a state is
        # available (given, or computable from state_below), statistics
        # of that state. `targets` is accepted but not used.
        W, U, _ = self._params
        sq_W = tensor.sqr(W)
        sq_U = tensor.sqr(U)
        row_norms = tensor.sqrt(sq_W.sum(axis=1))
        col_norms = tensor.sqrt(sq_W.sum(axis=0))
        u_row_norms = tensor.sqrt(sq_U.sum(axis=1))
        u_col_norms = tensor.sqrt(sq_U.sum(axis=0))

        rval = OrderedDict()
        weight_norms = (('W_row', row_norms),
                        ('W_col', col_norms),
                        ('U_row', u_row_norms),
                        ('U_col', u_col_norms))
        for prefix, norms in weight_norms:
            rval[prefix + '_norms_min'] = norms.min()
            rval[prefix + '_norms_mean'] = norms.mean()
            rval[prefix + '_norms_max'] = norms.max()

        if (state is not None) or (state_below is not None):
            if state is None:
                state = self.fprop(state_below)
            if isinstance(self.input_space, SequenceSpace):
                # Drop the second element of the state pair. state_below
                # is deliberately left alone: it is not needed past this
                # point and may legitimately be None when the caller
                # supplied `state` directly.
                state, _ = state

            # Reduce over axis 0 first, then over what remains.
            mx = state.max(axis=0)
            mean = state.mean(axis=0)
            mn = state.min(axis=0)
            rg = mx - mn

            state_stats = (('range', rg),
                           ('max', mx),
                           ('mean', mean),
                           ('min', mn))
            for prefix, stat in state_stats:
                rval[prefix + '_x_max_u'] = stat.max()
                rval[prefix + '_x_mean_u'] = stat.mean()
                rval[prefix + '_x_min_u'] = stat.min()

        return rval

    @wraps(Layer._modify_updates)
    def _modify_updates(self, updates):
        # scan may return a non-empty updates dictionary (for instance
        # when random variables are used inside the step function). Those
        # updates have to be merged into `updates` before the training
        # function is compiled.
        for shared_var in self._scan_updates:
            if shared_var in updates:
                # Not expected to happen, but a clash would silently
                # overwrite one of the two updates.
                raise ValueError("A single shared variable is being updated by "
                                 "multiple scan functions")
        updates.update(self._scan_updates)

    def add_noise(self, param):
        """
        Return `param` plus zero-mean Gaussian noise.

        The noise is drawn from `self.mlp.theano_rng` with standard
        deviation `self._std_dev`, and has the shape and dtype of
        `param`.

        Parameters
        ----------
        param : sharedX
            model parameter to be regularized

        Returns
        -------
        noisy_param : symbolic expression
            `param` with the noise added. `param` itself is left
            untouched.
        """
        noise = self.mlp.theano_rng.normal(size=param.shape,
                                           avg=0.,
                                           std=self._std_dev,
                                           dtype=param.dtype)

        # Build a new expression instead of applying `+=` to the
        # argument: an augmented assignment would modify the caller's
        # object in place for any type that implements __iadd__.
        return param + noise

    @wraps(Layer.fprop)
    def fprop(self, state_below, return_all=False):
        # A tuple input carries (data, mask); anything else is treated as
        # unmasked data. `return_all` is accepted but not used.
        mask = None
        if isinstance(state_below, tuple):
            state_below, mask = state_below

        # Initial hidden state: zeros of shape
        # (state_below.shape[1], self.dim), i.e. (batch size, output dim).
        initial_state = tensor.alloc(np.cast[config.floatX](0),
                                     state_below.shape[1],
                                     self.dim)
        if self.dim == 1:
            # This should fix the bug described in Theano issue #1772
            initial_state = tensor.unbroadcast(initial_state, 1)

        W, U, b = self._params
        if self.weight_noise:
            W = self.add_noise(W)
            U = self.add_noise(U)

        # The input-to-hidden product is computed once for the whole
        # sequence, outside of scan, because that is faster.
        projected = tensor.dot(state_below, W) + b

        # Choose the step function and its sequences according to whether
        # a mask was supplied.
        if mask is None:
            step_fn = self.fprop_step
            sequences = [projected]
        else:
            step_fn = self.fprop_step_mask
            sequences = [projected, mask]

        z, updates = scan(fn=step_fn,
                          sequences=sequences,
                          outputs_info=[initial_state],
                          non_sequences=[U])

        # Remembered so that _modify_updates can merge them later.
        self._scan_updates.update(updates)

        if self.indices is None:
            return (z, mask)
        if len(self.indices) > 1:
            return [z[i] for i in self.indices]
        return z[self.indices[0]]

    def fprop_step_mask(self, state_below, mask, state_before, U):
        """
        Scan step used when a mask is supplied.

        Parameters
        ----------
        state_below : TheanoTensor
            Current element of the first scan sequence (the input already
            multiplied by W and offset by b in fprop).
        mask : TheanoTensor
            Current element of the mask sequence.
        state_before : TheanoTensor
            Hidden state produced by the previous step.
        U : TheanoTensor
            Hidden-to-hidden matrix.

        Returns
        -------
        z : TheanoTensor
            The new hidden state where the mask is 1 and the previous
            state where it is 0.
        """
        candidate = self.nonlinearity(
            state_below + tensor.dot(state_before, U)
        )

        # Only non-masked entries take the freshly computed state; the
        # others carry the previous state forward unchanged.
        keep = mask[:, None]
        return keep * candidate + (1 - keep) * state_before

    def fprop_step(self, state_below, state_before, U):
        """
        Scan step used when no mask is supplied.

        Parameters
        ----------
        state_below : TheanoTensor
            Current element of the scan sequence (the input already
            multiplied by W and offset by b in fprop).
        state_before : TheanoTensor
            Hidden state produced by the previous step.
        U : TheanoTensor
            Hidden-to-hidden matrix.

        Returns
        -------
        z : TheanoTensor
            The new hidden state.
        """
        recurrent_term = tensor.dot(state_before, U)
        return self.nonlinearity(state_below + recurrent_term)