Beispiel #1
0
def main():
    var = theano.shared(T.zeros(shape=(88, 100), dtype=theano.config.floatX).eval(), name='W')
    updates = [(var, add_uniform(input=var, noise_level=.02))]

    stats = get_stats(var)
    l1 = stats.pop('l1')
    l2 = stats.pop('l2')
    min = stats.pop('min')
    max = stats.pop('max')
    var = stats.pop('var')
    std = stats.pop('std')
    mean = stats.pop('mean')

    mean_monitor = Monitor('mean', mean, train=True, valid=True, out_service=FileService('outs/mean.txt'))
    var_monitor = Monitor('var', var, out_service=FileService('outs/var.txt'))

    w_channel = MonitorsChannel('W', monitors=mean_monitor)

    stat_channel = MonitorsChannel('stats', monitors=[var_monitor])

    monitors = [w_channel, stat_channel]

    train_collapsed_raw = collapse_channels(monitors, train=True)
    train_collapsed = OrderedDict([(item[0], item[1]) for item in train_collapsed_raw])
    train_services = OrderedDict([(item[0], item[2]) for item in train_collapsed_raw])
    valid_collapsed_raw = collapse_channels(monitors, valid=True)
    valid_collapsed = OrderedDict([(item[0], item[1]) for item in valid_collapsed_raw])
    valid_services = OrderedDict([(item[0], item[2]) for item in valid_collapsed_raw])

    log.debug('compiling...')
    f = theano.function(inputs=[], outputs=train_collapsed.values(), updates=updates)
    f2 = theano.function(inputs=[], outputs=valid_collapsed.values(), updates=updates)
    log.debug('done')

    t1=time.time()

    for epoch in range(10):
        t=time.time()
        log.debug(epoch)
        vals = f()
        m = OrderedDict(zip(train_collapsed.keys(), vals))
        for name, service in train_services.items():
            if name in m:
                service.write(m[name], TRAIN)
        log.debug('----- '+make_time_units_string(time.time()-t))

    for epoch in range(10):
        t = time.time()
        log.debug(epoch)
        vals = f2()
        m = OrderedDict(zip(valid_collapsed.keys(), vals))
        for name, service in valid_services.items():
            if name in m:
                service.write(m[name], VALID)
        log.debug('----- ' + make_time_units_string(time.time() - t))

    log.debug("TOTAL TIME "+make_time_units_string(time.time()-t1))
def main():
    var = theano.shared(T.zeros(shape=(88, 100), dtype=theano.config.floatX).eval(), name='W')
    updates = [(var, add_uniform(input=var, noise_level=.02))]

    stats = get_stats(var)
    l1 = stats.pop('l1')
    l2 = stats.pop('l2')
    min = stats.pop('min')
    max = stats.pop('max')
    var = stats.pop('var')
    std = stats.pop('std')
    mean = stats.pop('mean')

    mean_monitor = Monitor('mean', mean, train=True, valid=True, out_service=FileService('outs/mean.txt'))
    var_monitor = Monitor('var', var, out_service=FileService('outs/var.txt'))

    w_channel = MonitorsChannel('W', monitors=mean_monitor)

    stat_channel = MonitorsChannel('stats', monitors=[var_monitor])

    monitors = [w_channel, stat_channel]

    train_collapsed_raw = collapse_channels(monitors, train=True)
    train_collapsed = OrderedDict([(item[0], item[1]) for item in train_collapsed_raw])
    train_services = OrderedDict([(item[0], item[2]) for item in train_collapsed_raw])
    valid_collapsed_raw = collapse_channels(monitors, valid=True)
    valid_collapsed = OrderedDict([(item[0], item[1]) for item in valid_collapsed_raw])
    valid_services = OrderedDict([(item[0], item[2]) for item in valid_collapsed_raw])

    log.debug('compiling...')
    f = theano.function(inputs=[], outputs=list(train_collapsed.values()), updates=updates)
    f2 = theano.function(inputs=[], outputs=list(valid_collapsed.values()), updates=updates)
    log.debug('done')

    t1=time.time()

    for epoch in range(10):
        t=time.time()
        log.debug(epoch)
        vals = f()
        m = OrderedDict(zip(train_collapsed.keys(), vals))
        for name, service in train_services.items():
            if name in m:
                service.write(m[name], "train")
        log.debug('----- '+make_time_units_string(time.time()-t))

    for epoch in range(10):
        t = time.time()
        log.debug(epoch)
        vals = f2()
        m = OrderedDict(zip(valid_collapsed.keys(), vals))
        for name, service in valid_services.items():
            if name in m:
                service.write(m[name], "valid")
        log.debug('----- ' + make_time_units_string(time.time() - t))

    log.debug("TOTAL TIME "+make_time_units_string(time.time()-t1))
def main():
    w = theano.shared(T.zeros(shape=(88, 100), dtype=theano.config.floatX).eval(), name='W')
    updates = [(w, add_uniform(input=w, noise_level=.02))]

    stats = get_stats(w)
    l1 = stats.pop('l1')
    l2 = stats.pop('l2')
    min = stats.pop('min')
    max = stats.pop('max')
    var = stats.pop('var')
    std = stats.pop('std')
    mean = stats.pop('mean')

    mean_monitor = Monitor('mean', mean, train=True, valid=True)
    stat_monitor = Monitor('max', max)

    w_channel = MonitorsChannel('W', monitors=mean_monitor)

    stat_channel = MonitorsChannel('stats', monitors=[stat_monitor])

    monitors = [w_channel, stat_channel]

    train_collapsed = collapse_channels(monitors, train=True)
    train_collapsed = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
    valid_collapsed = collapse_channels(monitors, valid=True)
    valid_collapsed = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])

    plot = Plot(bokeh_doc_name='test_plots', monitor_channels=monitors, open_browser=True)

    log.debug('compiling...')
    f = theano.function(inputs=[], outputs=list(train_collapsed.values()), updates=updates)
    f2 = theano.function(inputs=[], outputs=list(valid_collapsed.values()), updates=updates)
    log.debug('done')

    t1=time.time()

    for epoch in range(100):
        t=time.time()
        log.debug(epoch)
        vals = f()
        m = OrderedDict(zip(train_collapsed.keys(), vals))
        plot.update_plots(epoch, m)
        time.sleep(0.02)
        log.debug('----- '+make_time_units_string(time.time()-t))

    for epoch in range(100):
        t = time.time()
        log.debug(epoch)
        vals = f2()
        m = OrderedDict(zip(valid_collapsed.keys(), vals))
        plot.update_plots(epoch, m)
        time.sleep(0.02)
        log.debug('----- ' + make_time_units_string(time.time() - t))

    log.debug("TOTAL TIME "+make_time_units_string(time.time()-t1))
Beispiel #4
0
    def get_updates(self, grads):
        grads = OrderedDict(grads)
        updates = OrderedDict()

        i_t = self.i + 1.
        fix1 = 1. - (1. - self.b1) ** i_t
        fix2 = 1. - (1. - self.b2) ** i_t
        lr_t = self.learning_rate * (T.sqrt(fix2) / fix1)

        for param in grads.keys():
            m = theano.shared(param.get_value() * 0.)
            self.parameters.append(m)
            v = theano.shared(param.get_value() * 0.)
            self.parameters.append(v)

            b1t = 1. - (1. - self.b1) * self.lmbda**(i_t - 1)
            m_t = b1t * grads[param] + (1. - b1t) * m
            v_t = self.b2 * T.sqr(grads[param]) + (1. - self.b2) * v
            g_t = m_t / (T.sqrt(v_t) + self.epsilon)
            p_t = param - (lr_t * g_t)

            updates[m] = m_t
            updates[v] = v_t
            updates[param] = p_t
        updates[self.i] = i_t

        return updates
Beispiel #5
0
    def get_updates(self, grads):
        grads = OrderedDict(grads)
        updates = OrderedDict()

        for param in grads.keys():
            # mean_squared_grad := E[g^2]_{t-1}
            mean_square_grad = theano.shared(theano._asarray(param.get_value() * 0., dtype=theano.config.floatX), name='mean_square_grad_' + param.name, borrow=False)
            self.parameters.append(mean_square_grad)
            # mean_square_dx := E[(\Delta x)^2]_{t-1}
            mean_square_dx = theano.shared(theano._asarray(param.get_value() * 0., dtype=theano.config.floatX), name='mean_square_dx_' + param.name, borrow=False)
            self.parameters.append(mean_square_dx)

            # Accumulate gradient
            new_mean_squared_grad = self.decay * mean_square_grad + (1 - self.decay) * T.sqr(grads[param])

            # Compute update
            rms_dx_tm1 = T.sqrt(mean_square_dx + self.epsilon)
            rms_grad_t = T.sqrt(new_mean_squared_grad + self.epsilon)
            delta_x_t = - rms_dx_tm1 / rms_grad_t * grads[param]

            # Accumulate updates
            new_mean_square_dx = self.decay * mean_square_dx + (1 - self.decay) * T.sqr(delta_x_t)

            # Apply update
            updates[mean_square_grad] = new_mean_squared_grad
            updates[mean_square_dx] = new_mean_square_dx
            updates[param] = param + delta_x_t

        return updates
Beispiel #6
0
    def __init__(self, valid=None, invalid=None, valid_equivalent=None):
        '''
        Check if variables can be expressed without using variables in invalid.

        init_valid_equivalent provides a dictionary mapping some invalid
        variables to valid ones that can be used instead.
        '''

        if valid is None:
            valid = []
        if invalid is None:
            invalid = []
        if valid_equivalent is None:
            valid_equivalent = OrderedDict()

        # Nodes that are valid to have in the graph computing outputs
        self.valid = set(valid)

        # Nodes that are NOT valid to have in the graph computing outputs
        self.invalid = set(invalid)

        # Mapping from invalid variables to equivalent valid ones.
        self.valid_equivalent = valid_equivalent.copy()
        self.valid.update(valid_equivalent.values())
        self.invalid.update(valid_equivalent.keys())
Beispiel #7
0
    def get_updates(self, grads):
        grads = OrderedDict(grads)
        updates = OrderedDict()

        for param in grads.keys():
            # sum_squared_grad := \sum g_t^2
            sum_squared_grad = theano.shared(
                theano._asarray(param.get_value() * 0.,
                                dtype=theano.config.floatX),
                name='mean_square_grad_' + param.name,
                borrow=False)
            self.parameters.append(sum_squared_grad)

            # Accumulate gradient
            new_sum_squared_grad = sum_squared_grad + T.sqr(grads[param])

            # Compute update
            root_sum_squared = T.sqrt(new_sum_squared_grad + self.epsilon)

            # Apply update
            updates[sum_squared_grad] = new_sum_squared_grad
            updates[param] = param - (self.learning_rate /
                                      root_sum_squared) * grads[param]

        return updates
Beispiel #8
0
    def __init__(self, valid=None, invalid=None, valid_equivalent=None):
        '''
        Check if variables can be expressed without using variables in invalid.

        init_valid_equivalent provides a dictionary mapping some invalid
        variables to valid ones that can be used instead.
        '''

        if valid is None:
            valid = []
        if invalid is None:
            invalid = []
        if valid_equivalent is None:
            valid_equivalent = OrderedDict()

        # Nodes that are valid to have in the graph computing outputs
        self.valid = set(valid)

        # Nodes that are NOT valid to have in the graph computing outputs
        self.invalid = set(invalid)

        # Mapping from invalid variables to equivalent valid ones.
        self.valid_equivalent = valid_equivalent.copy()
        self.valid.update(valid_equivalent.values())
        self.invalid.update(valid_equivalent.keys())
    def get_funcs(self,
                  learning_rate,
                  grads,
                  inp,
                  cost,
                  errors,
                  lr_scalers=None):
        """
        Provides the updates for learning with gradient descent + momentum.

        Parameters
        ----------
        learning_rate : float
            Learning rate coefficient.
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients.
        lr_scalers : dict
            A dictionary mapping from the model's parameters to a learning
            rate multiplier.
        """
        gshared = OrderedDict({
            p: sharedX(p.get_value() * 0., name='%s_grad' % p.name)
            for p, g in grads.iteritems()
        })

        gsup = [(gs, g) for gs, g in zip(gshared.values(), grads.values())]
        get_norms = lambda x: T.sqrt(sum(map(lambda y: (y**2).sum(), x)))
        gnorm = get_norms(grads.values())
        pnorm = get_norms(grads.keys())
        f_grad_shared = theano.function(inp, [cost, errors, gnorm, pnorm],
                                        updates=gsup)
        updates = OrderedDict()

        for param, grad in gshared.keys():
            vel = sharedX(param.get_value() * 0.)
            assert param.dtype == vel.dtype
            assert grad.dtype == param.dtype
            if param.name is not None:
                vel.name = 'vel_' + param.name

            scaled_lr = learning_rate * lr_scalers.get(param, 1.)
            updates[vel] = self.momentum * vel - scaled_lr * grad

            inc = updates[vel]
            if self.nesterov_momentum:
                inc = self.momentum * inc - scaled_lr * grad

            assert inc.dtype == vel.dtype
            updates[param] = param + inc

        f_update = theano.function([learning_rate], [],
                                   updates=updates,
                                   on_unused_input='ignore')

        return f_grad_shared, f_update
Beispiel #10
0
    def get_updates(self, grads):
        grads = OrderedDict(grads)
        updates = OrderedDict()

        for param in grads.keys():
            decreased_learning_rate = T.cast(self.learning_rate / (1 + (self.decrease_constant * self.current_iteration)), dtype=theano.config.floatX)
            updates[param] = param - decreased_learning_rate * grads[param]

        updates[self.current_iteration] = self.current_iteration + 1

        return updates
    def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
        """
        Provides the updates for learning with gradient descent + momentum.

        Parameters
        ----------
        learning_rate : float
            Learning rate coefficient.
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients.
        lr_scalers : dict
            A dictionary mapping from the model's parameters to a learning
            rate multiplier.
        """
        gshared = OrderedDict({p: sharedX(p.get_value() * 0.,
                             name='%s_grad' % p.name)
                             for p, g in grads.iteritems()})

        gsup = [(gs, g) for gs, g in zip(gshared.values(), grads.values())]
        get_norms = lambda x: T.sqrt(sum(map(lambda y: (y**2).sum(), x)))
        gnorm = get_norms(grads.values())
        pnorm = get_norms(grads.keys())
        f_grad_shared = theano.function(inp,
                                        [cost, errors, gnorm, pnorm],
                                        updates=gsup)
        updates = OrderedDict()

        for param, grad in gshared.keys():
            vel = sharedX(param.get_value() * 0.)
            assert param.dtype == vel.dtype
            assert grad.dtype == param.dtype
            if param.name is not None:
                vel.name = 'vel_' + param.name

            scaled_lr = learning_rate * lr_scalers.get(param, 1.)
            updates[vel] = self.momentum * vel - scaled_lr * grad

            inc = updates[vel]
            if self.nesterov_momentum:
                inc = self.momentum * inc - scaled_lr * grad

            assert inc.dtype == vel.dtype
            updates[param] = param + inc

        f_update = theano.function([learning_rate],
                                   [],
                                   updates=updates,
                                   on_unused_input='ignore')

        return f_grad_shared, f_update
Beispiel #12
0
    def get_updates(self, grads):
        grads = OrderedDict(grads)
        updates = OrderedDict()

        for param in grads.keys():
            decreased_learning_rate = T.cast(
                self.learning_rate /
                (1 + (self.decrease_constant * self.current_iteration)),
                dtype=theano.config.floatX)
            updates[param] = param - decreased_learning_rate * grads[param]

        updates[self.current_iteration] = self.current_iteration + 1

        return updates
Beispiel #13
0
    def get_updates(self, grads):
        grads = OrderedDict(grads)
        updates = OrderedDict()

        for param in grads.keys():
            # sum_squared_grad := \sum g_t^2
            sum_squared_grad = theano.shared(theano._asarray(param.get_value() * 0., dtype=theano.config.floatX), name='mean_square_grad_' + param.name, borrow=False)
            self.parameters.append(sum_squared_grad)

            # Accumulate gradient
            new_sum_squared_grad = sum_squared_grad + T.sqr(grads[param])

            # Compute update
            root_sum_squared = T.sqrt(new_sum_squared_grad + self.epsilon)

            # Apply update
            updates[sum_squared_grad] = new_sum_squared_grad
            updates[param] = param - (self.learning_rate / root_sum_squared) * grads[param]

        return updates
Beispiel #14
0
    def get_updates(self, grads):
        grads = OrderedDict(grads)
        updates = OrderedDict()

        for param in grads.keys():
            # mean_squared_grad := E[g^2]_{t-1}
            mean_square_grad = theano.shared(
                theano._asarray(param.get_value() * 0.,
                                dtype=theano.config.floatX),
                name='mean_square_grad_' + param.name,
                borrow=False)
            self.parameters.append(mean_square_grad)
            # mean_square_dx := E[(\Delta x)^2]_{t-1}
            mean_square_dx = theano.shared(theano._asarray(
                param.get_value() * 0., dtype=theano.config.floatX),
                                           name='mean_square_dx_' + param.name,
                                           borrow=False)
            self.parameters.append(mean_square_dx)

            # Accumulate gradient
            new_mean_squared_grad = self.decay * mean_square_grad + (
                1 - self.decay) * T.sqr(grads[param])

            # Compute update
            rms_dx_tm1 = T.sqrt(mean_square_dx + self.epsilon)
            rms_grad_t = T.sqrt(new_mean_squared_grad + self.epsilon)
            delta_x_t = -rms_dx_tm1 / rms_grad_t * grads[param]

            # Accumulate updates
            new_mean_square_dx = self.decay * mean_square_dx + (
                1 - self.decay) * T.sqr(delta_x_t)

            # Apply update
            updates[mean_square_grad] = new_mean_squared_grad
            updates[mean_square_dx] = new_mean_square_dx
            updates[param] = param + delta_x_t

        return updates
Beispiel #15
0
class Monitor(object):
    """
    A class for monitoring Models while they are being trained.

    A monitor object records the number of minibatches and number of examples
    the model has trained, as well as any number of "channels" that track
    quantities of interest (examples: the objective function, measures of
    hidden unit activity, reconstruction error, sum of squared second
    derivatives, average norm of the weight vectors,  etc.)
    """
    def __init__(self, model):
        """
        Makes a monitor for `model`. Assumes the model has not been
        trained at all yet.

        Parameters
        ----------
        model : pylearn2.models.model.Model instance
        """
        self.training_succeeded = False
        self.model = model
        self.channels = OrderedDict()
        self._num_batches_seen = 0
        self._examples_seen = 0
        self._epochs_seen = 0
        self._datasets = []
        self._iteration_mode = []
        self._batch_size = []
        self._num_batches = []
        self._dirty = True
        self._rng_seed = []
        self.names_to_del = ['theano_function_mode']
        self.t0 = time.time()
        # Determine whether the model should use topological or vector form of
        # examples. If the model acts on a space with more than the batch index
        # and channel dimension, the model has topological dimensions, so the
        # topological view of the data should be used.
        vector = model.get_input_space().make_theano_batch(
            name='monitoring_input')
        if isinstance(vector.type, theano.sparse.SparseType):
            self.topo = False
        else:
            self.topo = len(vector.type.broadcastable) > 2

        self.require_label = False
        self.theano_function_mode = None

    def set_theano_function_mode(self, mode):
        if self.theano_function_mode != mode:
            self._dirty = True
            self.theano_function_mode = mode

    def add_dataset(self,
                    dataset,
                    mode='sequential',
                    batch_size=None,
                    num_batches=None,
                    seed=None):
        """
        Determines the data used to calculate the values of each channel.

        Parameters
        ----------
        dataset : object
            A `pylearn2.datasets.Dataset` object.
        mode : str or object, optional
            Iteration mode; see the docstring of the `iterator` method
            on `pylearn2.datasets.Dataset` for details.
        batch_size : int, optional
            The size of an individual batch. Optional if `mode` is
            'sequential' and `num_batches` is specified (batch size
            will be calculated based on full dataset size).
        num_batches : int, optional
            The total number of batches. Unnecessary if `mode` is
            'sequential' and `batch_size` is specified (number of
            batches will be calculated based on full dataset size).
        """
        # The user can ommit using lists if only one dataset is set
        if not isinstance(dataset, list):
            dataset = [dataset]
        if not isinstance(mode, list):
            mode = [mode]
        if not isinstance(batch_size, list):
            batch_size = [batch_size]
        if not isinstance(num_batches, list):
            num_batches = [num_batches]
        if seed is None:
            seed = [None] * len(dataset)
        if not isinstance(seed, list):
            seed = [seed]
        if len(mode) != len(dataset):
            raise ValueError("Received " + str(len(dataset)) +
                             " dataset but " + str(len(mode)) + " modes.")
        if any([len(l) != len(dataset) for l in [batch_size, seed]]):
            raise ValueError("make sure each dataset has its iteration " + \
                        "batch size and number of batches.")
        for (d, m, b, n, sd) in safe_izip(dataset, mode, batch_size,
                                          num_batches, seed):
            try:
                it = d.iterator(mode=m,
                                batch_size=b,
                                num_batches=n,
                                topo=self.topo,
                                targets=self.require_label,
                                rng=sd)
            except ValueError as exc:
                raise ValueError("invalid iteration parameters in "
                                 "Monitor.add_dataset: " + str(exc))
            if it.stochastic:
                # must be a seed, not a random number generator
                # if it were a random number generator, different iterators using
                # it would update its state, so we would not get the same iterator
                # each time
                # Also, must not be None, because this makes the iterator pick
                # a seed based on the clock
                if sd is None:
                    raise TypeError(
                        "Monitor requires a seed when using stochastic iteration modes."
                    )
                if not isinstance(sd, (list, tuple, int)):
                    raise TypeError(
                        "Monitor requires a seed (not a random number generator) when using stochastic iteration modes."
                    )
            else:
                assert sd is None  # the iterator should catch this, but let's double-check

            if not d in self._datasets:
                self._datasets.append(d)
                self._iteration_mode.append(m)
                self._batch_size.append(b)
                self._num_batches.append(n)
                self._rng_seed.append(sd)

    def __call__(self):
        """
        Runs the model on the monitoring dataset in order to add one
        data point to each of the channels.
        """

        # If the channels have changed at all, we need to recompile the theano
        # functions used to compute them
        if self._dirty:
            self.redo_theano()

        model = self.model
        datasets = self._datasets

        # Set all channels' val_shared to 0
        self.begin_record_entry()

        for d, i, b, n, a, sd, ne in safe_izip(datasets, self._iteration_mode,
                                               self._batch_size,
                                               self._num_batches, self.accum,
                                               self._rng_seed,
                                               self.num_examples):
            if isinstance(d, basestring):
                d = yaml_parse.load(d)
                raise NotImplementedError()
                # need to put d back into self._datasets
            myiterator = d.iterator(mode=i,
                                    batch_size=b,
                                    num_batches=n,
                                    topo=self.topo,
                                    targets=self.require_label,
                                    rng=sd)

            actual_ne = 0
            for X in myiterator:
                if self.require_label:
                    X, y = X
                    self.run_prereqs(X, y, d)
                    a(X, y)
                else:
                    self.run_prereqs(X, None, d)
                    a(X)
                if X.ndim == 2:
                    actual_batch_size = X.shape[0]
                else:
                    actual_batch_size = X.shape[d.get_topo_batch_axis()]
                actual_ne += actual_batch_size
            # end for X
            if actual_ne != ne:
                raise RuntimeError(
                    "At compile time, your iterator said it had " + str(ne) +
                    " examples total, but at runtime it gave us " +
                    str(actual_ne) + ".")
        # end for d

        log.info("Monitoring step:")
        log.info("\tEpochs seen: %d" % self._epochs_seen)
        log.info("\tBatches seen: %d" % self._num_batches_seen)
        log.info("\tExamples seen: %d" % self._examples_seen)
        t = time.time() - self.t0
        for channel_name in sorted(self.channels.keys(),
                                   key=number_aware_alphabetical_key):
            channel = self.channels[channel_name]
            channel.time_record.append(t)
            channel.batch_record.append(self._num_batches_seen)
            channel.example_record.append(self._examples_seen)
            channel.epoch_record.append(self._epochs_seen)
            val = channel.val_shared.get_value()
            channel.val_record.append(val)
            # TODO: use logging infrastructure so that user can configure
            # formatting
            if abs(val) < 1e4:
                val_str = str(val)
            else:
                val_str = '%.3e' % val

            log.info("\t%s: %s" % (channel_name, val_str))

    def run_prereqs(self, X, y, dataset):
        if dataset not in self.prereqs:
            return
        for prereq in self.prereqs[dataset]:
            prereq(X, y)

    def get_batches_seen(self):
        """ Returns the number of batches the model has learned on (assuming
        that the learning code has been calling Monitor.report_batch correctly)
        """
        return self._num_batches_seen

    def get_epochs_seen(self):
        return self._epochs_seen

    def get_examples_seen(self):
        """ Returns the number of examples the model has learned on (assuming
        that the learning code has been calling Monitor.report_batch correctly)
        """
        return self._examples_seen

    def report_batch(self, num_examples):
        """ Call this whenever the model has learned on another batch of examples.
        Report how many examples were learned on. """
        self._examples_seen += num_examples
        self._num_batches_seen += 1

    def report_epoch(self):
        self._epochs_seen += 1

    def redo_theano(self):
        """
        Recompiles Theano functions used by this monitor.

        This is needed so that if new channels are added, Theano's
        optimizations make sure (to the extent that they can) that the new
        channels and old channels don't have any redundant calculations.

        It is also needed to regenerate Theano functions after pickling and
        unpickling, since Theano functions should not be pickled.
        """
        self._dirty = False

        init_names = dir(self)
        self.prereqs = OrderedDict()
        for channel in self.channels.values():
            if channel.prereqs is not None:
                dataset = channel.dataset
                if dataset not in self.prereqs:
                    self.prereqs[dataset] = []
                prereqs = self.prereqs[dataset]
                for prereq in channel.prereqs:
                    if prereq not in prereqs:
                        prereqs.append(prereq)

        updates = OrderedDict()
        for channel in self.channels.values():
            updates[channel.val_shared] = np.cast[config.floatX](0.0)
        with log_timing(log, "compiling begin_record_entry"):
            self.begin_record_entry = function(
                inputs=[],
                updates=updates,
                mode=self.theano_function_mode,
                name='Monitor.begin_record_entry')
        updates = OrderedDict()
        givens = OrderedDict()
        # Get the appropriate kind of theano variable to represent the data the model
        # acts on
        X = self.model.get_input_space().make_theano_batch(name="monitoring_X")
        if config.compute_test_value != 'off':
            m = self.model.get_test_batch_size()
            test_value = self.model.get_input_space().get_origin_batch(m)
            X.tag.test_value = np.cast[X.type.dtype](test_value)
        if self.require_label:
            Y = self.model.get_output_space().make_theano_batch(
                name="monitoring_Y")

        log.info('Monitored channels: ')
        for key in sorted(self.channels.keys()):
            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                mode.record.handle_line(
                    'compiling monitor including channel ' + key + '\n')
            log.info('\t%s' % key)
        it = [d.iterator(mode=i, num_batches=n, batch_size=b, topo=self.topo) \
              for d, i, n, b in safe_izip(self._datasets, self._iteration_mode,
                                    self._num_batches, self._batch_size)]
        self.num_examples = [
            np.cast[config.floatX](float(i.num_examples)) for i in it
        ]
        givens = [OrderedDict() for d in self._datasets]
        updates = [OrderedDict() for d in self._datasets]
        for channel in self.channels.values():
            index = self._datasets.index(channel.dataset)
            d = self._datasets[index]
            g = givens[index]
            cur_num_examples = self.num_examples[index]
            u = updates[index]
            if isinstance(channel.graph_input, (list, tuple)):
                channel_X, channel_Y = channel.graph_input
                assert channel_X not in g or g[channel_X] is X
                assert channel_Y not in g or g[channel_Y] is Y
                g[channel_X] = X
                g[channel_Y] = Y
            else:
                channel_X = channel.graph_input
                assert channel_X not in g or g[channel_X] is X
                g[channel_X] = X
            if n == 0:
                raise ValueError(
                    "Iterating over 0 examples results in divide by 0")
            if self.topo:
                batch_index = d.get_topo_batch_axis()
            else:
                batch_index = 0
            val = channel.val * T.cast(X.shape[batch_index],
                                       config.floatX) / cur_num_examples
            u[channel.val_shared] = channel.val_shared + val

        with log_timing(log, "Compiling accum"):
            # Check type of update expressions
            for up in updates:
                for key in up:
                    if key.dtype != up[key].dtype:
                        raise TypeError('Monitoring channel shared variable ' \
                                + key.name + ' has dtype ' + key.dtype + \
                                ' but is driven by an expression with type ' + \
                                up[key].dtype)

            self.accum = []
            for idx, packed in enumerate(safe_izip(givens, updates)):
                g, u = packed
                mode = self.theano_function_mode
                if mode is not None and hasattr(mode, 'record'):
                    for elem in g:
                        mode.record.handle_line('g key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('g val ' +
                                                var_descriptor(g[elem]) + '\n')
                    for elem in u:
                        mode.record.handle_line('u key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('u val ' +
                                                var_descriptor(u[elem]) + '\n')
                function_name = 'Monitor.accum[%d]' % idx
                if self.require_label:
                    if mode is not None and hasattr(mode, 'record'):
                        mode.record.handle_line('compiling supervised accum\n')
                    # Some channels may not depend on the data, ie, they might just monitor the model
                    # parameters, or some shared variable updated by the training algorithm, so we
                    # need to ignore the unused input error
                    self.accum.append(
                        function([X, Y],
                                 givens=g,
                                 updates=u,
                                 mode=self.theano_function_mode,
                                 name=function_name))
                else:
                    if mode is not None and hasattr(mode, 'record'):
                        mode.record.handle_line(
                            'compiling unsupervised accum\n')
                    self.accum.append(
                        function([X],
                                 givens=g,
                                 updates=u,
                                 mode=self.theano_function_mode,
                                 name=function_name))
            for a in self.accum:
                if mode is not None and hasattr(mode, 'record'):
                    for elem in a.maker.fgraph.outputs:
                        mode.record.handle_line('accum output ' +
                                                var_descriptor(elem) + '\n')
                log.info("graph size: %d" % len(a.maker.fgraph.toposort()))
        final_names = dir(self)
        self.register_names_to_del(
            [name for name in final_names if name not in init_names])

    def register_names_to_del(self, names):
        """
        Register names of fields that should be deleted before pickling.

        Parameters
        ----------
        names : list
            A list of attribute names as strings.
        """
        for name in names:
            if name not in self.names_to_del:
                self.names_to_del.append(name)

    def __getstate__(self):
        """
        In order to avoid pickling a copy of the dataset whenever a monitor
        is saved, the __getstate__ method replaces the dataset field with the
        dataset's yaml source. This is not a perfect solution because it won't
        work with job resuming, which would require saving the state of the
        dataset's random number generator.

        Like in the Model class, we also need to avoid saving any Theano
        functions, so we delete everything that can be regenerated with
        `redo_theano` by deleting the fields in `self.names_to_del`
        """

        # Patch old pickled monitors
        if not hasattr(self, '_datasets'):
            self._datasets = [self._dataset]
            del self._dataset

        temp = self._datasets

        if self._datasets:
            self._datasets = []
            for dataset in temp:
                if isinstance(dataset, basestring):
                    self._datasets.append(dataset)
                else:
                    try:
                        self._datasets.append(dataset.yaml_src)
                    except AttributeError:
                        warnings.warn(
                            'Trained model saved without indicating yaml_src')
        d = copy.copy(self.__dict__)
        self._datasets = temp
        for name in self.names_to_del:
            if name in d:
                del d[name]

        return d

    def __setstate__(self, d):

        # patch old pkl files
        if '_dataset' in d:
            d['_datasets'] = [d['_dataset']]
            del d['_dataset']

        self.__dict__.update(d)

    def add_channel(self, name, ipt, val, dataset=None, prereqs=None):
        """
        Asks the monitor to start tracking a new value.  Can be called even
        after the monitor is already in use.

        Parameters
        ----------
        name: str
            The display name in the monitor.
        ipt: tensor_like
            The symbolic tensor which should be clamped to the data.
            (or a (features,targets) list/tuple containing two symbolic tensors)
        val: tensor_like
            The value (function of `ipt`) to be tracked.
        dataset: A Dataset instance specifying which dataset to compute
            this channel on.
        prereqs: list of callables that take two numpy tensors
            (X and y, where y will be None if no labels are used)
            each prereq must be called exactly once per each new
            batch of data drawn *from dataset* before the channel
            value is computed
            if two channels provide a prereq with exactly the same
            id, that prereq will only be called once
        """

        if isinstance(val, (float, int, long)):
            val = np.cast[theano.config.floatX](val)

        val = T.as_tensor_variable(val)

        if not isinstance(ipt, (list, tuple)):
            tmp = [ipt]
        else:
            tmp = ipt
        inputs = theano.gof.graph.inputs([val])
        for elem in inputs:
            if not hasattr(elem, 'get_value') and not isinstance(
                    elem, theano.gof.graph.Constant):
                if elem not in tmp:
                    raise ValueError("Unspecified input: " + str(elem))

        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            mode.record.handle_line('Adding monitor channel ' + name + '\n')
            if isinstance(ipt, (list, tuple)):
                for elem in ipt:
                    mode.record.handle_line('Includes input var ' +
                                            var_descriptor(elem) + '\n')
            else:
                mode.record.handle_line(name + ' input var is ' +
                                        var_descriptor(ipt) + '\n')
            mode.record.handle_line('channel ' + name + ' is ' +
                                    var_descriptor(val) + '\n')

        if dataset is None:
            if len(self._datasets) == 1:
                dataset = self._datasets[0]
            elif len(self._datasets) == 0:
                raise ValueError(_err_no_data)
            else:
                raise ValueError(_err_ambig_data)

        try:
            self._datasets.index(dataset)
        except ValueError:
            raise ValueError("The dataset specified is not " + \
                "one of the monitor's datasets")

        if name in self.channels:
            raise ValueError("Tried to create the same channel twice (%s)" %
                             name)
        if isinstance(ipt, (list, tuple)):
            if dataset is not None:
                if not dataset.has_targets():
                    raise ValueError("Tried to create a channel ("+name \
                            +") that uses targets, but monitoring dataset has no targets")
            self.require_label = True
            assert len(ipt) == 2
        self.channels[name] = MonitorChannel(ipt, val, name, dataset, prereqs)
        self._dirty = True

    def _sanity_check(self):
        """
            Sometimes we serialize models and then load them somewhere else
            but still try to use their Monitor, and the Monitor is in a mangled
            state. I've added some calls to _sanity_check to try to catch when
            that happens. Not sure what to do for a long term fix. I think it
            requires making theano graphs serializable first.
        """
        for name in self.channels:
            channel = self.channels[name]
            assert hasattr(channel, 'prereqs')

    @classmethod
    def get_monitor(cls, model):
        """
        Returns a model's monitor. If the model doesn't have a monitor yet,
        installs one and returns that.

        Parameters
        ----------
        model : object
            An object that implements the `Model` interface specified in
            `pylearn2.models`.
        """

        if hasattr(model, 'monitor'):
            rval = model.monitor
            rval._sanity_check()
        else:
            rval = Monitor(model)
            model.monitor = rval

        return rval

    # TODO: find out if monitor.foo below are used anywhere, remove if not.
    @property
    def batch_size(self):
        return self._batch_size

    @property
    def num_batches(self):
        return self._num_batches

    def setup(self,
              dataset,
              cost,
              batch_size,
              num_batches=None,
              extra_costs=None,
              mode='sequential'):
        """
        Sets up the monitor for a cost minimization problem.
        Adds channels defined by both the model and the cost for
        the specified dataset(s), as well as a channel called 'objective'
        defined by the costs' __call__ method.

        dataset: a Dataset or dictionary mapping string names to Datasets
                    If string names are used, then for every dataset,
                    each channel defined by the model or cost will be
                    replicated with that dataset's name followed by an
                    underscore as the prefix.
                    For example, if your cost defines a channel called
                    'misclass', and datasets is {'train' : train_dataset,
                    'valid' : valid_dataset} you will get channels called
                    'train_misclass' and 'valid_misclass'.

        cost: a Cost

        """
        if dataset is None:
            return
        if isinstance(dataset, Dataset):
            dataset = {'': dataset}
        else:
            assert isinstance(dataset, dict)
            assert all(isinstance(key, str) for key in dataset)
            assert all(isinstance(dataset[key], Dataset) for key in dataset)

        if extra_costs is None:
            costs = {}
        else:
            costs = extra_costs
        assert '' not in costs
        costs[''] = cost

        supervised = any(cost.supervised for cost in costs.values())
        model = self.model

        X_space = model.get_input_space()
        X = X_space.make_theano_batch(name='monitor_X')

        if config.compute_test_value != 'off':
            X.tag.test_value = X_space.get_origin_batch(batch_size).astype(
                X.dtype)

        if supervised:
            Y_space = model.get_output_space()
            Y = Y_space.make_theano_batch(name='monitor_Y')

            if config.compute_test_value != 'off':
                Y.tag.test_value = Y_space.get_origin_batch(batch_size).astype(
                    Y.dtype)

            ipt = (X, Y)
        else:
            Y = None
            ipt = X
        custom_channels = {}
        for cost_name in costs:
            if cost_name == '':
                prefix = ''
            else:
                prefix = cost_name + '_'
            cost = costs[cost_name]
            raw_channels = cost.get_monitoring_channels(model, X, Y)
            channels = {}
            for name in raw_channels:
                channels[prefix + name] = raw_channels[name]
            custom_channels.update(channels)
        model_channels = model.get_monitoring_channels(X, Y)
        custom_channels.update(model_channels)

        if is_stochastic(mode):
            seed = [[2013, 02, 22]]
        else:
            seed = None

        for dataset_name in dataset:
            cur_dataset = dataset[dataset_name]
            self.add_dataset(dataset=cur_dataset,
                             mode=mode,
                             batch_size=batch_size,
                             num_batches=num_batches,
                             seed=seed)
            if dataset_name == '':
                dprefix = ''
            else:
                dprefix = dataset_name + '_'
            # These channel name 'objective' must not vary, since callbacks that respond to the
            # values in the monitor use the name to find it.
            for cost_name in costs:
                cost = costs[cost_name]
                cost_value = cost(model, X, Y)
                if cost_value is not None:
                    if cost_name == '':
                        name = dprefix + 'objective'
                    else:
                        name = dprefix + cost_name
                    self.add_channel(name=name,
                                     ipt=ipt,
                                     val=cost_value,
                                     dataset=cur_dataset)
            for key in custom_channels:
                self.add_channel(name=dprefix + key,
                                 ipt=ipt,
                                 val=custom_channels[key],
                                 dataset=cur_dataset)
Beispiel #16
0
    def __init__(self, objective, params, inputs = None,
            param_constrainers = None, max_iter = -1,
            lr_scalers = None, verbose = 0, tol = None,
            init_alpha = None, min_init_alpha = 1e-3,
            reset_alpha = True, conjugate = False,
            reset_conjugate = True, gradients = None,
            gradient_updates = None, line_search_mode = None,
            accumulate = False, theano_function_mode=None):

        self.__dict__.update(locals())
        del self.self

        if line_search_mode is None:
            if init_alpha is None:
                init_alpha  = (.001, .005, .01, .05, .1)
        else:
            assert line_search_mode == 'exhaustive'
            if init_alpha is None:
                init_alpha = (.5, 1.)

        self.init_alpha = tuple([float(elem) for elem in init_alpha])

        if inputs is None:
            inputs = []

        if param_constrainers is None:
            param_constrainers = []

        obj = objective

        self.verbose = verbose

        param_to_grad_sym = OrderedDict()
        param_to_grad_shared = OrderedDict()
        updates = OrderedDict()
        if self.gradient_updates is not None:
            updates.update(self.gradient_updates)

        self.params = [ param for param in params ]

        for param in params:
            if self.gradients is not None and param in self.gradients:
                g = self.gradients[param]
            else:
                g = grad(objective, param)
            param_to_grad_sym[param] = g
            if param.name is not None:
                param_name = param.name
            else:
                param_name = 'anon_param'
            grad_name = 'BatchGradientDescent.grad_' + param_name
            grad_shared = sharedX( param.get_value() * 0., name=grad_name)
            param_to_grad_shared[param] = grad_shared
            updates[grad_shared] = g

        self.param_to_grad_shared = param_to_grad_shared

        if self.verbose:
            logger.info('batch gradient class compiling gradient function')
        t1 = time.time()
        if self.accumulate:
            self._compute_grad = Accumulator(inputs, updates = updates)
        else:
            self._compute_grad = function(inputs, updates = updates,
                    mode=self.theano_function_mode,
                    name='BatchGradientDescent._compute_grad')
        if self.verbose:
            t2 = time.time()
            logger.info('done. Took {0}'.format(t2-t1))

        if self.verbose:
            logger.info('batch gradient class compiling objective function')
        if self.accumulate:
            self.obj = Accumulator(inputs, obj)
        else:
            self.obj = function(inputs, obj, mode=self.theano_function_mode,
                    name='BatchGradientDescent.obj')

        if self.verbose:
            logger.info('done')

        self.param_to_cache = OrderedDict()
        alpha = T.scalar(name = 'alpha')
        alpha.tag.test_value = np.cast[alpha.dtype](.01)
        cache_updates = OrderedDict()
        goto_updates = OrderedDict()
        for param in params:
            if param.name is None:
                param_name = 'anon_param'
            else:
                param_name = param.name
            cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name
            self.param_to_cache[param] = sharedX(param.get_value(borrow=False), name=cache_name)
            cache_updates[self.param_to_cache[param]] = param
            cached = self.param_to_cache[param]
            g = self.param_to_grad_shared[param]
            if lr_scalers is not None and param in lr_scalers:
                scaled_alpha = alpha * lr_scalers[param]
            else:
                scaled_alpha = alpha
            mul = scaled_alpha * g
            diff = cached - mul
            goto_updates[param] = diff
        self._cache_values = function([], updates = cache_updates, mode=self.theano_function_mode, name='BatchGradientDescent._cache_values')
        assert isinstance(param_constrainers, (list, tuple))
        for param_constrainer in param_constrainers:
            param_constrainer(goto_updates)
        self._goto_alpha = function([alpha], updates=goto_updates,
                mode=self.theano_function_mode, name='BatchGradientDescent._goto_alpha')

        norm = T.sqrt(sum([T.sqr(elem).sum() for elem in self.param_to_grad_shared.values()]))
        norm.name = 'BatchGradientDescent.norm'
        normalize_grad_updates = OrderedDict()
        for grad_shared in self.param_to_grad_shared.values():
            normalize_grad_updates[grad_shared] = grad_shared / norm

        # useful for monitoring
        self.ave_grad_size = sharedX(0.)
        self.new_weight = sharedX(1.)
        normalize_grad_updates[self.ave_grad_size] = self.new_weight * norm + (1.-self.new_weight) * self.ave_grad_size

        self._normalize_grad = function([], norm, updates=normalize_grad_updates, mode=self.theano_function_mode,
                name='BatchGradientDescent._normalize_grad')

        if self.conjugate:
            grad_shared = self.param_to_grad_shared.values()

            grad_to_old_grad = OrderedDict()
            for elem in grad_shared:
                grad_to_old_grad[elem] = sharedX(elem.get_value(), 'old_'+elem.name)

            self._store_old_grad = function([norm], updates = OrderedDict([(grad_to_old_grad[g_], g_ * norm)
                for g_ in grad_to_old_grad]), mode=self.theano_function_mode,
                name='BatchGradientDescent._store_old_grad')

            grad_ordered = list(grad_to_old_grad.keys())
            old_grad_ordered = [grad_to_old_grad[g_] for g_ in grad_ordered]

            def dot_product(x, y):
                return sum([ (x_elem * y_elem).sum() for x_elem, y_elem in safe_zip(x, y) ])

            beta_pr = (dot_product(grad_ordered, grad_ordered) - dot_product(grad_ordered, old_grad_ordered)) / \
                    (1e-7+dot_product(old_grad_ordered, old_grad_ordered))
            assert beta_pr.ndim == 0

            beta = T.maximum(beta_pr, 0.)

            #beta_pr is the Polak-Ribiere formula for beta.
            #According to wikipedia, the beta to use for NCG is "a matter of heuristics or taste"
            #but max(0, beta_pr) is "a popular choice... which provides direction reset automatically."
            #(ie, it is meant to revert to steepest descent when you have traveled far enough that
            #the objective function is behaving non-quadratically enough that the conjugate gradient
            #formulas aren't working anymore)

            #http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method

            assert grad not in grad_to_old_grad

            make_conjugate_updates = [(g_, g_ + beta * grad_to_old_grad[g_]) for g_ in grad_ordered]

            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                for v, u in make_conjugate_updates:
                    mode.record.handle_line('BatchGradientDescent._make_conjugate var ' \
                            + var_descriptor(v) + '\n')
                    mode.record.handle_line('BatchGradientDescent._make_conjugate update ' \
                            + var_descriptor(u) + '\n')

            self._make_conjugate = function([], updates=make_conjugate_updates,
                    mode=self.theano_function_mode, name='BatchGradientDescent._make_conjugate')

            if mode is not None and hasattr(mode, 'record'):
                for output in self._make_conjugate.maker.fgraph.outputs:
                    mode.record.handle_line('BatchGradientDescent._make_conjugate output ' \
                            + var_descriptor(output) + '\n')


        if tol is None:
            if objective.dtype == "float32":
                self.tol = 1e-6
            else:
                self.tol = 3e-7
        else:
            self.tol = tol

        self.ave_step_size = sharedX(0.)
        self.ave_grad_mult = sharedX(0.)
    def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
        """
        Compute the AdaDelta updates

        Parameters
        ----------
        learning_rate : float
            Learning rate coefficient.
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients.
        lr_scalers : dict
            A dictionary mapping from the model's parameters to a learning
            rate multiplier.
        """
        updates = OrderedDict()

        tot_norm_up = 0

        gshared = OrderedDict({p: sharedX(p.get_value() * 0.,
                             name='%s_grad' % p.name)
                             for p, g in grads.iteritems()})

        gsup = [(gshared[p], g) for p, g in grads.iteritems()]
        get_norms = lambda x: T.sqrt(sum(map(lambda y: (y**2).sum(), x)))
        gnorm = get_norms(grads.values())
        pnorm = get_norms(grads.keys())
        f_grad_shared = theano.function(inp,
                                        [cost, errors, gnorm, pnorm],
                                        updates=gsup)

        for param in gshared.keys():
            # mean_squared_grad := E[g^2]_{t-1}
            mean_square_grad = sharedX(param.get_value() * 0.)
            # mean_square_dx := E[(\Delta x)^2]_{t-1}
            mean_square_dx = sharedX(param.get_value() * 0.)

            if param.name is not None:
                mean_square_grad.name = 'mean_square_grad_' + param.name
                mean_square_dx.name = 'mean_square_dx_' + param.name

            # Accumulate gradient
            new_mean_squared_grad = (
                self.decay * mean_square_grad +
                (1 - self.decay) * T.sqr(gshared[param])
            )

            # Compute update
            epsilon = learning_rate
            rms_dx_tm1 = T.sqrt(mean_square_dx + epsilon)
            rms_grad_t = T.sqrt(new_mean_squared_grad + epsilon)
            delta_x_t = - rms_dx_tm1 / rms_grad_t * gshared[param]

            # Accumulate updates
            new_mean_square_dx = (
                self.decay * mean_square_dx +
                (1 - self.decay) * T.sqr(delta_x_t)
            )

            # Apply update
            updates[mean_square_grad] = new_mean_squared_grad
            updates[mean_square_dx] = new_mean_square_dx
            updates[param] = param + delta_x_t

            tot_norm_up += delta_x_t.norm(2)

        f_update = theano.function([learning_rate], [tot_norm_up],
                                   updates=updates,
                                   on_unused_input='ignore')

        return f_grad_shared, f_update
    def __init__(self, objective, params, inputs = None,
            param_constrainers = None, max_iter = -1,
            lr_scalers = None, verbose = 0, tol = None,
            init_alpha = None, min_init_alpha = 1e-3,
            reset_alpha = True, conjugate = False,
            reset_conjugate = True, gradients = None,
            gradient_updates = None, line_search_mode = None,
            accumulate = False, theano_function_mode=None):
        """
        Parameters
        ----------
        objective : tensor_like
            A theano expression to be minimized should be a function of \
            params and, if provided, inputs
        params : list
            A list of theano shared variables. These are the optimization \
            variables
        inputs : list, optional
            A list of theano variables to serve as inputs to the graph.
        param_constrainers : list
            A list of callables to be called on all updates dictionaries to \
            be applied to params. This is how you implement constrained \
            optimization.
        reset_alpha : bool
            If True, reverts to using init_alpha after each call. If False, \
            the final set of alphas is used at the start of the next call to \
            minimize.
        conjugate : bool
            If True, tries to pick conjugate gradient directions. For the \
            directions to be truly conjugate, you must use line_search_mode = \
            'exhaustive' and the objective function must be quadratic. \
            Using line_search_mode = 'exhaustive' on a non-quadratic \
            objective function implements nonlinear conjugate gradient descent.
        reset_conjugate : bool
            Has no effect unless conjugate == True. If reset_conjugate == \
            True, reverts to direction of steepest descent for the first \
            step in each call to minimize. Otherwise, tries to make the new \
            search direction conjugate to the last one (even though the \
            objective function might be totally different on each call to \
            minimize)
        gradients : WRITEME
            If None, compute the gradients of obj using T.grad otherwise, a \
            dictionary mapping from params to expressions for their gradients \
            (this allows you to use approximate gradients computed with \
            something other than T.grad)
        gradient_updates : dict
            A dictionary of shared variable updates to run each time the \
            gradient is computed

        Notes
        -----
        Calling the ``minimize'' method with values for for ``inputs'' will
        update ``params'' to minimize ``objective''.
        """

        self.__dict__.update(locals())
        del self.self

        if line_search_mode is None:
            if init_alpha is None:
                init_alpha  = (.001, .005, .01, .05, .1)
        else:
            assert line_search_mode == 'exhaustive'
            if init_alpha is None:
                init_alpha = (.5, 1.)

        self.init_alpha = tuple([float(elem) for elem in init_alpha])

        if inputs is None:
            inputs = []

        if param_constrainers is None:
            param_constrainers = []

        obj = objective

        self.verbose = verbose

        param_to_grad_sym = OrderedDict()
        param_to_grad_shared = OrderedDict()
        updates = OrderedDict()
        if self.gradient_updates is not None:
            updates.update(self.gradient_updates)

        self.params = [ param for param in params ]

        for param in params:
            if self.gradients is not None and param in self.gradients:
                g = self.gradients[param]
            else:
                g = grad(objective, param)
            param_to_grad_sym[param] = g
            if param.name is not None:
                param_name = param.name
            else:
                param_name = 'anon_param'
            grad_name = 'BatchGradientDescent.grad_' + param_name
            grad_shared = sharedX( param.get_value() * 0., name=grad_name)
            param_to_grad_shared[param] = grad_shared
            updates[grad_shared] = g

        self.param_to_grad_shared = param_to_grad_shared

        if self.verbose:
            print 'batch gradient class compiling gradient function'
        t1 = time.time()
        if self.accumulate:
            self._compute_grad = Accumulator(inputs, updates = updates)
        else:
            self._compute_grad = function(inputs, updates = updates,
                    mode=self.theano_function_mode,
                    name='BatchGradientDescent._compute_grad')
        if self.verbose:
            t2 = time.time()
            print 'done. Took ',t2-t1

        if self.verbose:
            print 'batch gradient class compiling objective function'
        if self.accumulate:
            self.obj = Accumulator(inputs, obj)
        else:
            self.obj = function(inputs, obj, mode=self.theano_function_mode,
                    name='BatchGradientDescent.obj')

        if self.verbose:
            print 'done'

        self.param_to_cache = OrderedDict()
        alpha = T.scalar(name = 'alpha')
        alpha.tag.test_value = np.cast[alpha.dtype](.01)
        cache_updates = OrderedDict()
        goto_updates = OrderedDict()
        for param in params:
            if param.name is None:
                param_name = 'anon_param'
            else:
                param_name = param.name
            cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name
            self.param_to_cache[param] = sharedX(param.get_value(borrow=False), name=cache_name)
            cache_updates[self.param_to_cache[param]] = param
            cached = self.param_to_cache[param]
            g = self.param_to_grad_shared[param]
            if lr_scalers is not None and param in lr_scalers:
                scaled_alpha = alpha * lr_scalers[param]
            else:
                scaled_alpha = alpha
            mul = scaled_alpha * g
            diff = cached - mul
            goto_updates[param] = diff
        self._cache_values = function([], updates = cache_updates, mode=self.theano_function_mode, name='BatchGradientDescent._cache_values')
        assert isinstance(param_constrainers, (list, tuple))
        for param_constrainer in param_constrainers:
            param_constrainer(goto_updates)
        self._goto_alpha = function([alpha], updates=goto_updates,
                mode=self.theano_function_mode, name='BatchGradientDescent._goto_alpha')

        norm = T.sqrt(sum([T.sqr(elem).sum() for elem in self.param_to_grad_shared.values()]))
        norm.name = 'BatchGradientDescent.norm'
        normalize_grad_updates = OrderedDict()
        for grad_shared in self.param_to_grad_shared.values():
            normalize_grad_updates[grad_shared] = grad_shared / norm

        # useful for monitoring
        self.ave_grad_size = sharedX(0.)
        self.new_weight = sharedX(1.)
        normalize_grad_updates[self.ave_grad_size] = self.new_weight * norm + (1.-self.new_weight) * self.ave_grad_size

        self._normalize_grad = function([], norm, updates=normalize_grad_updates, mode=self.theano_function_mode,
                name='BatchGradientDescent._normalize_grad')

        if self.conjugate:
            grad_shared = self.param_to_grad_shared.values()

            grad_to_old_grad = OrderedDict()
            for elem in grad_shared:
                grad_to_old_grad[elem] = sharedX(elem.get_value(), 'old_'+elem.name)

            self._store_old_grad = function([norm], updates = OrderedDict([(grad_to_old_grad[g], g * norm)
                for g in grad_to_old_grad]), mode=self.theano_function_mode,
                name='BatchGradientDescent._store_old_grad')

            grad_ordered = list(grad_to_old_grad.keys())
            old_grad_ordered = [ grad_to_old_grad[g] for g in grad_ordered]

            def dot_product(x, y):
                return sum([ (x_elem * y_elem).sum() for x_elem, y_elem in safe_zip(x, y) ])

            beta_pr = (dot_product(grad_ordered, grad_ordered) - dot_product(grad_ordered, old_grad_ordered)) / \
                    (1e-7+dot_product(old_grad_ordered, old_grad_ordered))
            assert beta_pr.ndim == 0

            beta = T.maximum(beta_pr, 0.)

            """

            beta_pr is the Polak-Ribiere formula for beta.
            According to wikipedia, the beta to use for NCG is "a matter of heuristics or taste"
            but max(0, beta_pr) is "a popular choice... which provides direction reset automatically."
            (ie, it is meant to revert to steepest descent when you have traveled far enough that
            the objective function is behaving non-quadratically enough that the conjugate gradient
            formulas aren't working anymore)

            http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method

            """

            assert grad not in grad_to_old_grad

            make_conjugate_updates = [(g, g + beta * grad_to_old_grad[g]) for g in grad_ordered]

            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                for v, u in make_conjugate_updates:
                    mode.record.handle_line('BatchGradientDescent._make_conjugate var ' \
                            + var_descriptor(v) + '\n')
                    mode.record.handle_line('BatchGradientDescent._make_conjugate update ' \
                            + var_descriptor(u) + '\n')

            self._make_conjugate = function([], updates=make_conjugate_updates,
                    mode=self.theano_function_mode, name='BatchGradientDescent._make_conjugate')

            if mode is not None and hasattr(mode, 'record'):
                for output in self._make_conjugate.maker.fgraph.outputs:
                    mode.record.handle_line('BatchGradientDescent._make_conjugate output ' \
                            + var_descriptor(output) + '\n')


        if tol is None:
            if objective.dtype == "float32":
                self.tol = 1e-6
            else:
                self.tol = 3e-7
        else:
            self.tol = tol

        self.ave_step_size = sharedX(0.)
        self.ave_grad_mult = sharedX(0.)
Beispiel #19
0
class Optimizer(object):
    '''
    Default interface for an optimizer implementation - this provides the necessary parameter updates when
    training a model on a dataset using an online stochastic process.
    '''
    def __init__(self, model, dataset,
                 config=None, defaults=None,
                 n_epoch=None, batch_size=None, minimum_batch_size=None,
                 save_frequency=None, early_stop_threshold=None, early_stop_length=None,
                 learning_rate=None, lr_decay=None, lr_factor=None,
                 **kwargs):
        # Default values to use for some training parameters
        _defaults = {"n_epoch": 1000,
                     "batch_size": 100,
                     "minimum_batch_size": 1,
                     "save_frequency": 10,
                     "early_stop_threshold": .9995,
                     "early_stop_length": 30,
                     "learning_rate": 0.001,
                     "lr_decay": "exponential",
                     "lr_factor": 1,  # no learning rate decay by default
                     }

        log.debug("Initializing optimizer %s", str(type(self)))

        assert isinstance(model, Model), "Optimizer input model needs to be an opendeep Model class!"
        self.model = model
        self.dataset = dataset
        assert isinstance(dataset, Dataset), "Optimizer input dataset needs to be an opendeep Dataset class!"

        # set self.args to be the combination of the defaults and the config dictionaries from the subclass
        in_args = combine_config_and_defaults(config, defaults)
        self.args = combine_config_and_defaults(in_args, _defaults)

        # if the args are none, make it a blank dictionary
        if self.args is None:
            self.args = {}

        # now that our required variables are out of the way, do the same thing for everything else passed via kwargs
        for arg, val in kwargs.items():
            if (val is not None or str(arg) not in self.args) and str(arg) != 'kwargs':
                self.args[str(arg)] = val
            # flatten kwargs if it was passed as a variable
            elif str(arg) == 'kwargs':
                inner_kwargs = kwargs['kwargs']
                for key, item in inner_kwargs.items():
                    if item is not None or str(key) not in self.args:
                        self.args[str(key)] = item

        # now take care of overriding explicits passed in
        if n_epoch is not None:
            self.args['n_epoch'] = n_epoch
        if batch_size is not None:
            self.args['batch_size'] = batch_size
        if minimum_batch_size is not None:
            self.args['minimum_batch_size'] = minimum_batch_size
        if save_frequency is not None:
            self.args['save_frequency'] = save_frequency
        if early_stop_threshold is not None:
            self.args['early_stop_threshold'] = early_stop_threshold
        if early_stop_length is not None:
            self.args['early_stop_length'] = early_stop_length
        if learning_rate is not None:
            self.args['learning_rate'] = learning_rate
        if lr_decay is not None:
            self.args['lr_decay'] = lr_decay
        if lr_factor is not None:
            self.args['lr_factor'] = lr_factor

        # Magic! Now self.args contains the combination of all the initialization variables, overridden like so:
        # _defaults < defaults < config < kwargs (explicits passed to model's __init__)

        # log the arguments
        log.debug("optimizer config args: %s", str(self.args))

        # Finally, to make things really easy, update the class 'self' with everything in self.args to make
        # all the parameters accessible via self.<param>
        self.__dict__.update(self.args)

        # Learning rate - how drastic of a step do the parameters change
        self.learning_rate = sharedX(self.learning_rate, 'learning_rate')
        self.lr_scalers = self.model.get_lr_scalers()
        if self.lr_decay:
            self.learning_rate_decay = get_decay_function(self.lr_decay,
                                                          self.learning_rate,
                                                          self.learning_rate.get_value(),
                                                          self.lr_factor)
        else:
            self.learning_rate_decay = False

    def get_batch_indices(self, data_lengths):
        batch_indices = []
        start_idx = 0
        for len in raise_to_list(data_lengths):
            # integer division to determine number of whole batches for this length
            n_batches = len / int(self.batch_size)
            # add the (start_idx, end_idx) tuple to the list
            for i in range(n_batches):
                end_idx = start_idx + self.batch_size
                batch_indices.append((start_idx, end_idx))
                start_idx = end_idx
            # remainder to find number of leftover examples
            remainder = numpy.remainder(len, self.batch_size)
            end_idx = start_idx + remainder
            # check if it is bigger than the minimum allowed size
            if remainder >= self.minimum_batch_size:
                batch_indices.append((start_idx, end_idx))
            start_idx = end_idx
        return batch_indices

    def get_updates(self, gradients):
        """
        This returns the parameter updates to use during training. It defaults to only using (annealed) learning rate.
        :param gradients: (parameter, gradient) tuples representing the parameters to update and their gradients
        :type gradients: list(tuple)
        :return: the updates
        :rtype: updates
        """
        log.debug('Setting up Stochastic Gradient Descent for optimizer...')
        updates = OrderedDict()
        for (param, gradient) in six.iteritems(gradients):
            scaled_lr = self.learning_rate * self.lr_scalers.get(param, 1.)
            updates[param] = param - scaled_lr * gradient
        return updates

    def train(self, continue_training=False):
        """
        This method performs the training!!!
        :param continue_training:
        :type continue_training:
        :return:
        :rtype:
        """
        # grab the model parameters to use during training
        self.params = self.model.get_params()
        log.info("%s params: %s", str(type(self.model)), str(self.params))

        ###############################################
        # theano index variable to use on the dataset #
        ###############################################
        # index to a [mini]batch - both start and end
        data_idx = T.iscalar('data_index')
        data_end_idx = T.iscalar('data_end_index')
        batch_slice = slice(data_idx, data_end_idx)

        # compute number of minibatches for training, validation and testing
        # shapes is list of list - input list of datasets to optimizer (for multiple inputs), and each dataset
        # could be a list of shared variables (like multiple sequences from files)
        train_data_shapes = raise_to_list(self.dataset.getDataShape(TRAIN))
        valid_data_shapes = raise_to_list(self.dataset.getDataShape(VALID))
        test_data_shapes  = raise_to_list(self.dataset.getDataShape(TEST))

        # train_batches is going to be lists of tuples that contain the start and end indices for train data
        train_data_lens = [shape[0] for shape in train_data_shapes]
        self.train_batches = self.get_batch_indices(train_data_lens)

        if valid_data_shapes is not None:
            valid_data_lens = [shape[0] for shape in valid_data_shapes]
            self.valid_batches = self.get_batch_indices(valid_data_lens)
        else:
            self.valid_batches = None
        if test_data_shapes is not None:
            test_data_lens = [shape[0] for shape in test_data_shapes]
            self.test_batches = self.get_batch_indices(test_data_lens)
        else:
            self.test_batches = None

        # translate the data_idx into the givens for the model
        model_inputs = raise_to_list(self.model.get_inputs())
        model_targets = raise_to_list(self.model.get_targets())

        train_data, train_labels = self.dataset.getSubset(TRAIN)
        train_givens = OrderedDict(zip(model_inputs, [train_data[batch_slice]]))
        if model_targets is not None and len(model_targets) > 0:
            train_givens.update(OrderedDict(zip(model_targets, [train_labels[batch_slice]])))

        valid_data, valid_labels = self.dataset.getSubset(VALID)
        valid_givens = OrderedDict(zip(model_inputs, [valid_data[batch_slice]]))
        if model_targets is not None and len(model_targets) > 0:
            valid_givens.update(OrderedDict(zip(model_targets, [valid_labels[batch_slice]])))

        test_data, test_labels = self.dataset.getSubset(TEST)
        test_givens = OrderedDict(zip(model_inputs, [test_data[batch_slice]]))
        if model_targets is not None and len(model_targets) > 0:
            test_givens.update(OrderedDict(zip(model_targets, [test_labels[batch_slice]])))

        # Now time to create the training cost functions for the model - make sure to handle the possible
        # list of costs used for pretraining of certain parts of the model.
        train_costs = raise_to_list(self.model.get_train_cost())
        self.train_functions = []
        for i, train_cost in enumerate(train_costs):
            # Now create the training cost function for the model to use while training - update parameters
            # gradient!
            gradients, _ = self.model.get_gradient(cost=train_cost)

            # Calculate the optimizer updates each run
            # This is where the magic happens for a lot of sub-implementations of SGD, including AdaDelta!
            # It tells how to update the params each training epoch
            gradient_updates = self.get_updates(gradients)

            # Combine the updates from the model also if applicable
            train_updates = self.model.get_updates()
            if train_updates:
                train_updates.update(gradient_updates)
            else:
                train_updates = gradient_updates

            # Compile the training function!
            log.info('Compiling f_learn %d/%d function for model %s...', i + 1, len(train_costs),
                     str(type(self.model)))
            t = time.time()

            f_learn = function(inputs=[data_idx, data_end_idx],
                               updates=train_updates,
                               outputs=train_cost,
                               givens=train_givens,
                               name='f_learn_%d' % i)

            log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))
            self.train_functions.append(f_learn)

        # grab the expression(s) to use to monitor different model values during training
        log.debug("Compiling monitor functions...")
        monitor_t = time.time()
        self.monitors = OrderedDict(self.model.get_monitors())
        self.monitor_names = self.monitors.keys()
        if len(self.monitors.keys()) > 0:
            self.train_monitor_function = function(
                inputs=[data_idx, data_end_idx],
                updates=self.model.get_updates(),
                outputs=self.monitors.values(),
                givens=train_givens,
                name="train_monitor_function"
            )
        if len(self.monitors.keys()) > 0:
            self.valid_monitor_function = function(
                inputs=[data_idx, data_end_idx],
                updates=self.model.get_updates(),
                outputs=self.monitors.values(),
                givens=valid_givens,
                name="valid_monitor_function"
            )
        if len(self.monitors.keys()) > 0:
            self.test_monitor_function = function(
                inputs=[data_idx, data_end_idx],
                updates=self.model.get_updates(),
                outputs=self.monitors.values(),
                givens=test_givens,
                name="test_monitor_function"
            )
        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

        self.noise_switches = raise_to_list(self.model.get_noise_switch())

        ##################
        # start training #
        ##################
        # make sure to deal with a list of train_cost functions - for layer-wise pretraining!
        # this list of training functions was created during __init__()
        start_time = time.time()
        for func_i, train_function in enumerate(self.train_functions):
            log.info("-----------TRAINING %s function %d/%d FOR %d EPOCHS (continue_training=%s)-----------",
                     str(type(self.model)), func_i + 1, len(self.train_functions), self.n_epoch, str(continue_training))

            log.debug("Train dataset size is: %s", self.dataset.getDataShape(TRAIN))
            if self.dataset.hasSubset(VALID):
                log.debug("Valid dataset size is: %s", self.dataset.getDataShape(VALID))
            if self.dataset.hasSubset(TEST):
                log.debug("Test dataset size is: %s", self.dataset.getDataShape(TEST))

            self.STOP = False
            self.epoch_counter = 0
            if not continue_training:
                # reset the learning rate
                if hasattr(self, 'learning_rate_decay') and self.learning_rate_decay:
                    self.learning_rate_decay.reset()
                # reset the other model decaying functions
                for decay_param in self.model.get_decay_params():
                    decay_param.reset()

            self.times = []
            self.best_cost = numpy.inf
            self.best_params = None
            self.patience = 0

            t = time.time()

            while not self.STOP:
                try:
                    self.STOP = self._perform_one_epoch(train_function)
                except KeyboardInterrupt:
                    log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                    self.STOP = True

            # save params
            if self.best_params is not None:
                log.debug("Restoring best model parameters...")
                set_shared_values(self.params, self.best_params)
            log.debug("Saving model parameters...")
            self.model.save_params('trained_epoch_' + str(self.epoch_counter) + '.pkl')

            log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))

        log.info("------------TOTAL %s TRAIN TIME TOOK %s---------",
                 str(type(self.model)), make_time_units_string(time.time() - start_time))


    def _perform_one_epoch(self, f_learn):
        self.epoch_counter += 1
        t = time.time()
        log.info('EPOCH %s', str(self.epoch_counter))

        # set the noise switches on for training function! (this is where things like dropout happen)
        if len(self.noise_switches) > 0:
            log.debug("Turning on %s noise switches", str(len(self.noise_switches)))
            switch_vals = [switch.get_value() for switch in self.noise_switches]
            [switch.set_value(0.) for switch in self.noise_switches]

        # train
        train_costs = []
        train_monitors = {key: [] for key in self.monitors.keys()}
        for batch_start, batch_end in self.train_batches:
            train_costs.append(f_learn(batch_start, batch_end))
            self.call_monitors(monitor_function=self.train_monitor_function,
                               monitors_dict=train_monitors,
                               inputs=[batch_start, batch_end])
        log.info('Train cost: %s', trunc(numpy.mean(train_costs, 0)))
        if len(self.monitors.keys()) > 0:
            log.info('Train monitors: %s',
                     str({key: numpy.mean(value, 0) for key, value in train_monitors.items()}))


        # set the noise switches off for valid and test sets! we assume unseen data is noisy anyway :)
        if len(self.noise_switches) > 0:
            log.debug("Turning off %s noise switches", str(len(self.noise_switches)))
            [switch.set_value(0.) for switch in self.noise_switches]
        # valid
        if self.dataset.hasSubset(VALID) and len(self.monitors.keys()) > 0:
            valid_monitors = {key: [] for key in self.monitors.keys()}
            for batch_start, batch_end in self.valid_batches:
                self.call_monitors(monitor_function=self.valid_monitor_function,
                                   monitors_dict=valid_monitors,
                                   inputs=[batch_start, batch_end])
            log.info('Valid monitors: %s',
                     str({key: numpy.mean(value, 0) for key, value in valid_monitors.items()}))

        #test
        if self.dataset.hasSubset(TEST) and len(self.monitors.keys()) > 0:
            test_monitors = {key: [] for key in self.monitors.keys()}
            for batch_start, batch_end in self.test_batches:
                self.call_monitors(monitor_function=self.test_monitor_function,
                                   monitors_dict=test_monitors,
                                   inputs=[batch_start, batch_end])
            log.info('Test monitors: %s', str({key: numpy.mean(value, 0) for key, value in test_monitors.items()}))

        # check for early stopping on train costs
        cost = numpy.sum(train_costs)
        if cost < self.best_cost * self.early_stop_threshold:
            self.patience = 0
            self.best_cost = cost
            # save the parameters that made it the best
            self.best_params = get_shared_values(self.params)
        else:
            self.patience += 1

        # check for stopping either from n_epochs or from threshold/patience
        stop = False
        if self.epoch_counter >= self.n_epoch:
            log.info("Stopping (reached max number of epochs)...")
            stop = True
        if self.patience >= self.early_stop_length:
            log.info("Stopping early (reached stop threshold)...")
            stop = True

        timing = time.time() - t
        self.times.append(timing)

        log.info('time: ' + make_time_units_string(timing))

        log.info('remaining time: ' +
                 make_time_units_string((self.n_epoch - self.epoch_counter) * numpy.mean(self.times)))

        if (self.epoch_counter % self.save_frequency) == 0:
            #save params
            self.model.save_params('trained_epoch_' + str(self.epoch_counter) + '.pkl')

        # ANNEAL!
        if not stop:
            if hasattr(self, 'learning_rate_decay') and self.learning_rate_decay:
                self.learning_rate_decay.decay()
            if hasattr(self, 'momentum_decay') and self.momentum_decay:
                self.momentum_decay.decay()
            for decay_param in self.model.get_decay_params():
                decay_param.decay()

        # reset the switches
        if len(self.noise_switches) > 0:
            [switch.set_value(val) for switch, val in zip(self.noise_switches, switch_vals)]

        # return whether or not to stop this epoch
        return stop

    def call_monitors(self, monitors_dict, monitor_function, inputs):
        outs = monitor_function(*inputs)
        for i, out in enumerate(outs):
            monitors_dict[self.monitor_names[i]].append(out)
Beispiel #20
0
class Optimizer(object):
    """
    Default interface for an optimizer implementation - this provides the necessary parameter updates when
    training a model on a dataset using an online stochastic process. The base framework for performing
    stochastic gradient descent.
    """
    def __init__(self, dataset, loss=None, model=None,
                 epochs=1000, batch_size=100, min_batch_size=1,
                 save_freq=10, stop_threshold=None, stop_patience=50,
                 learning_rate=1e-3, lr_decay=None, lr_decay_factor=None,
                 grad_clip=None, hard_clip=False,
                 **kwargs):
        """
        Initialize the Optimizer.

        Parameters
        ----------
        dataset : Dataset
            The :class:`opendeep.data.Dataset` to use when training the Model.
        loss : Loss
            The :class:`opendeep.optimization.loss.Loss` function to compare the model to a 'target' result.
        model : Model
            The :class:`opendeep.models.Model` to train. Needed if the Optimizer isn't being passed to a
            Model's .train() method.
        epochs : int
            How many training iterations over the dataset to go.
        batch_size : int
            How many examples from the training dataset to use in parallel.
        min_batch_size : int
            The minimum number of examples required at a time (for things like time series, this would be > 1).
        save_freq : int, optional
            How many epochs to train between each new save of the Model's parameters.
        stop_threshold : float, optional
            The factor by how much the best validation training score needs to improve to determine early stopping.
        stop_patience : int, optional
            The patience or number of epochs to wait after the stop_threshold has been reached before stopping.
        learning_rate : float
            The multiplicative amount to adjust parameters based on their gradient values.
        lr_decay : str
            The decay function to use for changing the learning rate over epochs. See
            `opendeep.utils.decay` for classes of decay and documentation.
        lr_decay_factor : float
            The amount of decay to use for the ``lr_decay`` type of decay.
        grad_clip : float, optional
            Whether to clip gradients. This will clip the norm of the gradients either with a hard cutoff or rescaling.
        hard_clip : bool
            Whether to use a hard cutoff or rescaling for clipping gradients.
        """
        log.info("Initializing optimizer %s", str(self.__class__.__name__))

        # Deal with early stopping None initializations (no early stopping).
        if not stop_threshold:
            stop_threshold = numpy.inf
        if not save_freq:
            save_freq = 1000000
        if not stop_patience:
            stop_patience = 1

        # Put all init parameters in self.args so we can log the initial configuration.
        self.args = locals().copy()
        self.args.pop('self')
        kwargs = self.args.pop('kwargs')
        self.args = add_kwargs_to_dict(kwargs, self.args)
        # log the arguments
        log.info("Optimizer config args: %s", str(self.args))
        # if the optimizer wasn't initialized with a Model (train() being called from the model class itself),
        # just return. (This seems kinda hacky but hey, people wanted .train() to happen from Model and there
        # wasn't really a better way unless the epoch looping logic was in that method for Model. That wasn't
        # the best option because other methods besides stochastic ones can exist for optimizers in the future.
        # TODO: fix this up - feels like a hack just to make model.train() work...
        if not model:
            return
        # Otherwise, things are proceeding as normal. Carry on...

        assert isinstance(model, Model), "Optimizer input model needs to be a Model class! " \
                                         "Found %s" % str(model.__class__.__name__)
        assert isinstance(dataset, Dataset), "Optimizer input dataset needs to be a Dataset class! " \
                                             "Found %s" % str(dataset.__class__.__name__)
        # deal with loss expression/targets
        if loss is not None:
            assert isinstance(loss, Loss), "Optimizer input loss needs to be a Loss class! " \
                                           "Found %s" % str(loss.__class__.__name__)
        if isinstance(loss, Loss):
            self.loss_targets = loss.get_targets()
            self.loss_expression = loss.get_loss()
        else:
            assert model.get_loss() is not None, "No Loss specified, and the model does not have one implemented."
            if isinstance(model.get_loss(), tuple):
                self.loss_targets = raise_to_list(model.get_loss()[0])
                self.loss_expression = model.get_loss()[1]
            else:
                self.loss_targets = None
                self.loss_expression = model.get_loss()

        model_inputs = raise_to_list(model.get_inputs())
        n_model_inputs = len(model_inputs)

        model_targets = self.loss_targets or []
        for input in model_inputs:
            if input in model_targets:
                model_targets.remove(input)

        n_model_targets = len(model_targets)
        self.unsupervised = (n_model_targets is 0)
        # make sure the number of inputs/targets matches up with the dataset properties
        # train
        assert n_model_inputs == len(raise_to_list(dataset.train_inputs)), \
            "Dataset has %d train inputs, while model expects %d" % \
            (len(raise_to_list(dataset.train_inputs)), n_model_inputs)
        if not self.unsupervised:
            assert n_model_targets == len(raise_to_list(dataset.train_targets) or []), \
                "Dataset has %d train targets, while model expects %d" % \
                (len(raise_to_list(dataset.train_targets) or []), n_model_targets)
        # valid
        if dataset.valid_inputs is not None:
            assert n_model_inputs == len(raise_to_list(dataset.valid_inputs)), \
                "Dataset has %d valid inputs, while model expects %d" % \
                (len(raise_to_list(dataset.valid_inputs)), n_model_inputs)
            if not self.unsupervised:
                assert n_model_targets == len(raise_to_list(dataset.valid_targets) or []), \
                    "Dataset has %d valid targets, while model expects %d" % \
                    (len(raise_to_list(dataset.valid_targets) or []), n_model_targets)
        # test
        if dataset.test_inputs is not None:
            assert n_model_inputs == len(raise_to_list(dataset.test_inputs)), \
                "Dataset has %d test inputs, while model expects %d" % \
                (len(raise_to_list(dataset.test_inputs)), n_model_inputs)
            if not self.unsupervised:
                assert n_model_targets == len(raise_to_list(dataset.test_targets) or []), \
                    "Dataset has %d test targets, while model expects %d" % \
                    (len(raise_to_list(dataset.test_targets) or []), n_model_targets)

        # now we are happy, we can add them to `self`
        self.model = model
        self.dataset = dataset
        self.loss = loss

        # Learning rate - how drastic of a step do the parameters change
        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        # whether to scale individual model parameters' learning rates.
        self.lr_scalers = self.model.get_lr_scalers()
        # whether to decay
        if lr_decay:
            self.learning_rate_decay = get_decay_function(lr_decay,
                                                          self.learning_rate,
                                                          learning_rate,
                                                          lr_decay_factor)
        else:
            self.learning_rate_decay = False

        # rest of initial parameters needed for training.
        self.batch_size = batch_size
        self.min_batch_size = min_batch_size
        self.n_epoch = epochs
        self.save_frequency = save_freq
        self.early_stop_threshold = stop_threshold
        self.early_stop_length = stop_patience
        self.grad_clip = grad_clip
        self.hard_clip = hard_clip

    def get_updates(self, gradients):
        """
        This returns the parameter updates to use during training. It defaults to only using (annealed) learning rate.

        Parameters
        ----------
        gradients : dict
            A dictionary mapping from the model's parameters to their gradients.

        Returns
        -------
        updates : OrderdDict
            A dictionary mapping from the old model parameters, to their new
            values after a single iteration of the learning rule.
        """
        log.debug('Setting up Stochastic Gradient Descent for optimizer...')
        updates = OrderedDict()
        for (param, gradient) in iteritems(gradients):
            scaled_lr = self.learning_rate * self.lr_scalers.get(param, 1.)
            updates[param] = param - scaled_lr * gradient
        return updates

    def train(self, monitor_channels=None, plot=None):
        """
        This method performs the training!!!
        It is an online training method that goes over minibatches from the dataset for a number of epochs,
        updating parameters after each minibatch.

        You can disrupt training with a KeyBoardInterrupt and it should exit/save parameters gracefully.

        Parameters
        ----------
        monitor_channels : list(MonitorsChannel or Monitor), optional
            The list of channels or monitors containing monitor expressions/variables to compile and evaluate
            on the data.
        plot : Plot, optional
            The Plot object to use if we want to graph the outputs (uses bokeh server).
        """
        if not self.model:
            log.error("No self.model for the Optimizer!")
            raise AssertionError("Needs to be initialized with a Model! (Or something went wrong if train() "
                                 "was called from the Model. Try initializing the Optimizer with the model param "
                                 "and calling optimizer.train().")

        #########################
        # gradients and updates #
        #########################
        # grab the model parameters to use during training
        self.params = self.model.get_params()
        # Now create the training cost function for the model to use while training - update parameters
        # gradient!
        # First find the basic variables that will be updated
        params = set()
        for param in self.params.values():
            params.update(base_variables(param))
        params = list(params)
        gradients = grad(cost=self.loss_expression, wrt=params)
        # now create the dictionary mapping the parameter with its gradient
        gradients = OrderedDict(
            [(param, g) for param, g in zip(params, gradients)]
        )
        # clip gradients if we want.
        gradients = clip_gradients(gradients, self.grad_clip, self.hard_clip)

        # Calculate the optimizer updates each run
        # This is where the magic happens for a lot of sub-implementations of SGD!
        # It tells how to update the params each training epoch
        gradient_updates = self.get_updates(gradients)

        # Combine the updates from the model also if applicable
        updates = self.model.get_updates()
        if updates:
            updates.update(gradient_updates)
        else:
            updates = gradient_updates

        log.info("%s params: %s", self.model._classname, str(list(self.params.keys())))

        ############
        # monitors #
        ############
        # deal with the monitor channels if they were given (or take them from the plot)
        if monitor_channels is None and plot is not None and len(plot.channels) > 0:
            monitor_channels = plot.channels
        self.train_monitors_dict = {}
        self.valid_monitors_dict = {}
        self.test_monitors_dict = {}
        self.train_monitors_outservice_dict = {}
        self.valid_monitors_outservice_dict = {}
        self.test_monitors_outservice_dict = {}
        if monitor_channels:
            # collapse the appropriate monitors into their (name, expression, out_service) tuples
            train_collapsed = collapse_channels(monitor_channels, train=True)
            valid_collapsed = collapse_channels(monitor_channels, valid=True)
            test_collapsed  = collapse_channels(monitor_channels, test=True)
            # get name: expression dictionary
            self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
            self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
            self.test_monitors_dict  = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
            # get name: outservice dictionary
            self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
            self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
            self.test_monitors_outservice_dict  = OrderedDict([(name, out) for name, _, out in test_collapsed])

        #######################################
        # compile train and monitor functions #
        #######################################
        function_input = raise_to_list(self.model.get_inputs())
        if self.loss_targets is not None:
            function_input += self.loss_targets
        # Compile the training function!
        log.info('Compiling f_learn function for model %s...', self.model._classname)
        t = time.time()

        f_learn = function(inputs=function_input,
                           updates=updates,
                           outputs=[self.loss_expression] + list(self.train_monitors_dict.values()),
                           name='f_learn')

        log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))

        # figure out if we want valid and test (monitors)
        self.valid_flag = (self.dataset.valid_inputs is not None) and (len(self.valid_monitors_dict) > 0)
        self.test_flag = (self.dataset.test_inputs is not None) and (len(self.test_monitors_dict) > 0)
        # Now compile the monitor functions!
        log.debug("Compiling monitor functions...")
        monitor_t = time.time()
        # valid monitors
        if self.valid_flag:
            self.valid_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.valid_monitors_dict.values()),
                name='valid_monitor_function'
            )
        else:
            self.valid_monitor_function = None

        # test monitors
        if self.test_flag:
            self.test_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.test_monitors_dict.values()),
                name='test_monitor_function'
            )
        else:
            self.test_monitor_function = None

        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

        ##################
        # start training #
        ##################
        log.info("-----------TRAINING %s FOR %d EPOCHS-----------",
                 self.model._classname, self.n_epoch)

        self.STOP = False
        self.epoch_counter = 0
        # reset any decay params
        for decay_param in self.get_decay_params():
            decay_param.reset()

        self.times = []
        self.best_cost = numpy.inf
        self.best_params = None
        self.patience = 0

        t = time.time()

        while not self.STOP:
            try:
                self.STOP = self._perform_one_epoch(f_learn, plot)
            except KeyboardInterrupt:
                log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                self.STOP = True

        # save params
        if self.best_params is not None:
            log.debug("Restoring best model parameters...")
            self.model.set_param_values(self.best_params, borrow=False)
        log.debug("Saving model parameters...")
        self.model.save_params('trained_epoch_' + str(self.epoch_counter))

        log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))

    def _perform_one_epoch(self, f_learn, plot=None):
        """
        Performs a single training iteration with the given learn function.
        """
        self.epoch_counter += 1
        t = time.time()
        log.info('EPOCH %s', str(self.epoch_counter))

        # set the noise switches on for training function! (this is where things like dropout happen)
        if not self.model.switches_on:
            self.model.turn_on_switches()

        #########
        # train #
        #########
        train_costs = []
        train_monitors = {key: [] for key in self.train_monitors_dict.keys()}
        train_data = [
            minibatch(input_data, self.batch_size, self.min_batch_size)
            for input_data in raise_to_list(self.dataset.train_inputs)
            ]
        if self.dataset.train_targets is not None and not self.unsupervised:
            train_data += [
                minibatch(target, self.batch_size, self.min_batch_size)
                for target in raise_to_list(self.dataset.train_targets)
                ]

        for batch in min_normalized_izip(*train_data):
            _outs = raise_to_list(f_learn(*batch))
            train_costs.append(_outs[0])
            # handle any user defined monitors (if different from the train cost)
            if len(train_monitors) > 0:
                current_monitors = zip(self.train_monitors_dict.keys(), _outs[1:])
                for name, val in current_monitors:
                    val = numpy.asarray(val)
                    train_monitors[name].append(val)

        # get the mean values for the batches
        mean_train = numpy.mean(train_costs, 0)
        current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in train_monitors.items()}
        # log the mean values!
        log.info('Train cost: %s', trunc(mean_train))
        if len(current_mean_monitors) > 0:
            log.info('Train monitors: %s', str(current_mean_monitors))
        # send the values to their outservices
        for name, service in self.train_monitors_outservice_dict.items():
            if name in current_mean_monitors and service:
                service.write(current_mean_monitors[name], "train")
        # if there is a plot, also send them over!
        if plot:
            plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)

        # set the noise switches off for valid and test sets! we assume unseen data is noisy anyway :)
        if self.model.switches_on:
            self.model.turn_off_switches()

        #########
        # valid #
        #########
        self._compute_over_subset("valid", self.dataset.valid_inputs, self.dataset.valid_targets,
                                  self.valid_monitors_dict, self.valid_monitor_function,
                                  self.valid_monitors_outservice_dict, plot)

        ########
        # test #
        ########
        self._compute_over_subset("test", self.dataset.test_inputs, self.dataset.test_targets,
                                  self.test_monitors_dict, self.test_monitor_function,
                                  self.test_monitors_outservice_dict, plot)

        ###########
        # cleanup #
        ###########
        # check for early stopping on train costs
        cost = numpy.sum(train_costs)
        # if the cost improved, reset the patience and record the best cost.
        if cost < self.best_cost * self.early_stop_threshold:
            self.patience = 0
            self.best_cost = cost
            # save the parameters that made it the best
            self.best_params = self.model.get_param_values(borrow=False)
        elif not numpy.isnan(cost):
            self.patience += 1

        # check for stopping either from n_epochs or from threshold/patience
        stop = False
        if self.epoch_counter >= self.n_epoch:
            log.info("Stopping (reached max number of epochs)...")
            stop = True
        if self.patience >= self.early_stop_length:
            log.info("Stopping early (reached stop threshold)...")
            stop = True

        timing = time.time() - t
        self.times.append(timing)

        log.info('time: ' + make_time_units_string(timing))

        log.debug('remaining time: ' +
                 make_time_units_string((self.n_epoch - self.epoch_counter) * numpy.mean(self.times)))

        if (self.epoch_counter % self.save_frequency) == 0:
            #save params
            self.model.save_params('trained_epoch_' + str(self.epoch_counter))

        # ANNEAL!
        if not stop:
            # perform the appropriate decay on the decay functions/parameters for this optimizer and model
            for decay_param in self.get_decay_params():
                decay_param.decay()

        # return whether or not to stop this epoch
        return stop

    def _compute_over_subset(self, subset, inputs, targets,
                             monitors_dict, monitor_function, monitors_outservice_dict,
                             plot):
        inputs = raise_to_list(inputs)
        targets = raise_to_list(targets)
        if inputs is not None and len(monitors_dict) > 0:
            monitors = {key: [] for key in monitors_dict.keys()}
            data = [minibatch(input, self.batch_size, self.min_batch_size) for input in inputs]
            if targets is not None and not self.unsupervised:
                data += [minibatch(target, self.batch_size, self.min_batch_size) for target in targets]

            for batch in min_normalized_izip(*data):
                _outs = raise_to_list(monitor_function(*batch))
                current_monitors = zip(monitors_dict.keys(), _outs)
                for name, val in current_monitors:
                    val = numpy.asarray(val)
                    monitors[name].append(val)

            # get the mean values for the batches
            current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in monitors.items()}
            # log the mean values!
            log.info('%s monitors: %s', subset, str(current_mean_monitors))
            # send the values to their outservices
            for name, service in monitors_outservice_dict.items():
                if name in current_mean_monitors and service:
                    service.write(current_mean_monitors[name], subset)
            # if there is a plot, also send them over!
            if plot:
                plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)

    def get_decay_params(self):
        """
        Returns a list of all the Decay objects to decay during training.

        Returns
        -------
        list
            List of Decay objects to use after each training epoch - in this case the
            learning rate decay.
        """
        decay_params = self.model.get_decay_params()
        if hasattr(self, 'learning_rate_decay') and self.learning_rate_decay:
            decay_params.append(self.learning_rate_decay)
        return decay_params
Beispiel #21
0
def main():
    var = theano.shared(T.zeros(shape=(88, 100),
                                dtype=theano.config.floatX).eval(),
                        name='W')
    updates = [(var, add_uniform(input=var, noise_level=.02))]

    stats = get_stats(var)
    l1 = stats.pop('l1')
    l2 = stats.pop('l2')
    min = stats.pop('min')
    max = stats.pop('max')
    var = stats.pop('var')
    std = stats.pop('std')
    mean = stats.pop('mean')

    mean_monitor = Monitor('mean', mean, train=True, valid=True)
    var_monitor = Monitor('var', var)

    w_channel = MonitorsChannel('W', monitors=mean_monitor)

    stat_channel = MonitorsChannel('stats', monitors=[var_monitor])

    monitors = [w_channel, stat_channel]

    train_collapsed = collapse_channels(monitors, train=True)
    train_collapsed = OrderedDict([(name, expression)
                                   for name, expression, _ in train_collapsed])
    valid_collapsed = collapse_channels(monitors, valid=True)
    valid_collapsed = OrderedDict([(name, expression)
                                   for name, expression, _ in valid_collapsed])

    plot = Plot(bokeh_doc_name='test_plots',
                monitor_channels=monitors,
                open_browser=True)

    log.debug('compiling...')
    f = theano.function(inputs=[],
                        outputs=list(train_collapsed.values()),
                        updates=updates)
    f2 = theano.function(inputs=[],
                         outputs=list(valid_collapsed.values()),
                         updates=updates)
    log.debug('done')

    t1 = time.time()

    for epoch in range(100):
        t = time.time()
        log.debug(epoch)
        vals = f()
        m = OrderedDict(zip(train_collapsed.keys(), vals))
        plot.update_plots(epoch, m)
        log.debug('----- ' + make_time_units_string(time.time() - t))

    for epoch in range(100):
        t = time.time()
        log.debug(epoch)
        vals = f2()
        m = OrderedDict(zip(valid_collapsed.keys(), vals))
        plot.update_plots(epoch, m)
        log.debug('----- ' + make_time_units_string(time.time() - t))

    log.debug("TOTAL TIME " + make_time_units_string(time.time() - t1))
Beispiel #22
0
class Optimizer(object):
    """
    Default interface for an optimizer implementation - this provides the necessary parameter updates when
    training a model on a dataset using an online stochastic process. The base framework for performing
    stochastic gradient descent.
    """
    def __init__(self, dataset, loss=None, model=None,
                 epochs=1000, batch_size=100, min_batch_size=1,
                 save_freq=10, stop_threshold=None, stop_patience=50,
                 learning_rate=1e-3, lr_decay=None, lr_decay_factor=None,
                 grad_clip=None, hard_clip=False,
                 **kwargs):
        """
        Initialize the Optimizer.

        Parameters
        ----------
        dataset : Dataset
            The :class:`opendeep.data.Dataset` to use when training the Model.
        loss : Loss
            The :class:`opendeep.optimization.loss.Loss` function to compare the model to a 'target' result.
        model : Model
            The :class:`opendeep.models.Model` to train. Needed if the Optimizer isn't being passed to a
            Model's .train() method.
        epochs : int
            How many training iterations over the dataset to go.
        batch_size : int
            How many examples from the training dataset to use in parallel.
        min_batch_size : int
            The minimum number of examples required at a time (for things like time series, this would be > 1).
        save_freq : int, optional
            How many epochs to train between each new save of the Model's parameters.
        stop_threshold : float, optional
            The factor by how much the best validation training score needs to improve to determine early stopping.
        stop_patience : int, optional
            The patience or number of epochs to wait after the stop_threshold has been reached before stopping.
        learning_rate : float
            The multiplicative amount to adjust parameters based on their gradient values.
        lr_decay : str
            The decay function to use for changing the learning rate over epochs. See
            `opendeep.utils.decay` for classes of decay and documentation.
        lr_decay_factor : float
            The amount of decay to use for the ``lr_decay`` type of decay.
        grad_clip : float, optional
            Whether to clip gradients. This will clip the norm of the gradients either with a hard cutoff or rescaling.
        hard_clip : bool
            Whether to use a hard cutoff or rescaling for clipping gradients.
        """
        log.info("Initializing optimizer %s", str(self.__class__.__name__))

        # Deal with early stopping None initializations (no early stopping).
        if not stop_threshold:
            stop_threshold = numpy.inf
        if not save_freq:
            save_freq = 1000000
        if not stop_patience:
            stop_patience = 1

        # Put all init parameters in self.args so we can log the initial configuration.
        self.args = locals().copy()
        self.args.pop('self')
        kwargs = self.args.pop('kwargs')
        self.args = add_kwargs_to_dict(kwargs, self.args)
        # log the arguments
        log.info("Optimizer config args: %s", str(self.args))
        # if the optimizer wasn't initialized with a Model (train() being called from the model class itself),
        # just return. (This seems kinda hacky but hey, people wanted .train() to happen from Model and there
        # wasn't really a better way unless the epoch looping logic was in that method for Model. That wasn't
        # the best option because other methods besides stochastic ones can exist for optimizers in the future.
        # TODO: fix this up - feels like a hack just to make model.train() work...
        if not model:
            return
        # Otherwise, things are proceeding as normal. Carry on...

        assert isinstance(model, Model), "Optimizer input model needs to be a Model class! " \
                                         "Found %s" % str(model.__class__.__name__)
        assert isinstance(dataset, Dataset), "Optimizer input dataset needs to be a Dataset class! " \
                                             "Found %s" % str(dataset.__class__.__name__)
        # deal with loss expression/targets
        if loss is not None:
            assert isinstance(loss, Loss), "Optimizer input loss needs to be a Loss class! " \
                                           "Found %s" % str(loss.__class__.__name__)
        if isinstance(loss, Loss):
            self.loss_targets = loss.get_targets()
            self.loss_expression = loss.get_loss()
        else:
            assert model.get_loss() is not None, "No Loss specified, and the model does not have one implemented."
            if isinstance(model.get_loss(), tuple):
                self.loss_targets = raise_to_list(model.get_loss()[0])
                self.loss_expression = model.get_loss()[1]
            else:
                self.loss_targets = None
                self.loss_expression = model.get_loss()

        model_inputs = raise_to_list(model.get_inputs())
        n_model_inputs = len(model_inputs)

        model_targets = self.loss_targets or []
        for input in model_inputs:
            if input in model_targets:
                model_targets.remove(input)

        n_model_targets = len(model_targets)
        self.unsupervised = (n_model_targets is 0)
        # make sure the number of inputs/targets matches up with the dataset properties
        # train
        assert n_model_inputs == len(raise_to_list(dataset.train_inputs)), \
            "Dataset has %d train inputs, while model expects %d" % \
            (len(raise_to_list(dataset.train_inputs)), n_model_inputs)
        if not self.unsupervised:
            assert n_model_targets == len(raise_to_list(dataset.train_targets) or []), \
                "Dataset has %d train targets, while model expects %d" % \
                (len(raise_to_list(dataset.train_targets) or []), n_model_targets)
        # valid
        if dataset.valid_inputs is not None:
            assert n_model_inputs == len(raise_to_list(dataset.valid_inputs)), \
                "Dataset has %d valid inputs, while model expects %d" % \
                (len(raise_to_list(dataset.valid_inputs)), n_model_inputs)
            if not self.unsupervised:
                assert n_model_targets == len(raise_to_list(dataset.valid_targets) or []), \
                    "Dataset has %d valid targets, while model expects %d" % \
                    (len(raise_to_list(dataset.valid_targets) or []), n_model_targets)
        # test
        if dataset.test_inputs is not None:
            assert n_model_inputs == len(raise_to_list(dataset.test_inputs)), \
                "Dataset has %d test inputs, while model expects %d" % \
                (len(raise_to_list(dataset.test_inputs)), n_model_inputs)
            if not self.unsupervised:
                assert n_model_targets == len(raise_to_list(dataset.test_targets) or []), \
                    "Dataset has %d test targets, while model expects %d" % \
                    (len(raise_to_list(dataset.test_targets) or []), n_model_targets)

        # now we are happy, we can add them to `self`
        self.model = model
        self.dataset = dataset
        self.loss = loss

        # Learning rate - how drastic of a step do the parameters change
        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        # whether to scale individual model parameters' learning rates.
        self.lr_scalers = self.model.get_lr_scalers()
        # whether to decay
        if lr_decay:
            self.learning_rate_decay = get_decay_function(lr_decay,
                                                          self.learning_rate,
                                                          learning_rate,
                                                          lr_decay_factor)
        else:
            self.learning_rate_decay = False

        # rest of initial parameters needed for training.
        self.batch_size = batch_size
        self.min_batch_size = min_batch_size
        self.n_epoch = epochs
        self.save_frequency = save_freq
        self.early_stop_threshold = stop_threshold
        self.early_stop_length = stop_patience
        self.grad_clip = grad_clip
        self.hard_clip = hard_clip

    def get_updates(self, gradients):
        """
        This returns the parameter updates to use during training. It defaults to only using (annealed) learning rate.

        Parameters
        ----------
        gradients : dict
            A dictionary mapping from the model's parameters to their gradients.

        Returns
        -------
        updates : OrderdDict
            A dictionary mapping from the old model parameters, to their new
            values after a single iteration of the learning rule.
        """
        log.debug('Setting up Stochastic Gradient Descent for optimizer...')
        updates = OrderedDict()
        for (param, gradient) in six.iteritems(gradients):
            scaled_lr = self.learning_rate * self.lr_scalers.get(param, 1.)
            updates[param] = param - scaled_lr * gradient
        return updates

    def train(self, monitor_channels=None, train_outservice=None, plot=None):
        """
        This method performs the training!!!
        It is an online training method that goes over minibatches from the dataset for a number of epochs,
        updating parameters after each minibatch.

        You can disrupt training with a KeyBoardInterrupt and it should exit/save parameters gracefully.

        Parameters
        ----------
        monitor_channels : list(MonitorsChannel or Monitor), optional
            The list of channels or monitors containing monitor expressions/variables to compile and evaluate
            on the data.
        train_outservice : OutService, optional
            The OutService to use for the automatically created train_cost monitor. Default of None just outputs
            to logs.
        plot : Plot, optional
            The Plot object to use if we want to graph the outputs (uses bokeh server).
        """
        if not self.model:
            log.error("No self.model for the Optimizer!")
            raise AssertionError("Needs to be initialized with a Model! (Or something went wrong if train() "
                                 "was called from the Model. Try initializing the Optimizer with the model param "
                                 "and calling optimizer.train().")

        #########################
        # gradients and updates #
        #########################
        # grab the model parameters to use during training
        self.params = self.model.get_params()
        # Now create the training cost function for the model to use while training - update parameters
        # gradient!
        gradients = grad(cost=self.loss_expression, wrt=list(self.params.values()))
        # now create the dictionary mapping the parameter with its gradient
        gradients = OrderedDict(
            [(param, g) for param, g in zip(list(self.params.values()), gradients)]
        )
        # clip gradients if we want.
        gradients = clip_gradients(gradients, self.grad_clip, self.hard_clip)

        # Calculate the optimizer updates each run
        # This is where the magic happens for a lot of sub-implementations of SGD!
        # It tells how to update the params each training epoch
        gradient_updates = self.get_updates(gradients)

        # Combine the updates from the model also if applicable
        updates = self.model.get_updates()
        if updates:
            updates.update(gradient_updates)
        else:
            updates = gradient_updates

        log.info("%s params: %s", self.model._classname, str(list(self.params.keys())))

        ############
        # monitors #
        ############
        # deal with the monitor channels if they were given (or take them from the plot)
        if monitor_channels is None and plot is not None and len(plot.channels) > 0:
            monitor_channels = plot.channels
        self.train_monitors_dict = {}
        self.valid_monitors_dict = {}
        self.test_monitors_dict = {}
        self.train_monitors_outservice_dict = {}
        self.valid_monitors_outservice_dict = {}
        self.test_monitors_outservice_dict = {}
        if monitor_channels:
            # collapse the appropriate monitors into their (name, expression, out_service) tuples
            train_collapsed = collapse_channels(monitor_channels, train=True)
            valid_collapsed = collapse_channels(monitor_channels, valid=True)
            test_collapsed  = collapse_channels(monitor_channels, test=True)
            # get name: expression dictionary
            self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
            self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
            self.test_monitors_dict  = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
            # get name: outservice dictionary
            self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
            self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
            self.test_monitors_outservice_dict  = OrderedDict([(name, out) for name, _, out in test_collapsed])
        # finally deal with an outservice provided to monitor training cost
        self.train_outservice = train_outservice
        # remove redundant files made by the fileservice for the train monitor.
        # TODO: THIS FEELS LIKE A HACK. I don't like it.
        if isinstance(self.train_outservice, FileService):
            os.remove(self.train_outservice.valid_filename)
            os.remove(self.train_outservice.test_filename)

        #######################################
        # compile train and monitor functions #
        #######################################
        function_input = raise_to_list(self.model.get_inputs())
        if self.loss_targets is not None:
            function_input += self.loss_targets
        # Compile the training function!
        log.info('Compiling f_learn function for model %s...', self.model._classname)
        t = time.time()

        f_learn = function(inputs=function_input,
                           updates=updates,
                           outputs=[self.loss_expression] + list(self.train_monitors_dict.values()),
                           name='f_learn')

        log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))

        # figure out if we want valid and test (monitors)
        self.valid_flag = (self.dataset.valid_inputs is not None) and (len(self.valid_monitors_dict) > 0)
        self.test_flag = (self.dataset.test_inputs is not None) and (len(self.test_monitors_dict) > 0)
        # Now compile the monitor functions!
        log.debug("Compiling monitor functions...")
        monitor_t = time.time()
        # valid monitors
        if self.valid_flag:
            self.valid_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.valid_monitors_dict.values()),
                name='valid_monitor_function'
            )
        else:
            self.valid_monitor_function = None

        # test monitors
        if self.test_flag:
            self.test_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.test_monitors_dict.values()),
                name='test_monitor_function'
            )
        else:
            self.test_monitor_function = None

        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

        ##################
        # start training #
        ##################
        log.info("-----------TRAINING %s FOR %d EPOCHS-----------",
                 self.model._classname, self.n_epoch)

        self.STOP = False
        self.epoch_counter = 0
        # reset any decay params
        for decay_param in self.get_decay_params():
            decay_param.reset()

        self.times = []
        self.best_cost = numpy.inf
        self.best_params = None
        self.patience = 0

        t = time.time()

        while not self.STOP:
            try:
                self.STOP = self._perform_one_epoch(f_learn, plot)
            except KeyboardInterrupt:
                log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                self.STOP = True

        # save params
        if self.best_params is not None:
            log.debug("Restoring best model parameters...")
            for best_param, param_value in self.best_params.items():
                self.params[best_param].set_value(param_value, borrow=False)
        log.debug("Saving model parameters...")
        self.model.save_params('trained_epoch_' + str(self.epoch_counter))

        log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))

    def _perform_one_epoch(self, f_learn, plot=None):
        """
        Performs a single training iteration with the given learn function.
        """
        self.epoch_counter += 1
        t = time.time()
        log.info('EPOCH %s', str(self.epoch_counter))

        # set the noise switches on for training function! (this is where things like dropout happen)
        if not self.model.switches_on:
            self.model.turn_on_switches()

        #########
        # train #
        #########
        train_costs = []
        train_monitors = {key: [] for key in self.train_monitors_dict.keys()}
        train_data = [
            minibatch(input_data, self.batch_size, self.min_batch_size)
            for input_data in raise_to_list(self.dataset.train_inputs)
            ]
        if self.dataset.train_targets is not None and not self.unsupervised:
            train_data += [
                minibatch(target, self.batch_size, self.min_batch_size)
                for target in raise_to_list(self.dataset.train_targets)
                ]

        for batch in min_normalized_izip(*train_data):
            _outs = raise_to_list(f_learn(*batch))
            train_costs.append(_outs[0])
            # handle any user defined monitors
            if len(train_monitors) > 0:
                current_monitors = zip(self.train_monitors_dict.keys(), _outs[1:])
                for name, val in current_monitors:
                    val = numpy.asarray(val)
                    train_monitors[name].append(val)

        # get the mean values for the batches
        mean_train = numpy.mean(train_costs, 0)
        current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in train_monitors.items()}
        # log the mean values!
        log.info('Train cost: %s', trunc(mean_train))
        if len(current_mean_monitors) > 0:
            log.info('Train monitors: %s', str(current_mean_monitors))
        # send the values to their outservices
        if self.train_outservice:
            self.train_outservice.write(mean_train, "train")
        for name, service in self.train_monitors_outservice_dict.items():
            if name in current_mean_monitors and service:
                service.write(current_mean_monitors[name], "train")
        # if there is a plot, also send them over!
        if plot:
            current_mean_monitors.update({TRAIN_COST_KEY: mean_train})
            plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)

        # set the noise switches off for valid and test sets! we assume unseen data is noisy anyway :)
        if self.model.switches_on:
            self.model.turn_off_switches()

        #########
        # valid #
        #########
        self._compute_over_subset("valid", self.dataset.valid_inputs, self.dataset.valid_targets,
                                  self.valid_monitors_dict, self.valid_monitor_function,
                                  self.valid_monitors_outservice_dict, plot)

        ########
        # test #
        ########
        self._compute_over_subset("test", self.dataset.test_inputs, self.dataset.test_targets,
                                  self.test_monitors_dict, self.test_monitor_function,
                                  self.test_monitors_outservice_dict, plot)

        ###########
        # cleanup #
        ###########
        # check for early stopping on train costs
        cost = numpy.sum(train_costs)
        # if the cost improved, reset the patience and record the best cost.
        if cost < self.best_cost * self.early_stop_threshold:
            self.patience = 0
            self.best_cost = cost
            # save the parameters that made it the best
            self.best_params = {key: param.get_value(borrow=False) for key, param in self.params.items()}
        elif not numpy.isnan(cost):
            self.patience += 1

        # check for stopping either from n_epochs or from threshold/patience
        stop = False
        if self.epoch_counter >= self.n_epoch:
            log.info("Stopping (reached max number of epochs)...")
            stop = True
        if self.patience >= self.early_stop_length:
            log.info("Stopping early (reached stop threshold)...")
            stop = True

        timing = time.time() - t
        self.times.append(timing)

        log.info('time: ' + make_time_units_string(timing))

        log.debug('remaining time: ' +
                 make_time_units_string((self.n_epoch - self.epoch_counter) * numpy.mean(self.times)))

        if (self.epoch_counter % self.save_frequency) == 0:
            #save params
            self.model.save_params('trained_epoch_' + str(self.epoch_counter))

        # ANNEAL!
        if not stop:
            # perform the appropriate decay on the decay functions/parameters for this optimizer and model
            for decay_param in self.get_decay_params():
                decay_param.decay()

        # return whether or not to stop this epoch
        return stop

    def _compute_over_subset(self, subset, inputs, targets,
                             monitors_dict, monitor_function, monitors_outservice_dict,
                             plot):
        inputs = raise_to_list(inputs)
        targets = raise_to_list(targets)
        if inputs is not None and len(monitors_dict) > 0:
            monitors = {key: [] for key in monitors_dict.keys()}
            data = [minibatch(input, self.batch_size, self.min_batch_size) for input in inputs]
            if targets is not None and not self.unsupervised:
                data += [minibatch(target, self.batch_size, self.min_batch_size) for target in targets]

            for batch in min_normalized_izip(*data):
                _outs = raise_to_list(monitor_function(*batch))
                current_monitors = zip(monitors_dict.keys(), _outs)
                for name, val in current_monitors:
                    val = numpy.asarray(val)
                    monitors[name].append(val)

            # get the mean values for the batches
            current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in monitors.items()}
            # log the mean values!
            log.info('%s monitors: %s', subset, str(current_mean_monitors))
            # send the values to their outservices
            for name, service in monitors_outservice_dict.items():
                if name in current_mean_monitors and service:
                    service.write(current_mean_monitors[name], subset)
            # if there is a plot, also send them over!
            if plot:
                plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)

    def get_decay_params(self):
        """
        Returns a list of all the Decay objects to decay during training.

        Returns
        -------
        list
            List of Decay objects to use after each training epoch - in this case the
            learning rate decay.
        """
        decay_params = self.model.get_decay_params()
        if hasattr(self, 'learning_rate_decay') and self.learning_rate_decay:
            decay_params.append(self.learning_rate_decay)
        return decay_params
Beispiel #23
0
class Monitor(object):
    """
    A class for monitoring Models while they are being trained.

    A monitor object records the number of minibatches and number of
    examples the model has trained, as well as any number of "channels"
    that track quantities of interest (examples: the objective
    function, measures of hidden unit activity, reconstruction error,
    sum of squared second derivatives, average norm of the weight
    vectors, etc.)

    Parameters
    ----------
    model : `pylearn2.models.model.Model`
    """

    def __init__(self, model):
        self.training_succeeded = False
        self.model = model
        self.channels = OrderedDict()
        self._num_batches_seen = 0
        self._examples_seen = 0
        self._epochs_seen = 0
        self._datasets = []
        self._iteration_mode = []
        self._batch_size = []
        self._num_batches = []
        self._dirty = True
        self._rng_seed = []
        self.names_to_del = ['theano_function_mode']
        self.t0 = time.time()
        self.theano_function_mode = None

        # Initialize self._nested_data_specs, self._data_specs_mapping,
        # and self._flat_data_specs
        self._build_data_specs()

    def _build_data_specs(self):
        """
        Computes a nested data_specs for input and all channels

        Also computes the mapping to flatten it. This function is
        called from redo_theano.
        """
        # Ask the model what it needs
        m_space, m_source = self.model.get_monitoring_data_specs()
        input_spaces = [m_space]
        input_sources = [m_source]
        for channel in self.channels.values():
            space = channel.data_specs[0]
            assert isinstance(space, Space)
            input_spaces.append(space)
            input_sources.append(channel.data_specs[1])

        nested_space = CompositeSpace(input_spaces)
        nested_source = tuple(input_sources)

        self._nested_data_specs = (nested_space, nested_source)
        self._data_specs_mapping = DataSpecsMapping(self._nested_data_specs)

        flat_space = self._data_specs_mapping.flatten(nested_space,
                                                      return_tuple=True)
        flat_source = self._data_specs_mapping.flatten(nested_source,
                                                       return_tuple=True)
        self._flat_data_specs = (CompositeSpace(flat_space), flat_source)

    def set_theano_function_mode(self, mode):
        """
        .. todo::

            WRITEME

        Parameters
        ----------
        mode : theano.compile.Mode
            Theano functions for the monitoring channels will be
            compiled and run using this mode.
        """
        if self.theano_function_mode != mode:
            self._dirty = True
            self.theano_function_mode = mode

    def add_dataset(self, dataset, mode='sequential', batch_size=None,
                    num_batches=None, seed=None):
        """
        Determines the data used to calculate the values of each channel.

        Parameters
        ----------
        dataset : object
            A `pylearn2.datasets.Dataset` object.
        mode : str or object, optional
            Iteration mode; see the docstring of the `iterator` method
            on `pylearn2.datasets.Dataset` for details.
        batch_size : int, optional
            The size of an individual batch. Optional if `mode` is
            'sequential' and `num_batches` is specified (batch size
            will be calculated based on full dataset size).
        num_batches : int, optional
            The total number of batches. Unnecessary if `mode` is
            'sequential' and `batch_size` is specified (number of
            batches will be calculated based on full dataset size).
        seed : int, optional
            Optional. The seed to be used for random iteration modes.
        """
        # The user can ommit using lists if only one dataset is set
        if not isinstance(dataset, list):
            dataset = [dataset]
        if not isinstance(mode, list):
            mode = [mode]
        if not isinstance(batch_size, list):
            batch_size = [batch_size]
        if not isinstance(num_batches, list):
            num_batches = [num_batches]
        if seed is None:
            seed = [None] * len(dataset)
        if not isinstance(seed, list):
            seed = [seed]
        if len(mode) != len(dataset):
            raise ValueError("Received " + str(len(dataset)) +
                             " dataset but " + str(len(mode)) + " modes.")
        if any([len(l) != len(dataset) for l in [batch_size, seed]]):
            raise ValueError("make sure each dataset has its iteration " +
                             "batch size and number of batches.")
        for (d, m, b, n, sd) in safe_izip(dataset, mode, batch_size,
                                          num_batches, seed):
            try:
                it = d.iterator(mode=m,
                                batch_size=b,
                                num_batches=n,
                                data_specs=self._flat_data_specs,
                                return_tuple=True,
                                rng=sd)
            except ValueError as exc:
                reraise_as(ValueError("invalid iteration parameters in " +
                                      "Monitor.add_dataset: " + str(exc)))
            if it.stochastic:
                # Must be a seed, not a random number generator. If it were a
                # random number generator, different iterators using it would
                # update its state, so we would not get the same iterator
                # each time. Also, must not be None, because this makes the
                # iterator pick a seed based on the clock
                if sd is None:
                    raise TypeError("Monitor requires a seed when using " +
                                    "stochastic iteration modes.")
                if not isinstance(sd, (list, tuple, int)):
                    raise TypeError("Monitor requires a seed (not a random " +
                                    "number generator) when using " +
                                    "stochastic iteration modes.")
            else:
                # The iterator should catch this, but let's double-check
                assert sd is None

            if not d in self._datasets:
                self._datasets.append(d)
                self._iteration_mode.append(m)
                self._batch_size.append(b)
                self._num_batches.append(n)
                self._rng_seed.append(sd)

    def __call__(self):
        """
        Runs the model on the monitoring dataset in order to add one
        data point to each of the channels.
        """

        # If the channels have changed at all, we need to recompile the theano
        # functions used to compute them
        if self._dirty:
            self.redo_theano()

        datasets = self._datasets

        # Set all channels' val_shared to 0
        self.begin_record_entry()
        for d, i, b, n, a, sd, ne in safe_izip(datasets,
                                               self._iteration_mode,
                                               self._batch_size,
                                               self._num_batches,
                                               self.accum,
                                               self._rng_seed,
                                               self.num_examples):
            if isinstance(d, basestring):
                d = yaml_parse.load(d)
                raise NotImplementedError()

            # need to put d back into self._datasets
            myiterator = d.iterator(mode=i,
                                    batch_size=b,
                                    num_batches=n,
                                    data_specs=self._flat_data_specs,
                                    return_tuple=True,
                                    rng=sd)

            # If self._flat_data_specs is empty, no channel needs data,
            # so we do not need to call the iterator in order to average
            # the monitored values across different batches, we only
            # have to call them once.
            if len(self._flat_data_specs[1]) == 0:
                X = ()
                self.run_prereqs(X, d)
                a(*X)

            else:
                actual_ne = 0
                for X in myiterator:
                    # X is a flat (not nested) tuple
                    self.run_prereqs(X, d)
                    a(*X)
                    actual_ne += self._flat_data_specs[0].np_batch_size(X)
                # end for X
                if actual_ne != ne:
                    raise RuntimeError("At compile time, your iterator said "
                                       "it had %d examples total, but at "
                                       "runtime it gave us %d." %
                                       (ne, actual_ne))
        # end for d

        log.info("Monitoring step:")
        log.info("\tEpochs seen: %d" % self._epochs_seen)
        log.info("\tBatches seen: %d" % self._num_batches_seen)
        log.info("\tExamples seen: %d" % self._examples_seen)
        t = time.time() - self.t0
        for channel_name in sorted(self.channels.keys(),
                                   key=number_aware_alphabetical_key):
            channel = self.channels[channel_name]
            channel.time_record.append(t)
            channel.batch_record.append(self._num_batches_seen)
            channel.example_record.append(self._examples_seen)
            channel.epoch_record.append(self._epochs_seen)
            val = channel.val_shared.get_value()
            channel.val_record.append(val)
            # TODO: use logging infrastructure so that user can configure
            # formatting
            if abs(val) < 1e4:
                val_str = str(val)
            else:
                val_str = '%.3e' % val

            log.info("\t%s: %s" % (channel_name, val_str))

    def run_prereqs(self, data, dataset):
        """
        Runs all "prerequistie functions" on a batch of data. Always
        called right before computing the monitoring channels on that
        batch.

        Parameters
        ----------
        data : tuple or Variable
            a member of the Space used as input to the monitoring
            functions
        dataset : Dataset
            the Dataset the data was drawn from
        """
        if dataset not in self.prereqs:
            return
        for prereq in self.prereqs[dataset]:
            prereq(*data)

    def get_batches_seen(self):
        """
        Returns the number of batches the model has learned on
        (assuming that the learning code has been calling
        Monitor.report_batch correctly).
        """
        return self._num_batches_seen

    def get_epochs_seen(self):
        """
        .. todo::

            WRITEME

        Returns
        -------
        epochs_seen : int
            The number of epochs the model has been trained on.
            One "epoch" is one pass through Dataset.iterator.
        """
        return self._epochs_seen

    def get_examples_seen(self):
        """
        .. todo::

            WRITEME

        Returns
        -------
        examples_seen : int
            The number of examples the model has learned on (assuming
            that the learning code has been calling Monitor.report_batch
            correctly)
        """
        return self._examples_seen

    def report_batch(self, num_examples):
        """
        Call this whenever the model has learned on another batch of
        examples. Report how many examples were learned on.

        Parameters
        ----------
        num_examples : int
            The number of examples learned on in this minibatch.
        """
        self._examples_seen += num_examples
        self._num_batches_seen += 1

    def report_epoch(self):
        """
        Call this whenever the model has completed another "epoch" of
        learning. We regard one pass through Dataset.iterator as one
        epoch.
        """
        self._epochs_seen += 1

    def redo_theano(self):
        """
        Recompiles Theano functions used by this monitor.

        This is called any time we need to evaluate the channels and
        the channel definitions have changed since last we called it,
        or if the theano functions are unavailable for any other reason
        (first time they are needed after construction or
        deserialization, etc.)

        All channels are compiled as part of the same theano function
        so that the theano optimizations can eliminate subexpressions
        that are shared between multiple channels.
        """
        self._dirty = False

        # Recompute the data specs, since the channels may have changed.
        self._build_data_specs()

        init_names = dir(self)
        self.prereqs = OrderedDict()
        for channel in self.channels.values():
            if channel.prereqs is not None:
                dataset = channel.dataset
                if dataset not in self.prereqs:
                    self.prereqs[dataset] = []
                prereqs = self.prereqs[dataset]
                for prereq in channel.prereqs:
                    if prereq not in prereqs:
                        prereqs.append(prereq)

        updates = OrderedDict()
        for channel in self.channels.values():
            updates[channel.val_shared] = np.cast[config.floatX](0.0)
        with log_timing(log, "compiling begin_record_entry"):
            self.begin_record_entry = function(
                inputs=[],
                updates=updates,
                mode=self.theano_function_mode,
                name='Monitor.begin_record_entry'
            )
        updates = OrderedDict()
        givens = OrderedDict()
        # Get the appropriate kind of theano variable to represent the data
        # the model acts on
        batch_names = ['monitoring_%s' % s for s in self._flat_data_specs[1]]
        theano_args = self._flat_data_specs[0].make_theano_batch(batch_names)

        # Get a symbolic expression of the batch size
        # We do it here, rather than for each channel, because channels with an
        # empty data_specs do not use data, and are unable to extract the batch
        # size. The case where the whole data specs is empty is not supported.
        batch_size = self._flat_data_specs[0].batch_size(theano_args)

        # Also get a nested representation, for joint iteration
        # with each of channel.graph_input
        nested_theano_args = self._data_specs_mapping.nest(theano_args)
        if not isinstance(nested_theano_args, tuple):
            nested_theano_args = (nested_theano_args,)
        assert len(nested_theano_args) == (len(self.channels) + 1)

        log.info('Monitored channels: ')
        for key in sorted(self.channels.keys()):
            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                mode.record.handle_line('compiling monitor including ' +
                                        'channel ' + key + '\n')
            log.info('\t%s' % key)
        it = [d.iterator(mode=i, num_batches=n, batch_size=b,
                         data_specs=self._flat_data_specs,
                         return_tuple=True)
              for d, i, n, b in safe_izip(self._datasets, self._iteration_mode,
                                          self._num_batches, self._batch_size)]
        self.num_examples = [np.cast[config.floatX](float(i.num_examples))
                             for i in it]
        self.num_examples = [float(i.num_examples) for i in it]

        givens = [OrderedDict() for d in self._datasets]
        updates = [OrderedDict() for d in self._datasets]
        for i, channel in enumerate(self.channels.values()):
            index = self._datasets.index(channel.dataset)
            d = self._datasets[index]
            g = givens[index]
            inv_cur_num_examples = as_floatX(1./self.num_examples[index])
            u = updates[index]

            # Flatten channel.graph_input and the appropriate part of
            # nested_theano_args, to iterate jointly over them.
            c_mapping = DataSpecsMapping(channel.data_specs)
            channel_inputs = c_mapping.flatten(channel.graph_input,
                                               return_tuple=True)
            inputs = c_mapping.flatten(nested_theano_args[i + 1],
                                       return_tuple=True)

            for (channel_X, X) in safe_izip(channel_inputs, inputs):
                assert channel_X not in g or g[channel_X] is X
                assert channel_X.type == X.type, (channel_X.type, X.type)
                g[channel_X] = X

            if batch_size == 0:
                # No channel does need any data, so there is not need to
                # average results, and we will call the accum functions only
                # once.
                # TODO: better handling of channels not needing data when
                # some other channels need data.
                assert len(self._flat_data_specs[1]) == 0
                val = channel.val
            else:
                if n == 0:
                    raise ValueError("Iterating over 0 examples results in " +
                                     "divide by 0")
                val = (channel.val * T.cast(batch_size, config.floatX)
                       * inv_cur_num_examples)
            u[channel.val_shared] = channel.val_shared + val

        with log_timing(log, "Compiling accum"):
            # Check type of update expressions
            for up in updates:
                for key in up:
                    if key.dtype != up[key].dtype:
                        raise TypeError('Monitoring channel shared variable ' +
                                        key.name + ' has dtype ' + key.dtype +
                                        ' but is driven by an expression ' +
                                        'with type ' + up[key].dtype)

            self.accum = []
            for idx, packed in enumerate(safe_izip(givens, updates)):
                g, u = packed
                mode = self.theano_function_mode
                if mode is not None and hasattr(mode, 'record'):
                    for elem in g:
                        mode.record.handle_line('g key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('g val ' +
                                                var_descriptor(g[elem]) + '\n')
                    for elem in u:
                        mode.record.handle_line('u key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('u val ' +
                                                var_descriptor(u[elem]) + '\n')
                function_name = 'Monitor.accum[%d]' % idx
                if mode is not None and hasattr(mode, 'record'):
                    mode.record.handle_line('compiling supervised accum\n')
                # Some channels may not depend on the data, ie, they might just
                # monitor the model parameters, or some shared variable updated
                # by the training algorithm, so we need to ignore the unused
                # input error
                self.accum.append(function(theano_args,
                                           givens=g,
                                           updates=u,
                                           mode=self.theano_function_mode,
                                           name=function_name))
            for a in self.accum:
                if mode is not None and hasattr(mode, 'record'):
                    for elem in a.maker.fgraph.outputs:
                        mode.record.handle_line('accum output ' +
                                                var_descriptor(elem) + '\n')
                log.info("graph size: %d" % len(a.maker.fgraph.toposort()))
        final_names = dir(self)
        self.register_names_to_del([name for name in final_names
                                    if name not in init_names])

    def register_names_to_del(self, names):
        """
        Register names of fields that should be deleted before pickling.

        Parameters
        ----------
        names : list
            A list of attribute names as strings.
        """
        for name in names:
            if name not in self.names_to_del:
                self.names_to_del.append(name)

    def __getstate__(self):
        """
        In order to avoid pickling a copy of the dataset whenever a
        monitor is saved, the __getstate__ method replaces the dataset
        field with the dataset's yaml source. This is not a perfect
        solution because it won't work with job resuming, which would
        require saving the state of the dataset's random number
        generator.

        Like in the Model class, we also need to avoid saving any
        Theano functions, so we delete everything that can be
        regenerated with `redo_theano` by deleting the fields in
        `self.names_to_del`
        """

        # Patch old pickled monitors
        if not hasattr(self, '_datasets'):
            self._datasets = [self._dataset]
            del self._dataset

        temp = self._datasets

        if self._datasets:
            self._datasets = []
            for dataset in temp:
                if isinstance(dataset, basestring):
                    self._datasets.append(dataset)
                else:
                    try:
                        self._datasets.append(dataset.yaml_src)
                    except AttributeError:
                        warnings.warn('Trained model saved without ' +
                                      'indicating yaml_src')
        d = copy.copy(self.__dict__)
        self._datasets = temp
        for name in self.names_to_del:
            if name in d:
                del d[name]

        return d

    def __setstate__(self, d):
        """
        Sets the object to have the state described by `d`.

        Parameters
        ----------
        d : dict
            A dictionary mapping string names of fields to values for
            these fields.
        """
        # patch old pkl files
        if '_dataset' in d:
            d['_datasets'] = [d['_dataset']]
            del d['_dataset']

        self.__dict__.update(d)

    def add_channel(self, name, ipt, val, dataset=None, prereqs=None,
                    data_specs=None):
        """
        Asks the monitor to start tracking a new value.  Can be called
        even after the monitor is already in use.

        Parameters
        ----------
        name : str
            The display name in the monitor.
        ipt : tensor_like
            The symbolic tensor which should be clamped to the data.
            (or a list/tuple containing symbolic tensors, following the
            data_specs)
        val : tensor_like
            The value (function of `ipt`) to be tracked.
        dataset : pylearn2.datasets.Dataset
            Which dataset to compute this channel on
        prereqs : list of callables that take a list of numpy tensors
            Each prereq must be called exactly once per each new batch
            of data drawn *from dataset* before the channel value is
            computed if two channels provide a prereq with exactly the
            same id, that prereq will only be called once
        data_specs : (space, source) pair
            Identifies the order, format and semantics of ipt
        """
        if isinstance(val, (float, int, long)):
            val = np.cast[theano.config.floatX](val)

        val = T.as_tensor_variable(val)

        if data_specs is None:
            warnings.warn("parameter 'data_specs' should be provided when " +
                          "calling add_channel. We will build a default one.",
                          stacklevel=2)
            if isinstance(ipt, list):
                ipt = tuple(ipt)
            if ipt is not None and not isinstance(ipt, tuple):
                ipt = (ipt,)

            if ipt is None:
                data_specs = (NullSpace(), '')
            elif len(ipt) == 0:
                data_specs = (CompositeSpace([]), ())
            elif hasattr(dataset, 'get_data_specs'):
                dataset_space, dataset_source = dataset.get_data_specs()
                if (len(ipt) == 1 and
                        dataset_source is not None and
                        (not isinstance(dataset_source, tuple) or
                            len(dataset_source) == 1) and
                        'features' in dataset_source):
                    data_specs = (dataset_space, dataset_source)
                elif (len(ipt) == 2 and
                        dataset_source == ('features', 'targets')):
                    data_specs = (dataset_space, dataset_source)
                else:
                    raise ValueError("Cannot infer default data_specs for " +
                                     "the following input points and " +
                                     "dataset: ipt = %s, dataset = %s"
                                     % (ipt, dataset))

        data_specs[0].validate(ipt)

        mapping = DataSpecsMapping(data_specs)
        flat_ipt = mapping.flatten(ipt)
        if not isinstance(flat_ipt, tuple):
            flat_ipt = (flat_ipt,)
        inputs = theano.gof.graph.inputs([val])
        for elem in inputs:
            if not hasattr(elem, 'get_value') and \
               not isinstance(elem, theano.gof.graph.Constant):
                if elem not in flat_ipt:
                    raise ValueError("Unspecified input: " + str(elem) +
                                     ". This may be due to an incorrect " +
                                     "implementation of a cost's " +
                                     "get_data_specs() method, or of a " +
                                     "model's get_monitoring_data_specs() " +
                                     "method.")

        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            mode.record.handle_line('Adding monitor channel '+name+'\n')
            assert isinstance(flat_ipt, tuple)
            if len(flat_ipt) != 1:
                for elem in flat_ipt:
                    mode.record.handle_line('Includes input var ' +
                                            var_descriptor(elem) + '\n')
            else:
                mode.record.handle_line(name + ' input var is ' +
                                        var_descriptor(flat_ipt[0]) + '\n')
            mode.record.handle_line('channel ' + name + ' is ' +
                                    var_descriptor(val) + '\n')

        if dataset is None:
            if len(self._datasets) == 1:
                dataset = self._datasets[0]
            elif len(self._datasets) == 0:
                raise ValueError(_err_no_data)
            else:
                raise ValueError(_err_ambig_data)

        try:
            self._datasets.index(dataset)
        except ValueError:
            reraise_as(ValueError("The dataset specified is not one of the " +
                                  "monitor's datasets"))

        if name in self.channels:
            raise ValueError("Tried to create the same channel twice (%s)" %
                             name)

        self.channels[name] = MonitorChannel(ipt, val, name, data_specs,
                                             dataset, prereqs)
        self._dirty = True

    def _sanity_check(self):
        """
        Sometimes we serialize models and then load them somewhere else
        but still try to use their Monitor, and the Monitor is in a
        mangled state. I've added some calls to _sanity_check to try to
        catch when that happens. Not sure what to do for a long term
        fix. I think it requires making theano graphs serializable
        first.
        """
        for name in self.channels:
            channel = self.channels[name]
            assert hasattr(channel, 'prereqs')

    @classmethod
    def get_monitor(cls, model):
        """
        Returns a model's monitor. If the model doesn't have a monitor
        yet, installs one and returns that.

        Parameters
        ----------
        model : object
            An object that implements the `Model` interface specified
            in `pylearn2.models`.
        """

        if hasattr(model, 'monitor'):
            rval = model.monitor
            rval._sanity_check()
        else:
            rval = Monitor(model)
            model.monitor = rval

        return rval

    # TODO: find out if this method is used anywhere, remove if not.
    @property
    def batch_size(self):
        """
        .. todo::

            WRITEME

        Returns
        -------
        batch_size : int
            The size of the batches used for monitoring
        """
        return self._batch_size

    # TODO: find out if this method is used anywhere, remove if not.
    @property
    def num_batches(self):
        """
        .. todo::

            WRITEME

        Returns
        -------
        num_batches : int
            The number of batches used for monitoring
        """
        return self._num_batches

    def setup(self, dataset, cost, batch_size, num_batches=None,
              extra_costs=None, mode='sequential', obj_prereqs=None,
              cost_monitoring_args=None):
        """
        Sets up the monitor for a cost minimization problem.
        Adds channels defined by both the model and the cost for
        the specified dataset(s), as well as a channel called
        'objective' defined by the costs' __call__ method.

        Parameters
        ----------
        dataset : pylearn2.datasets.Dataset
            Dataset or dictionary mapping string names to Datasets.
            If string names are used, then for every dataset, each
            channel defined by the model or cost will be replicated
            with that dataset's name followed by an underscore as the
            prefix. For example, if your cost defines a channel called
            'misclass', and datasets is
            {'train' : train_dataset, 'valid' : valid_dataset},
            you will get channels called 'train_misclass' and
            'valid_misclass'.
        cost : pylearn2.costs.Cost
            The cost being optimized by training. The value of the cost
            will appear as the `objective` channel. Its
            `get_monitoring_channels` method will also be used to
            supply other channels.
        extra_costs : OrderedDict, optional
            A dictionary mapping channel names to Cost objects.
            Their value will appear as the specified channel name.
            They will also provide more monitoring channels via their
            `get_monitoring_channels` method.
        obj_prereqs : None, or list of functions
            Functions to pass as prerequisites to the `objective` channel.
        cost_monitoring_args : dict
            Dictionary of kwargs that will be passed to
            `cost.get_monitoring_channels()`
            (but not for the extra_costs).
        """

        if dataset is None:
            return
        if isinstance(dataset, Dataset):
            dataset = {'': dataset}
        else:
            assert isinstance(dataset, dict)
            assert all(isinstance(key, str) for key in dataset)
            assert all(isinstance(dataset[key], Dataset) for key in dataset)

        if extra_costs is None:
            costs = {}
        else:
            assert isinstance(extra_costs, (OrderedDict, dict))
            costs = extra_costs
        assert '' not in costs
        costs[''] = cost

        if cost_monitoring_args is None:
            cost_monitoring_args = {}

        model = self.model

        # Build a composite data_specs containing the specs for all costs,
        # then the specs of the model
        cost_names = sorted(costs.keys())
        spaces = []
        sources = []
        for c in cost_names:
            c_space, c_source = costs[c].get_data_specs(model)
            spaces.append(c_space)
            sources.append(c_source)

        # Ask the model for the data_specs needed
        m_space, m_source = model.get_monitoring_data_specs()
        spaces.append(m_space)
        sources.append(m_source)

        nested_space = CompositeSpace(spaces)
        nested_sources = tuple(sources)

        # Flatten this data_specs, so we build only one symbolic Theano
        # variable for each of the unique (space, source) pairs.
        mapping = DataSpecsMapping((nested_space, nested_sources))
        space_tuple = mapping.flatten(nested_space, return_tuple=True)
        source_tuple = mapping.flatten(nested_sources, return_tuple=True)
        ipt = tuple(space.make_theano_batch(name='monitor_%s' % source,
                                            batch_size=None)
                    for (space, source) in safe_zip(space_tuple, source_tuple))

        # Build a nested tuple from ipt, to dispatch the appropriate parts
        # of the ipt batch to each cost
        nested_ipt = mapping.nest(ipt)

        custom_channels = {}
        for i, cost_name in enumerate(cost_names):
            if cost_name == '':
                prefix = ''
            else:
                prefix = cost_name + '_'
            cost = costs[cost_name]
            cost_ipt = nested_ipt[i]
            raw_channels = cost.get_monitoring_channels(model, cost_ipt)
            channels = {}
            for name in raw_channels:
                # We need three things: the value itself (raw_channels[name]),
                # the input variables (cost_ipt), and the data_specs for
                # these input variables ((spaces[i], sources[i]))
                channels[prefix + name] = (raw_channels[name],
                                           cost_ipt,
                                           (spaces[i], sources[i]))
            custom_channels.update(channels)

        # Use the last inputs from nested_ipt for the model
        model_channels = model.get_monitoring_channels(nested_ipt[-1])
        channels = {}
        for name in model_channels:
            # Note: some code used to consider that model_channels[name]
            # could be a a (channel, prereqs) pair, this is not supported.
            channels[name] = (model_channels[name],
                              nested_ipt[-1],
                              (spaces[-1], sources[-1]))
        custom_channels.update(channels)

        if is_stochastic(mode):
            seed = [[2013, 02, 22]]
        else:
            seed = None

        for dataset_name in dataset:
            cur_dataset = dataset[dataset_name]
            self.add_dataset(dataset=cur_dataset,
                             mode=mode,
                             batch_size=batch_size,
                             num_batches=num_batches,
                             seed=seed)
            if dataset_name == '':
                dprefix = ''
            else:
                dprefix = dataset_name + '_'
            # These channel name 'objective' must not vary, since callbacks
            # that respond to the values in the monitor use the name to find
            # it.
            for i, cost_name in enumerate(cost_names):
                cost = costs[cost_name]
                cost_ipt = nested_ipt[i]
                cost_value = cost.expr(model, cost_ipt)
                if cost_value is not None:
                    if cost_name == '':
                        name = dprefix + 'objective'
                        prereqs = obj_prereqs
                    else:
                        name = dprefix + cost_name
                        prereqs = None

                    cost.get_data_specs(model)[0].validate(cost_ipt)
                    self.add_channel(name=name,
                                     ipt=cost_ipt,
                                     val=cost_value,
                                     data_specs=cost.get_data_specs(model),
                                     dataset=cur_dataset,
                                     prereqs=prereqs)

            for key in custom_channels:
                val, ipt, data_specs = custom_channels[key]
                data_specs[0].validate(ipt)
                self.add_channel(name=dprefix + key,
                                 ipt=ipt,
                                 val=val,
                                 data_specs=data_specs,
                                 dataset=cur_dataset)
Beispiel #24
0
    def get_gradients(self, model, data):
        """
        PCD approximation to the gradient.
        Keep in mind this is a cost, so we use
        the negative log likelihood.
        """
        self.get_data_specs(model)[0].validate(data)
        if self.supervised:
            X, Y = data
            assert Y is not None
        else:
            X = data

        layer_to_clamp = OrderedDict([(model.visible_layer, True )])
        layer_to_pos_samples = OrderedDict([(model.visible_layer, X)])
        if self.supervised:
            # note: if the Y layer changes to something without linear energy,
            # we'll need to make the expected energy clamp Y in the positive phase
            assert isinstance(model.hidden_layers[-1], dbm.Softmax)
            layer_to_clamp[model.hidden_layers[-1]] = True
            layer_to_pos_samples[model.hidden_layers[-1]] = Y
            hid = model.hidden_layers[:-1]
        else:
            hid = model.hidden_layers

        for layer in hid:
            mf_state = layer.init_mf_state()
            def recurse_zeros(x):
                if isinstance(x, tuple):
                    return tuple([recurse_zeros(e) for e in x])
                return x.zeros_like()
            layer_to_pos_samples[layer] = recurse_zeros(mf_state)

        layer_to_pos_samples = model.mcmc_steps(
                layer_to_state=layer_to_pos_samples,
                layer_to_clamp=layer_to_clamp,
                num_steps=self.num_gibbs_steps,
                theano_rng = self.theano_rng)

        q = [layer_to_pos_samples[layer] for layer in model.hidden_layers]

        pos_samples = flatten(q)

        # The gradients of the expected energy under q are easy, we can just do that in theano
        expected_energy_q = model.energy(X, q).mean()
        params = list(model.get_params())
        gradients = OrderedDict(safe_zip(params, T.grad(expected_energy_q, params,
            consider_constant = pos_samples,
            disconnected_inputs = 'ignore')))

        """
        d/d theta log Z = (d/d theta Z) / Z
                        = (d/d theta sum_h sum_v exp(-E(v,h)) ) / Z
                        = (sum_h sum_v - exp(-E(v,h)) d/d theta E(v,h) ) / Z
                        = - sum_h sum_v P(v,h)  d/d theta E(v,h)
        """

        layer_to_chains = model.make_layer_to_state(self.num_chains)

        def recurse_check(l):
            if isinstance(l, (list, tuple)):
                for elem in l:
                    recurse_check(elem)
            else:
                assert l.get_value().shape[0] == self.num_chains

        recurse_check(layer_to_chains.values())

        model.layer_to_chains = layer_to_chains

        # Note that we replace layer_to_chains with a dict mapping to the new
        # state of the chains
        updates, layer_to_chains = model.get_sampling_updates(layer_to_chains,
                self.theano_rng, num_steps=self.num_gibbs_steps,
                return_layer_to_updated = True)


        if self.toronto_neg:
            # Ruslan Salakhutdinov's undocumented negative phase from
            # http://www.mit.edu/~rsalakhu/code_DBM/dbm_mf.m
            # IG copied it here without fully understanding it, so it
            # only applies to exactly the same model structure as
            # in that code.

            assert isinstance(model.visible_layer, dbm.BinaryVector)
            assert isinstance(model.hidden_layers[0], dbm.BinaryVectorMaxPool)
            assert model.hidden_layers[0].pool_size == 1
            assert isinstance(model.hidden_layers[1], dbm.BinaryVectorMaxPool)
            assert model.hidden_layers[1].pool_size == 1
            assert isinstance(model.hidden_layers[2], dbm.Softmax)
            assert len(model.hidden_layers) == 3

            V_samples = layer_to_chains[model.visible_layer]
            H1_samples, H2_samples, Y_samples = [layer_to_chains[layer] for layer in model.hidden_layers]

            H1_mf = model.hidden_layers[0].mf_update(state_below=model.visible_layer.upward_state(V_samples),
                                                    state_above=model.hidden_layers[1].downward_state(H2_samples),
                                                    layer_above=model.hidden_layers[1])
            Y_mf = model.hidden_layers[2].mf_update(state_below=model.hidden_layers[1].upward_state(H2_samples))
            H2_mf = model.hidden_layers[1].mf_update(state_below=model.hidden_layers[0].upward_state(H1_mf),
                                                    state_above=model.hidden_layers[2].downward_state(Y_mf),
                                                    layer_above=model.hidden_layers[2])

            expected_energy_p = model.energy(V_samples, [H1_mf, H2_mf, Y_samples]).mean()

            constants = flatten([V_samples, H1_mf, H2_mf, Y_samples])

            neg_phase_grads = OrderedDict(safe_zip(params, T.grad(-expected_energy_p, params, consider_constant = constants)))
        else:
            warnings.warn("""TODO: reduce variance of negative phase by integrating out
                    the even-numbered layers. The Rao-Blackwellize method can do this
                    for you when expected gradient = gradient of expectation, but doing
                    this in general is trickier.""")
            #layer_to_chains = model.rao_blackwellize(layer_to_chains)
            expected_energy_p = model.energy(layer_to_chains[model.visible_layer],
                    [layer_to_chains[layer] for layer in model.hidden_layers]).mean()

            samples = flatten(layer_to_chains.values())
            for i, sample in enumerate(samples):
                if sample.name is None:
                    sample.name = 'sample_'+str(i)

            neg_phase_grads = OrderedDict(safe_zip(params, T.grad(-expected_energy_p, params, consider_constant
                = samples, disconnected_inputs='ignore')))


        for param in list(gradients.keys()):
            #print param.name,': '
            #print theano.printing.min_informative_str(neg_phase_grads[param])
            gradients[param] =  neg_phase_grads[param]  + gradients[param]

        return gradients, updates
    def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
        """
        .. todo::
            WRITEME
        Parameters
        ----------
        learning_rate : float
            Learning rate coefficient. Learning rate is not being used but, pylearn2 requires a
            learning rate to be defined.
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients.
        lr_scalers : dict
            A dictionary mapping from the model's parameters to a learning
            rate multiplier.
        """

        updates = OrderedDict({})
        eps = self.damping
        step = sharedX(0., name="step")

        if self.skip_nan_inf:
            #If norm of the gradients of a parameter is inf or nan don't update that parameter
            #That might be useful for RNNs.
            grads = OrderedDict({p: T.switch(T.or_(T.isinf(grads[p]),
                T.isnan(grads[p])), 0, grads[p]) for
                p in grads.keys()})

        # Block-normalize gradients:
        nparams = len(grads.keys())

        # Apply the gradient clipping, this is only sometimes
        # necessary for RNNs and sometimes for very deep networks
        if self.grad_clip:
            assert self.grad_clip > 0.
            assert self.grad_clip <= 1., "Norm of the gradients per layer can not be larger than 1."

            gnorm = sum([g.norm(2) for g in grads.values()])
            notfinite = T.or_(T.isnan(gnorm), T.isinf(gnorm))

            for p, g in grads.iteritems():
                tmpg = T.switch(gnorm / nparams > self.grad_clip,
                                 g * self.grad_clip * nparams / gnorm , g)
                grads[p] = T.switch(notfinite, as_floatX(0.1)*p, tmpg)

        tot_norm_up = 0
        gshared = OrderedDict({p: sharedX(p.get_value() * 0.,
                             name='%s_grad' % p.name)
                             for p, g in grads.iteritems()})

        gsup = [(gshared[p], g) for p, g in grads.iteritems()]
        get_norms = lambda x: T.sqrt(sum(map(lambda y: (y**2).sum(), x)))
        gnorm = get_norms(grads.values())
        pnorm = get_norms(grads.keys())
        f_grad_shared = theano.function(inp,
                                        [cost, errors, gnorm, pnorm],
                                        updates=gsup)

        fix_decay = self.slow_decay**(step + 1)

        for param in gshared.keys():
            gshared[param].name = "grad_%s" % param.name
            mean_grad = sharedX(param.get_value() * 0. + eps, name="mean_grad_%s" % param.name)
            gnorm_sqr = sharedX(0.0 + eps, name="gnorm_%s" % param.name)

            prod_taus = sharedX((np.ones_like(param.get_value()) - 2*eps),
                                 name="prod_taus_x_t_" + param.name)
            slow_constant = 2.1

            if self.use_adagrad:
                # sum_square_grad := \sum_i g_i^2
                sum_square_grad = sharedX(param.get_value(borrow=True) * 0.,
                                          name="sum_square_grad_%s" % param.name)

            """
               Initialization of accumulators
            """
            taus_x_t = sharedX((np.ones_like(param.get_value()) + eps) * slow_constant,
                               name="taus_x_t_" + param.name)
            self.taus_x_t = taus_x_t

            #Variance reduction parameters
            #Numerator of the gamma:
            gamma_nume_sqr = sharedX(np.zeros_like(param.get_value()) + eps,
                                     name="gamma_nume_sqr_" + param.name)

            #Denominator of the gamma:
            gamma_deno_sqr = sharedX(np.zeros_like(param.get_value()) + eps,
                                     name="gamma_deno_sqr_" + param.name)

            #For the covariance parameter := E[\gamma \alpha]_{t-1}
            cov_num_t = sharedX(np.zeros_like(param.get_value()) + eps,
                                name="cov_num_t_" + param.name)

            # mean_squared_grad := E[g^2]_{t-1}
            mean_square_grad = sharedX(np.zeros_like(param.get_value()) + eps,
                                       name="msg_" + param.name)

            # mean_square_dx := E[(\Delta x)^2]_{t-1}
            mean_square_dx = sharedX(param.get_value() * 0., name="msd_" + param.name)

            if self.use_corrected_grad:
                old_grad = sharedX(param.get_value() * 0. + eps)

            #The uncorrected gradient of previous of the previous update:
            old_plain_grad = sharedX(param.get_value() * 0. + eps)
            mean_curvature = sharedX(param.get_value() * 0. + eps)
            mean_curvature_sqr = sharedX(param.get_value() * 0. + eps)

            # Initialize the E[\Delta]_{t-1}
            mean_dx = sharedX(param.get_value() * 0.)

            # Block-wise normalize the gradient:
            norm_grad = gshared[param]

            #For the first time-step, assume that delta_x_t := norm_grad
            gnorm = T.sqr(norm_grad).sum()

            cond = T.eq(step, 0)
            gnorm_sqr_o = cond * gnorm + (1 - cond) * gnorm_sqr
            gnorm_sqr_b = gnorm_sqr_o / (1 - fix_decay)

            norm_grad = norm_grad / (T.sqrt(gnorm_sqr_b) + eps)
            msdx = cond * norm_grad**2 + (1 - cond) * mean_square_dx
            mdx = cond * norm_grad + (1 - cond) * mean_dx

            new_prod_taus = (
                prod_taus * (1 - 1 / taus_x_t)
            )

            """
                Compute the new updated values.
            """
            # E[g_i^2]_t
            new_mean_squared_grad = (
                mean_square_grad * (1 - 1 / taus_x_t)  +
                T.sqr(norm_grad) / (taus_x_t)
            )
            new_mean_squared_grad.name = "msg_" + param.name

            # E[g_i]_t
            new_mean_grad = (
                mean_grad * (1 - 1 / taus_x_t) +
                norm_grad / taus_x_t
            )

            new_mean_grad.name = "nmg_" + param.name
            mg = new_mean_grad / (1 - new_prod_taus)
            mgsq = new_mean_squared_grad / (1 - new_prod_taus)

            new_gnorm_sqr = (
                    gnorm_sqr_o * self.slow_decay +
                    T.sqr(norm_grad).sum() * (1 - self.slow_decay)
            )

            # Keep the rms for numerator and denominator of gamma.
            new_gamma_nume_sqr = (
                gamma_nume_sqr * (1 - 1 / taus_x_t) +
                T.sqr((norm_grad - old_grad) * (old_grad - mg)) / taus_x_t
            )
            new_gamma_nume_sqr.name = "ngammasqr_num_" + param.name

            new_gamma_deno_sqr = (
                gamma_deno_sqr * (1 - 1 / taus_x_t) +
                T.sqr((mg - norm_grad) * (old_grad - mg)) / taus_x_t
            )

            new_gamma_deno_sqr.name = "ngammasqr_den_" + param.name

            gamma = T.sqrt(gamma_nume_sqr) / (T.sqrt(gamma_deno_sqr + eps) + \
                    self.gamma_reg)

            gamma.name = "gamma_" + param.name

            if self.gamma_clip and self.gamma_clip > -1:
                gamma = T.minimum(gamma, self.gamma_clip)

            momentum_step = gamma * mg
            corrected_grad_cand = (norm_grad + momentum_step) / (1 + gamma)

            #For starting the variance reduction.
            if self.start_var_reduction > -1:
                cond = T.le(self.start_var_reduction, step)
                corrected_grad = cond * corrected_grad_cand + (1 - cond) * norm_grad
            else:
                corrected_grad = norm_grad

            if self.use_adagrad:
                g = corrected_grad
                # Accumulate gradient
                new_sum_squared_grad = (
                    sum_square_grad + T.sqr(g)
                )
                rms_g_t = T.sqrt(new_sum_squared_grad)
                rms_g_t = T.maximum(rms_g_t, 1.0)

            #Use the gradients from the previous update
            #to compute the \nabla f(x_t) - \nabla f(x_{t-1})
            cur_curvature = norm_grad - old_plain_grad
            #cur_curvature = theano.printing.Print("Curvature: ")(cur_curvature)
            cur_curvature_sqr = T.sqr(cur_curvature)

            new_curvature_ave = (
                mean_curvature * (1 - 1 / taus_x_t) +
                (cur_curvature / taus_x_t)
            )
            new_curvature_ave.name = "ncurve_ave_" + param.name

            #Average average curvature
            nc_ave = new_curvature_ave / (1 - new_prod_taus)

            new_curvature_sqr_ave = (
                mean_curvature_sqr * (1 - 1 / taus_x_t) +
                (cur_curvature_sqr / taus_x_t)
            )
            new_curvature_sqr_ave.name = "ncurve_sqr_ave_" + param.name

            #Unbiased average squared curvature
            nc_sq_ave = new_curvature_sqr_ave / (1 - new_prod_taus)

            epsilon = 1e-7
            #lr_scalers.get(param, 1.) * learning_rate
            scaled_lr = sharedX(1.0)
            rms_dx_tm1 = T.sqrt(msdx + epsilon)

            rms_curve_t = T.sqrt(new_curvature_sqr_ave + epsilon)

            #This is where the update step is being defined
            delta_x_t = -scaled_lr * (rms_dx_tm1 / rms_curve_t - cov_num_t / (new_curvature_sqr_ave + epsilon))
            delta_x_t.name = "delta_x_t_" + param.name

            # This part seems to be necessary for only RNNs
            # For feedforward networks this does not seem to be important.
            if self.delta_clip:
                logger.info("Clipping will be applied on the adaptive step size.")
                delta_x_t = delta_x_t.clip(-self.delta_clip, self.delta_clip)
                if self.use_adagrad:
                    delta_x_t = delta_x_t * corrected_grad / rms_g_t
                else:
                    logger.info("Clipped adagrad is disabled.")
                    delta_x_t = delta_x_t * corrected_grad
            else:
                logger.info("Clipping will not be applied on the adaptive step size.")
                if self.use_adagrad:
                    delta_x_t = delta_x_t * corrected_grad / rms_g_t
                else:
                    logger.info("Clipped adagrad will not be used.")
                    delta_x_t = delta_x_t * corrected_grad

            new_taus_t = (1 - T.sqr(mdx) / (msdx + eps)) * taus_x_t + sharedX(1 + eps, "stabilized")

            #To compute the E[\Delta^2]_t
            new_mean_square_dx = (
                 msdx * (1 - 1 / taus_x_t) +
                 (T.sqr(delta_x_t) / taus_x_t)
             )

            #To compute the E[\Delta]_t
            new_mean_dx = (
                mdx * (1 - 1 / taus_x_t) +
                (delta_x_t / (taus_x_t))
            )

            #Perform the outlier detection:
            #This outlier detection is slightly different:
            new_taus_t = T.switch(T.or_(abs(norm_grad - mg) > (2 * T.sqrt(mgsq  - mg**2)),
                                        abs(cur_curvature - nc_ave) > (2 * T.sqrt(nc_sq_ave - nc_ave**2))),
                                        T.switch(new_taus_t > 2.5, sharedX(2.5), new_taus_t + sharedX(1.0) + eps), new_taus_t)

            #Apply the bound constraints on tau:
            new_taus_t = T.maximum(self.lower_bound_tau, new_taus_t)
            new_taus_t = T.minimum(self.upper_bound_tau, new_taus_t)

            new_cov_num_t = (
                cov_num_t * (1 - 1 / taus_x_t) +
                (delta_x_t * cur_curvature) * (1 / taus_x_t)
            )

            update_step = delta_x_t

            tot_norm_up += update_step.norm(2)
            # Apply updates
            updates[mean_square_grad] = new_mean_squared_grad
            updates[mean_square_dx] = new_mean_square_dx
            updates[mean_dx] = new_mean_dx
            updates[gnorm_sqr] = new_gnorm_sqr
            updates[gamma_nume_sqr] = new_gamma_nume_sqr
            updates[gamma_deno_sqr] = new_gamma_deno_sqr
            updates[taus_x_t] = new_taus_t
            updates[cov_num_t] = new_cov_num_t
            updates[mean_grad] = new_mean_grad
            updates[old_plain_grad] = norm_grad
            updates[mean_curvature] = new_curvature_ave
            updates[mean_curvature_sqr] = new_curvature_sqr_ave

            if self.perform_update:
                updates[param] = param + update_step

            updates[step] = step + 1
            updates[prod_taus] = new_prod_taus

            if self.use_adagrad:
                updates[sum_square_grad] = new_sum_squared_grad

            if self.use_corrected_grad:
                updates[old_grad] = corrected_grad

        f_update = theano.function([learning_rate], [tot_norm_up],
                                   updates=updates,
                                   on_unused_input='ignore')

        return f_grad_shared, f_update
    def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
        """
        .. todo::

            WRITEME
        """
        if self.gradient_clipping is not None:
            grads_norm = sum(
                map(lambda X: T.sqr(X).sum(),
                    [grads[param] for param in grads.keys()])
            )
            grads_norm = T.sqrt(grads_norm)
            scaling_den = T.maximum(self.gradient_clipping, grads_norm)
            scaling_num = self.gradient_clipping
            for param in grads.keys():
                grads[param] = scaling_num * grads[param] / scaling_den

        updates = OrderedDict()
        velocity = OrderedDict()
        normalized_velocities = OrderedDict()

        counter = sharedX(0, 'counter')
        tot_norm_up = 0
        gshared = OrderedDict({p: sharedX(p.get_value() * 0.,
                             name='%s_grad' % p.name)
                             for p, g in grads.iteritems()})

        gsup = [(gshared[p], g) for p, g in grads.iteritems()]
        get_norms = lambda x: T.sqrt(sum(map(lambda y: (y**2).sum(), x)))
        gnorm = get_norms(grads.values())
        pnorm = get_norms(grads.keys())
        f_grad_shared = theano.function(inp,
                                        [cost, errors, gnorm, pnorm],
                                        updates=gsup)
        for param in gshared.keys():
            avg_grad_sqr = sharedX(np.zeros_like(param.get_value()))
            velocity[param] = sharedX(np.zeros_like(param.get_value()))

            next_counter = counter + 1.

            fix_first_moment = 1. - self.momentum**next_counter
            fix_second_moment = 1. - self.averaging_coeff**next_counter

            if param.name is not None:
                avg_grad_sqr.name = 'avg_grad_sqr_' + param.name

            new_avg_grad_sqr = self.averaging_coeff*avg_grad_sqr \
                + (1 - self.averaging_coeff)*T.sqr(gshared[param])

            rms_grad_t = T.sqrt(new_avg_grad_sqr)
            rms_grad_t = T.maximum(rms_grad_t, self.stabilizer)
            new_velocity = self.momentum * velocity[param] \
                - (1 - self.momentum) * gshared[param]
            normalized_velocity = (new_velocity * T.sqrt(fix_second_moment)) \
                / (rms_grad_t * fix_first_moment)

            tot_norm_up += learning_rate*normalized_velocity.norm(2)

            normalized_velocities[param] = normalized_velocity
            updates[avg_grad_sqr] = new_avg_grad_sqr
            updates[velocity[param]] = new_velocity
            updates[param] = param + normalized_velocities[param]

        updates[counter] = counter + 1
        f_update = theano.function([learning_rate], [tot_norm_up],
                                   updates=updates,
                                   on_unused_input='ignore')

        return f_grad_shared, f_update
    def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
        """
        .. todo::

            WRITEME
        """
        updates = OrderedDict()
        velocity = OrderedDict()
        tot_norm_up = 0

        gshared = OrderedDict({p: sharedX(p.get_value() * 0.,
                             name='%s_grad' % p.name)
                             for p, g in grads.iteritems()})

        gsup = [(gshared[p], g) for p, g in grads.iteritems()]
        get_norms = lambda x: T.sqrt(sum(map(lambda y: (y**2).sum(), x)))
        gnorm = get_norms(grads.values())
        pnorm = get_norms(grads.keys())
        f_grad_shared = theano.function(inp,
                                        [cost, errors, gnorm, pnorm],
                                        updates=gsup)

        for param in gshared.keys():
            avg_grad_sqr = sharedX(np.zeros_like(param.get_value()))
            velocity[param] = sharedX(np.zeros_like(param.get_value()))

            if param.name is not None:
                avg_grad_sqr.name = 'avg_grad_sqr_' + param.name

            new_avg_grad_sqr = self.averaging_coeff * avg_grad_sqr +\
                (1 - self.averaging_coeff) * T.sqr(gshared[param])
            if self.use_first_order:
                avg_grad = sharedX(np.zeros_like(param.get_value()))
                if param.name is not None:
                    avg_grad.name = 'avg_grad_' + param.name
                new_avg_grad = self.averaging_coeff * avg_grad +\
                    (1 - self.averaging_coeff) * gshared[param]
                rms_grad_t = T.sqrt(new_avg_grad_sqr - new_avg_grad**2)
                updates[avg_grad] = new_avg_grad
            else:
                rms_grad_t = T.sqrt(new_avg_grad_sqr)

            rms_grad_t = T.maximum(rms_grad_t, self.stabilizer)
            normalized_grad = gshared[param] / (rms_grad_t)
            new_velocity = self.momentum * velocity[param] -\
                learning_rate * normalized_grad
            tot_norm_up += new_velocity.norm(2)

            updates[avg_grad_sqr] = new_avg_grad_sqr
            updates[velocity[param]] = new_velocity
            updates[param] = param + new_velocity

        if self.momentum_clipping is not None:
            tot_norm_up = 0

            new_mom_norm = sum(
                map(lambda X: T.sqr(X).sum(),
                    [updates[velocity[param]] for param in grads.keys()])
            )
            new_mom_norm = T.sqrt(new_mom_norm)
            scaling_den = T.maximum(self.momentum_clipping, new_mom_norm)
            scaling_num = self.momentum_clipping

            for param in grads.keys():
                if self.bound_inc:
                    updates[velocity[param]] *= (scaling_num / scaling_den)
                    updates[param] = param + updates[velocity[param]]
                else:
                    update_step = updates[velocity[param]] * (scaling_num / scaling_den)
                    tot_norm_up += update_step.norm(2)
                    updates[param] = param + update_step

        f_update = theano.function([learning_rate], [tot_norm_up],
                                   updates=updates,
                                   on_unused_input='ignore')

        return f_grad_shared, f_update
Beispiel #28
0
    def get_updates(self, learning_rate, grads, lr_scalers=None):
        """
        .. todo::
            WRITEME
        Parameters
        ----------
        learning_rate : float
            Learning rate coefficient. Learning rate is not being used but, pylearn2 requires a
            learning rate to be defined.
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients.
        lr_scalers : dict
            A dictionary mapping from the model's parameters to a learning
            rate multiplier.
        """

        updates = OrderedDict({})
        eps = self.damping
        step = sharedX(0., name="step")

        if self.skip_nan_inf:
            #If norm of the gradients of a parameter is inf or nan don't update that parameter
            #That might be useful for RNNs.
            grads = OrderedDict({
                p: T.switch(T.or_(T.isinf(grads[p]), T.isnan(grads[p])), 0,
                            grads[p])
                for p in grads.keys()
            })

        #Block-normalize gradients:
        nparams = len(grads.keys())

        #Apply the gradient clipping, this is only sometimes
        #necessary for RNNs and sometimes for very deep networks
        if self.grad_clip:
            assert self.grad_clip > 0.
            assert self.grad_clip <= 1., "Norm of the gradients per layer can not be larger than 1."

            gnorm = sum([g.norm(2) for g in grads.values()])
            notfinite = T.or_(T.isnan(gnorm), T.isinf(gnorm))

            for p, g in grads.iteritems():
                tmpg = T.switch(gnorm / nparams > self.grad_clip,
                                g * self.grad_clip * nparams / gnorm, g)
                grads[p] = T.switch(notfinite, as_floatX(0.1) * p, tmpg)

        tot_norm_up = 0
        tot_param_norm = 0

        fix_decay = self.slow_decay**(step + 1)
        for param in grads.keys():
            grads[param].name = "grad_%s" % param.name
            mean_grad = sharedX(param.get_value() * 0. + eps,
                                name="mean_grad_%s" % param.name)
            mean_corrected_grad = sharedX(param.get_value() * 0 + eps,
                                          name="mean_corrected_grad_%s" %
                                          param.name)
            gnorm_sqr = sharedX(0.0 + eps, name="gnorm_%s" % param.name)

            prod_taus = sharedX((np.ones_like(param.get_value()) - 2 * eps),
                                name="prod_taus_x_t_" + param.name)
            slow_constant = 2.1

            if self.use_adagrad:
                # sum_square_grad := \sum_i g_i^2
                sum_square_grad = sharedX(param.get_value(borrow=True) * 0.,
                                          name="sum_square_grad_%s" %
                                          param.name)
            """
               Initialization of accumulators
            """
            taus_x_t = sharedX(
                (np.ones_like(param.get_value()) + eps) * slow_constant,
                name="taus_x_t_" + param.name)
            self.taus_x_t = taus_x_t

            #Variance reduction parameters
            #Numerator of the gamma:
            gamma_nume_sqr = sharedX(np.zeros_like(param.get_value()) + eps,
                                     name="gamma_nume_sqr_" + param.name)

            #Denominator of the gamma:
            gamma_deno_sqr = sharedX(np.zeros_like(param.get_value()) + eps,
                                     name="gamma_deno_sqr_" + param.name)

            #For the covariance parameter := E[\gamma \alpha]_{t-1}
            cov_num_t = sharedX(np.zeros_like(param.get_value()) + eps,
                                name="cov_num_t_" + param.name)

            # mean_squared_grad := E[g^2]_{t-1}
            mean_square_grad = sharedX(np.zeros_like(param.get_value()) + eps,
                                       name="msg_" + param.name)

            # mean_square_dx := E[(\Delta x)^2]_{t-1}
            mean_square_dx = sharedX(param.get_value() * 0.,
                                     name="msd_" + param.name)

            if self.use_corrected_grad:
                old_grad = sharedX(param.get_value() * 0. + eps)

            #The uncorrected gradient of previous of the previous update:
            old_plain_grad = sharedX(param.get_value() * 0. + eps)
            mean_curvature = sharedX(param.get_value() * 0. + eps)
            mean_curvature_sqr = sharedX(param.get_value() * 0. + eps)

            # Initialize the E[\Delta]_{t-1}
            mean_dx = sharedX(param.get_value() * 0.)

            # Block-wise normalize the gradient:
            norm_grad = grads[param]

            #For the first time-step, assume that delta_x_t := norm_grad
            gnorm = T.sqr(norm_grad).sum()

            cond = T.eq(step, 0)
            gnorm_sqr_o = cond * gnorm + (1 - cond) * gnorm_sqr
            gnorm_sqr_b = gnorm_sqr_o / (1 - fix_decay)

            norm_grad = norm_grad / (T.sqrt(gnorm_sqr_b) + eps)
            msdx = cond * norm_grad**2 + (1 - cond) * mean_square_dx
            mdx = cond * norm_grad + (1 - cond) * mean_dx

            new_prod_taus = (prod_taus * (1 - 1 / taus_x_t))
            """
                Compute the new updated values.
            """
            # E[g_i^2]_t
            new_mean_squared_grad = (mean_square_grad * (1 - 1 / taus_x_t) +
                                     T.sqr(norm_grad) / (taus_x_t))
            new_mean_squared_grad.name = "msg_" + param.name

            # E[g_i]_t
            new_mean_grad = (mean_grad * (1 - 1 / taus_x_t) +
                             norm_grad / taus_x_t)

            new_mean_grad.name = "nmg_" + param.name
            mg = new_mean_grad / (1 - new_prod_taus)
            mgsq = new_mean_squared_grad / (1 - new_prod_taus)

            new_gnorm_sqr = (gnorm_sqr_o * self.slow_decay +
                             T.sqr(norm_grad).sum() * (1 - self.slow_decay))

            # Keep the rms for numerator and denominator of gamma.
            new_gamma_nume_sqr = (gamma_nume_sqr * (1 - 1 / taus_x_t) + T.sqr(
                (norm_grad - old_grad) * (old_grad - mg)) / taus_x_t)
            new_gamma_nume_sqr.name = "ngammasqr_num_" + param.name

            new_gamma_deno_sqr = (gamma_deno_sqr * (1 - 1 / taus_x_t) + T.sqr(
                (mg - norm_grad) * (old_grad - mg)) / taus_x_t)

            new_gamma_deno_sqr.name = "ngammasqr_den_" + param.name

            gamma = T.sqrt(gamma_nume_sqr) / (T.sqrt(gamma_deno_sqr + eps) + \
                    self.gamma_reg)

            gamma.name = "gamma_" + param.name

            if self.gamma_clip and self.gamma_clip > -1:
                gamma = T.minimum(gamma, self.gamma_clip)

            momentum_step = gamma * mg
            corrected_grad_cand = (norm_grad + momentum_step) / (1 + gamma)

            #For starting the variance reduction.
            if self.start_var_reduction > -1:
                cond = T.le(self.start_var_reduction, step)
                corrected_grad = cond * corrected_grad_cand + (
                    1 - cond) * norm_grad
            else:
                corrected_grad = norm_grad

            if self.use_adagrad:
                g = corrected_grad
                # Accumulate gradient
                new_sum_squared_grad = (sum_square_grad + T.sqr(g))
                rms_g_t = T.sqrt(new_sum_squared_grad)
                rms_g_t = T.maximum(rms_g_t, 1.0)

            #Use the gradients from the previous update
            #to compute the \nabla f(x_t) - \nabla f(x_{t-1})
            cur_curvature = norm_grad - old_plain_grad
            #cur_curvature = theano.printing.Print("Curvature: ")(cur_curvature)
            cur_curvature_sqr = T.sqr(cur_curvature)

            new_curvature_ave = (mean_curvature * (1 - 1 / taus_x_t) +
                                 (cur_curvature / taus_x_t))
            new_curvature_ave.name = "ncurve_ave_" + param.name

            #Average average curvature
            nc_ave = new_curvature_ave / (1 - new_prod_taus)

            new_curvature_sqr_ave = (mean_curvature_sqr * (1 - 1 / taus_x_t) +
                                     (cur_curvature_sqr / taus_x_t))
            new_curvature_sqr_ave.name = "ncurve_sqr_ave_" + param.name

            #Unbiased average squared curvature
            nc_sq_ave = new_curvature_sqr_ave / (1 - new_prod_taus)

            epsilon = 1e-7
            #lr_scalers.get(param, 1.) * learning_rate
            scaled_lr = sharedX(1.0)
            rms_dx_tm1 = T.sqrt(msdx + epsilon)

            rms_curve_t = T.sqrt(new_curvature_sqr_ave + epsilon)

            #This is where the update step is being defined
            delta_x_t = -scaled_lr * (rms_dx_tm1 / rms_curve_t - cov_num_t /
                                      (new_curvature_sqr_ave + epsilon))
            delta_x_t.name = "delta_x_t_" + param.name

            # This part seems to be necessary for only RNNs
            # For feedforward networks this does not seem to be important.
            if self.delta_clip:
                logger.info(
                    "Clipping will be applied on the adaptive step size.")
                delta_x_t = delta_x_t.clip(-self.delta_clip, self.delta_clip)
                if self.use_adagrad:
                    delta_x_t = delta_x_t * corrected_grad / rms_g_t
                else:
                    logger.info("Clipped adagrad is disabled.")
                    delta_x_t = delta_x_t * corrected_grad
            else:
                logger.info(
                    "Clipping will not be applied on the adaptive step size.")
                if self.use_adagrad:
                    delta_x_t = delta_x_t * corrected_grad / rms_g_t
                else:
                    logger.info("Clipped adagrad will not be used.")
                    delta_x_t = delta_x_t * corrected_grad

            new_taus_t = (1 - T.sqr(mdx) / (msdx + eps)) * taus_x_t + sharedX(
                1 + eps, "stabilized")

            #To compute the E[\Delta^2]_t
            new_mean_square_dx = (msdx * (1 - 1 / taus_x_t) +
                                  (T.sqr(delta_x_t) / taus_x_t))

            #To compute the E[\Delta]_t
            new_mean_dx = (mdx * (1 - 1 / taus_x_t) + (delta_x_t / (taus_x_t)))

            #Perform the outlier detection:
            #This outlier detection is slightly different:
            new_taus_t = T.switch(
                T.or_(
                    abs(norm_grad - mg) > (2 * T.sqrt(mgsq - mg**2)),
                    abs(cur_curvature - nc_ave) >
                    (2 * T.sqrt(nc_sq_ave - nc_ave**2))),
                T.switch(new_taus_t > 2.5, sharedX(2.5),
                         new_taus_t + sharedX(1.0) + eps), new_taus_t)

            #Apply the bound constraints on tau:
            new_taus_t = T.maximum(self.lower_bound_tau, new_taus_t)
            new_taus_t = T.minimum(self.upper_bound_tau, new_taus_t)

            new_cov_num_t = (cov_num_t * (1 - 1 / taus_x_t) +
                             (delta_x_t * cur_curvature) * (1 / taus_x_t))

            update_step = delta_x_t

            tot_norm_up += update_step.norm(2)
            tot_param_norm += param.norm(2)

            # Apply updates
            updates[mean_square_grad] = new_mean_squared_grad
            updates[mean_square_dx] = new_mean_square_dx
            updates[mean_dx] = new_mean_dx
            updates[gnorm_sqr] = new_gnorm_sqr
            updates[gamma_nume_sqr] = new_gamma_nume_sqr
            updates[gamma_deno_sqr] = new_gamma_deno_sqr
            updates[taus_x_t] = new_taus_t
            updates[cov_num_t] = new_cov_num_t
            updates[mean_grad] = new_mean_grad
            updates[old_plain_grad] = norm_grad
            updates[mean_curvature] = new_curvature_ave
            updates[mean_curvature_sqr] = new_curvature_sqr_ave

            if self.perform_update:
                updates[param] = param + update_step

            updates[step] = step + 1
            updates[prod_taus] = new_prod_taus

            if self.use_adagrad:
                updates[sum_square_grad] = new_sum_squared_grad

            if self.use_corrected_grad:
                updates[old_grad] = corrected_grad

        return updates, tot_norm_up, tot_param_norm
    def get_funcs(self,
                  learning_rate,
                  grads,
                  inp,
                  cost,
                  errors,
                  lr_scalers=None):
        """
        Compute the AdaDelta updates

        Parameters
        ----------
        learning_rate : float
            Learning rate coefficient.
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients.
        lr_scalers : dict
            A dictionary mapping from the model's parameters to a learning
            rate multiplier.
        """
        updates = OrderedDict()

        tot_norm_up = 0

        gshared = OrderedDict({
            p: sharedX(p.get_value() * 0., name='%s_grad' % p.name)
            for p, g in grads.iteritems()
        })

        gsup = [(gshared[p], g) for p, g in grads.iteritems()]
        get_norms = lambda x: T.sqrt(sum(map(lambda y: (y**2).sum(), x)))
        gnorm = get_norms(grads.values())
        pnorm = get_norms(grads.keys())
        f_grad_shared = theano.function(inp, [cost, errors, gnorm, pnorm],
                                        updates=gsup)

        for param in gshared.keys():
            # mean_squared_grad := E[g^2]_{t-1}
            mean_square_grad = sharedX(param.get_value() * 0.)
            # mean_square_dx := E[(\Delta x)^2]_{t-1}
            mean_square_dx = sharedX(param.get_value() * 0.)

            if param.name is not None:
                mean_square_grad.name = 'mean_square_grad_' + param.name
                mean_square_dx.name = 'mean_square_dx_' + param.name

            # Accumulate gradient
            new_mean_squared_grad = (self.decay * mean_square_grad +
                                     (1 - self.decay) * T.sqr(gshared[param]))

            # Compute update
            epsilon = learning_rate
            rms_dx_tm1 = T.sqrt(mean_square_dx + epsilon)
            rms_grad_t = T.sqrt(new_mean_squared_grad + epsilon)
            delta_x_t = -rms_dx_tm1 / rms_grad_t * gshared[param]

            # Accumulate updates
            new_mean_square_dx = (self.decay * mean_square_dx +
                                  (1 - self.decay) * T.sqr(delta_x_t))

            # Apply update
            updates[mean_square_grad] = new_mean_squared_grad
            updates[mean_square_dx] = new_mean_square_dx
            updates[param] = param + delta_x_t

            tot_norm_up += delta_x_t.norm(2)

        f_update = theano.function([learning_rate], [tot_norm_up],
                                   updates=updates,
                                   on_unused_input='ignore')

        return f_grad_shared, f_update
    def __init__(self,
                 objective,
                 params,
                 inputs=None,
                 param_constrainers=None,
                 max_iter=-1,
                 lr_scalers=None,
                 verbose=0,
                 tol=None,
                 init_alpha=None,
                 min_init_alpha=1e-3,
                 reset_alpha=True,
                 conjugate=False,
                 reset_conjugate=True,
                 gradients=None,
                 gradient_updates=None,
                 line_search_mode=None,
                 accumulate=False,
                 theano_function_mode=None):
        """
        objective: a theano expression to be minimized
                       should be a function of params and,
                       if provided, inputs
            params: A list of theano shared variables.
                    These are the optimization variables
            inputs: (Optional) A list of theano variables
                    to serve as inputs to the graph.
            param_constrainers: (Optional) A list of callables
                    to be called on all updates dictionaries to
                    be applied to params. This is how you implement
                    constrained optimization.
            reset_alpha: If True, reverts to using init_alpha after
                        each call. If False, the final set of alphas
                        is used at the start of the next call to minimize.
            conjugate: If True, tries to pick conjugate gradient directions.
                       For the directions to be truly conjugate, you must use
                       line_search_mode = 'exhaustive' and the objective function
                       must be quadratic.
                       Using line_search_mode = 'exhaustive' on a non-quadratic objective
                       function implements nonlinear conjugate gradient descent.
            reset_conjugate:
                    has no effect unless conjugate == True
                    if reset_conjugate == True,
                        reverts to direction of steepest descent for the first
                        step in each call to minimize.
                    otherwise, tries to make the new search direction
                    conjugate to the last one (even though the objective function
                    might be totally different on each call to minimize)
            gradients: if None, compute the gradients of obj using T.grad
                    otherwise, a dictionary mapping from params to expressions
                    for their gradients (this allows you to use approximate
                    gradients computed with something other than T.grad)
            gradient_updates: a dictionary of shared variable updates to run
                each time the gradient is computed

            Calling the ``minimize'' method with values for
            for ``inputs'' will update ``params'' to minimize
            ``objective''.
        """

        self.__dict__.update(locals())
        del self.self

        if line_search_mode is None:
            if init_alpha is None:
                init_alpha = (.001, .005, .01, .05, .1)
        else:
            assert line_search_mode == 'exhaustive'
            if init_alpha is None:
                init_alpha = (.5, 1.)

        self.init_alpha = tuple([float(elem) for elem in init_alpha])

        if inputs is None:
            inputs = []

        if param_constrainers is None:
            param_constrainers = []

        obj = objective

        self.verbose = verbose

        param_to_grad_sym = OrderedDict()
        param_to_grad_shared = OrderedDict()
        updates = OrderedDict()
        if self.gradient_updates is not None:
            updates.update(self.gradient_updates)

        self.params = [param for param in params]

        for param in params:
            if self.gradients is not None and param in self.gradients:
                g = self.gradients[param]
            else:
                g = grad(objective, param)
            param_to_grad_sym[param] = g
            if param.name is not None:
                param_name = param.name
            else:
                param_name = 'anon_param'
            grad_name = 'BatchGradientDescent.grad_' + param_name
            grad_shared = sharedX(param.get_value() * 0., name=grad_name)
            param_to_grad_shared[param] = grad_shared
            updates[grad_shared] = g

        self.param_to_grad_shared = param_to_grad_shared

        if self.verbose:
            print 'batch gradient class compiling gradient function'
        t1 = time.time()
        if self.accumulate:
            self._compute_grad = Accumulator(inputs, updates=updates)
        else:
            self._compute_grad = function(
                inputs,
                updates=updates,
                mode=self.theano_function_mode,
                name='BatchGradientDescent._compute_grad')
        if self.verbose:
            t2 = time.time()
            print 'done. Took ', t2 - t1

        if self.verbose:
            print 'batch gradient class compiling objective function'
        if self.accumulate:
            self.obj = Accumulator(inputs, obj)
        else:
            self.obj = function(inputs,
                                obj,
                                mode=self.theano_function_mode,
                                name='BatchGradientDescent.obj')

        if self.verbose:
            print 'done'

        self.param_to_cache = OrderedDict()
        alpha = T.scalar(name='alpha')
        alpha.tag.test_value = np.cast[alpha.dtype](.01)
        cache_updates = OrderedDict()
        goto_updates = OrderedDict()
        for param in params:
            if param.name is None:
                param_name = 'anon_param'
            else:
                param_name = param.name
            cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name
            self.param_to_cache[param] = sharedX(param.get_value(borrow=False),
                                                 name=cache_name)
            cache_updates[self.param_to_cache[param]] = param
            cached = self.param_to_cache[param]
            g = self.param_to_grad_shared[param]
            if lr_scalers is not None and param in lr_scalers:
                scaled_alpha = alpha * lr_scalers[param]
            else:
                scaled_alpha = alpha
            mul = scaled_alpha * g
            diff = cached - mul
            goto_updates[param] = diff
        self._cache_values = function(
            [],
            updates=cache_updates,
            mode=self.theano_function_mode,
            name='BatchGradientDescent._cache_values')
        assert isinstance(param_constrainers, (list, tuple))
        for param_constrainer in param_constrainers:
            param_constrainer(goto_updates)
        self._goto_alpha = function([alpha],
                                    updates=goto_updates,
                                    mode=self.theano_function_mode,
                                    name='BatchGradientDescent._goto_alpha')

        norm = T.sqrt(
            sum([
                T.sqr(elem).sum()
                for elem in self.param_to_grad_shared.values()
            ]))
        norm.name = 'BatchGradientDescent.norm'
        normalize_grad_updates = OrderedDict()
        for grad_shared in self.param_to_grad_shared.values():
            normalize_grad_updates[grad_shared] = grad_shared / norm

        # useful for monitoring
        self.ave_grad_size = sharedX(0.)
        self.new_weight = sharedX(1.)
        normalize_grad_updates[self.ave_grad_size] = self.new_weight * norm + (
            1. - self.new_weight) * self.ave_grad_size

        self._normalize_grad = function(
            [],
            norm,
            updates=normalize_grad_updates,
            mode=self.theano_function_mode,
            name='BatchGradientDescent._normalize_grad')

        if self.conjugate:
            grad_shared = self.param_to_grad_shared.values()

            grad_to_old_grad = OrderedDict()
            for elem in grad_shared:
                grad_to_old_grad[elem] = sharedX(elem.get_value(),
                                                 'old_' + elem.name)

            self._store_old_grad = function(
                [norm],
                updates=OrderedDict([(grad_to_old_grad[g], g * norm)
                                     for g in grad_to_old_grad]),
                mode=self.theano_function_mode,
                name='BatchGradientDescent._store_old_grad')

            grad_ordered = list(grad_to_old_grad.keys())
            old_grad_ordered = [grad_to_old_grad[g] for g in grad_ordered]

            def dot_product(x, y):
                return sum([(x_elem * y_elem).sum()
                            for x_elem, y_elem in safe_zip(x, y)])

            beta_pr = (dot_product(grad_ordered, grad_ordered) - dot_product(grad_ordered, old_grad_ordered)) / \
                    (1e-7+dot_product(old_grad_ordered, old_grad_ordered))
            assert beta_pr.ndim == 0

            beta = T.maximum(beta_pr, 0.)
            """

            beta_pr is the Polak-Ribiere formula for beta.
            According to wikipedia, the beta to use for NCG is "a matter of heuristics or taste"
            but max(0, beta_pr) is "a popular choice... which provides direction reset automatically."
            (ie, it is meant to revert to steepest descent when you have traveled far enough that
            the objective function is behaving non-quadratically enough that the conjugate gradient
            formulas aren't working anymore)

            http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method

            """

            assert grad not in grad_to_old_grad

            make_conjugate_updates = [(g, g + beta * grad_to_old_grad[g])
                                      for g in grad_ordered]

            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                for v, u in make_conjugate_updates:
                    mode.record.handle_line('BatchGradientDescent._make_conjugate var ' \
                            + var_descriptor(v) + '\n')
                    mode.record.handle_line('BatchGradientDescent._make_conjugate update ' \
                            + var_descriptor(u) + '\n')

            self._make_conjugate = function(
                [],
                updates=make_conjugate_updates,
                mode=self.theano_function_mode,
                name='BatchGradientDescent._make_conjugate')

            if mode is not None and hasattr(mode, 'record'):
                for output in self._make_conjugate.maker.fgraph.outputs:
                    mode.record.handle_line('BatchGradientDescent._make_conjugate output ' \
                            + var_descriptor(output) + '\n')

        if tol is None:
            if objective.dtype == "float32":
                self.tol = 1e-6
            else:
                self.tol = 3e-7
        else:
            self.tol = tol

        self.ave_step_size = sharedX(0.)
        self.ave_grad_mult = sharedX(0.)
    def get_funcs(self,
                  learning_rate,
                  grads,
                  inp,
                  cost,
                  errors,
                  lr_scalers=None):
        """
        .. todo::

            WRITEME
        """
        updates = OrderedDict()
        velocity = OrderedDict()
        tot_norm_up = 0

        gshared = OrderedDict({
            p: sharedX(p.get_value() * 0., name='%s_grad' % p.name)
            for p, g in grads.iteritems()
        })

        gsup = [(gshared[p], g) for p, g in grads.iteritems()]
        get_norms = lambda x: T.sqrt(sum(map(lambda y: (y**2).sum(), x)))
        gnorm = get_norms(grads.values())
        pnorm = get_norms(grads.keys())
        f_grad_shared = theano.function(inp, [cost, errors, gnorm, pnorm],
                                        updates=gsup)

        for param in gshared.keys():
            avg_grad_sqr = sharedX(np.zeros_like(param.get_value()))
            velocity[param] = sharedX(np.zeros_like(param.get_value()))

            if param.name is not None:
                avg_grad_sqr.name = 'avg_grad_sqr_' + param.name

            new_avg_grad_sqr = self.averaging_coeff * avg_grad_sqr +\
                (1 - self.averaging_coeff) * T.sqr(gshared[param])
            if self.use_first_order:
                avg_grad = sharedX(np.zeros_like(param.get_value()))
                if param.name is not None:
                    avg_grad.name = 'avg_grad_' + param.name
                new_avg_grad = self.averaging_coeff * avg_grad +\
                    (1 - self.averaging_coeff) * gshared[param]
                rms_grad_t = T.sqrt(new_avg_grad_sqr - new_avg_grad**2)
                updates[avg_grad] = new_avg_grad
            else:
                rms_grad_t = T.sqrt(new_avg_grad_sqr)

            rms_grad_t = T.maximum(rms_grad_t, self.stabilizer)
            normalized_grad = gshared[param] / (rms_grad_t)
            new_velocity = self.momentum * velocity[param] -\
                learning_rate * normalized_grad
            tot_norm_up += new_velocity.norm(2)

            updates[avg_grad_sqr] = new_avg_grad_sqr
            updates[velocity[param]] = new_velocity
            updates[param] = param + new_velocity

        if self.momentum_clipping is not None:
            tot_norm_up = 0

            new_mom_norm = sum(
                map(lambda X: T.sqr(X).sum(),
                    [updates[velocity[param]] for param in grads.keys()]))
            new_mom_norm = T.sqrt(new_mom_norm)
            scaling_den = T.maximum(self.momentum_clipping, new_mom_norm)
            scaling_num = self.momentum_clipping

            for param in grads.keys():
                if self.bound_inc:
                    updates[velocity[param]] *= (scaling_num / scaling_den)
                    updates[param] = param + updates[velocity[param]]
                else:
                    update_step = updates[velocity[param]] * (scaling_num /
                                                              scaling_den)
                    tot_norm_up += update_step.norm(2)
                    updates[param] = param + update_step

        f_update = theano.function([learning_rate], [tot_norm_up],
                                   updates=updates,
                                   on_unused_input='ignore')

        return f_grad_shared, f_update
    def get_funcs(self,
                  learning_rate,
                  grads,
                  inp,
                  cost,
                  errors,
                  lr_scalers=None):
        """
        .. todo::

            WRITEME
        """
        if self.gradient_clipping is not None:
            grads_norm = sum(
                map(lambda X: T.sqr(X).sum(),
                    [grads[param] for param in grads.keys()]))
            grads_norm = T.sqrt(grads_norm)
            scaling_den = T.maximum(self.gradient_clipping, grads_norm)
            scaling_num = self.gradient_clipping
            for param in grads.keys():
                grads[param] = scaling_num * grads[param] / scaling_den

        updates = OrderedDict()
        velocity = OrderedDict()
        normalized_velocities = OrderedDict()

        counter = sharedX(0, 'counter')
        tot_norm_up = 0
        gshared = OrderedDict({
            p: sharedX(p.get_value() * 0., name='%s_grad' % p.name)
            for p, g in grads.iteritems()
        })

        gsup = [(gshared[p], g) for p, g in grads.iteritems()]
        get_norms = lambda x: T.sqrt(sum(map(lambda y: (y**2).sum(), x)))
        gnorm = get_norms(grads.values())
        pnorm = get_norms(grads.keys())
        f_grad_shared = theano.function(inp, [cost, errors, gnorm, pnorm],
                                        updates=gsup)
        for param in gshared.keys():
            avg_grad_sqr = sharedX(np.zeros_like(param.get_value()))
            velocity[param] = sharedX(np.zeros_like(param.get_value()))

            next_counter = counter + 1.

            fix_first_moment = 1. - self.momentum**next_counter
            fix_second_moment = 1. - self.averaging_coeff**next_counter

            if param.name is not None:
                avg_grad_sqr.name = 'avg_grad_sqr_' + param.name

            new_avg_grad_sqr = self.averaging_coeff*avg_grad_sqr \
                + (1 - self.averaging_coeff)*T.sqr(gshared[param])

            rms_grad_t = T.sqrt(new_avg_grad_sqr)
            rms_grad_t = T.maximum(rms_grad_t, self.stabilizer)
            new_velocity = self.momentum * velocity[param] \
                - (1 - self.momentum) * gshared[param]
            normalized_velocity = (new_velocity * T.sqrt(fix_second_moment)) \
                / (rms_grad_t * fix_first_moment)

            tot_norm_up += learning_rate * normalized_velocity.norm(2)

            normalized_velocities[param] = normalized_velocity
            updates[avg_grad_sqr] = new_avg_grad_sqr
            updates[velocity[param]] = new_velocity
            updates[param] = param + normalized_velocities[param]

        updates[counter] = counter + 1
        f_update = theano.function([learning_rate], [tot_norm_up],
                                   updates=updates,
                                   on_unused_input='ignore')

        return f_grad_shared, f_update
class AlexNet(Model):
    """
    This is the base model for AlexNet, Alex Krizhevsky's efficient deep convolutional net described in:
    'ImageNet Classification with Deep Convolutional Neural Networks'
    Alex Krizhevsky, Ilya Sutskever, Geoffrey E. Hinton
    http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf

    Most of the code here is adapted from the authors listed in the license above, from the paper:
    'Theano-based large-scale visual recognition with multiple GPUs'
    Weiguang Ding & Ruoyan Wnag, Fei Mao, Graham Taylor
    http://arxiv.org/pdf/1412.2302.pdf

    Copyright (c) 2014, Weiguang Ding, Ruoyan Wang, Fei Mao and Graham Taylor
    All rights reserved.
    Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
        1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
        2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
        3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    """
    defaults = {  # data stuff
        "use_data_layer": False,
        "rand_crop": True,
        "batch_size":
        256,  # convolutional nets are particular about the batch size
        "output_path": '/outputs/alexnet/'
    }

    def __init__(self,
                 config=None,
                 defaults=defaults,
                 inputs_hook=None,
                 hiddens_hook=None,
                 params_hook=None,
                 use_data_layer=None,
                 rand_crop=None,
                 batch_size=None):
        # init Model to combine the defaults and config dictionaries.
        super(AlexNet, self).__init__(config, defaults)
        # all configuration parameters are now in self.args

        if inputs_hook or hiddens_hook or params_hook:
            log.critical(
                "Inputs_hook, hiddens_hook, and params_hook not implemented yet for AlexNet!"
            )
            raise NotImplementedError()

        self.flag_datalayer = use_data_layer or self.args.get('use_data_layer')
        self.batch_size = batch_size or self.args.get('batch_size')
        self.rand_crop = rand_crop or self.args.get('rand_crop')

        ####################
        # Theano variables #
        ####################
        # allocate symbolic variables for the data
        # 'rand' is a random array used for random cropping/mirroring of data
        self.x = T.ftensor4('x')
        self.y = T.lvector('y')
        self.rand = T.fvector('rand')

        ##########
        # params #
        ##########
        self.params = []

        # make the network!
        self.build_computation_graph()

    def build_computation_graph(self):
        ###################### BUILD NETWORK ##########################
        # whether or not to mirror the input images before feeding them into the network
        if self.flag_datalayer:
            layer_1_input = mirror_images(
                input=self.x,
                image_shape=(
                    self.batch_size,
                    3,
                    256,
                    256,
                ),  # bc01 format
                cropsize=227,
                rand=self.rand,
                flag_rand=self.rand_crop)
        else:
            layer_1_input = self.x  # 4D tensor (going to be in c01b format)

        # Start with 5 convolutional pooling layers
        log.debug("convpool layer 1...")
        convpool_layer1 = ConvPoolLayer(inputs_hook=((self.batch_size, 3, 227,
                                                      227), layer_1_input),
                                        filter_shape=(96, 3, 11, 11),
                                        convstride=4,
                                        padsize=0,
                                        group=1,
                                        poolsize=3,
                                        poolstride=2,
                                        bias_init=0.0,
                                        local_response_normalization=True)
        # Add this layer's parameters!
        self.params += convpool_layer1.get_params()

        log.debug("convpool layer 2...")
        convpool_layer2 = ConvPoolLayer(inputs_hook=((
            self.batch_size,
            96,
            27,
            27,
        ), convpool_layer1.get_outputs()),
                                        filter_shape=(256, 96, 5, 5),
                                        convstride=1,
                                        padsize=2,
                                        group=2,
                                        poolsize=3,
                                        poolstride=2,
                                        bias_init=0.1,
                                        local_response_normalization=True)
        # Add this layer's parameters!
        self.params += convpool_layer2.get_params()

        log.debug("convpool layer 3...")
        convpool_layer3 = ConvPoolLayer(
            inputs_hook=((self.batch_size, 256, 13, 13),
                         convpool_layer2.get_outputs()),
            filter_shape=(384, 256, 3, 3),
            convstride=1,
            padsize=1,
            group=1,
            poolsize=1,
            poolstride=0,
            bias_init=0.0,
            local_response_normalization=False)
        # Add this layer's parameters!
        self.params += convpool_layer3.get_params()

        log.debug("convpool layer 4...")
        convpool_layer4 = ConvPoolLayer(
            inputs_hook=((self.batch_size, 384, 13, 13),
                         convpool_layer3.get_outputs()),
            filter_shape=(384, 384, 3, 3),
            convstride=1,
            padsize=1,
            group=2,
            poolsize=1,
            poolstride=0,
            bias_init=0.1,
            local_response_normalization=False)
        # Add this layer's parameters!
        self.params += convpool_layer4.get_params()

        log.debug("convpool layer 5...")
        convpool_layer5 = ConvPoolLayer(
            inputs_hook=((self.batch_size, 384, 13, 13),
                         convpool_layer4.get_outputs()),
            filter_shape=(256, 384, 3, 3),
            convstride=1,
            padsize=1,
            group=2,
            poolsize=3,
            poolstride=2,
            bias_init=0.0,
            local_response_normalization=False)
        # Add this layer's parameters!
        self.params += convpool_layer5.get_params()

        # Now onto the fully-connected layers!
        fc_config = {
            'activation':
            'rectifier',  # type of activation function to use for output
            'weights_init':
            'gaussian',  # either 'gaussian' or 'uniform' - how to initialize weights
            'weights_mean': 0.0,  # mean for gaussian weights init
            'weights_std':
            0.005,  # standard deviation for gaussian weights init
            'bias_init': 0.0  # how to initialize the bias parameter
        }
        log.debug("fully connected layer 1 (model layer 6)...")
        # we want to have dropout applied to the training version, but not the test version.
        fc_layer6_input = T.flatten(convpool_layer5.get_outputs(), 2)
        fc_layer6 = BasicLayer(inputs_hook=(9216, fc_layer6_input),
                               output_size=4096,
                               config=fc_config)
        # Add this layer's parameters!
        self.params += fc_layer6.get_params()

        # now apply dropout to the output for training
        dropout_layer6 = dropout(fc_layer6.get_outputs(), corruption_level=0.5)

        log.debug("fully connected layer 2 (model layer 7)...")
        fc_layer7 = BasicLayer(inputs_hook=(4096, fc_layer6.get_outputs()),
                               output_size=4096,
                               config=fc_config)
        fc_layer7_train = BasicLayer(inputs_hook=(4096, dropout_layer6),
                                     output_size=4096,
                                     params_hook=fc_layer7.get_params(),
                                     config=fc_config)
        # Add this layer's parameters!
        self.params += fc_layer7_train.get_params()

        # apply dropout again for training
        dropout_layer7 = dropout(fc_layer7_train.get_outputs(),
                                 corruption_level=0.5)

        # last layer is a softmax prediction output layer
        softmax_config = {
            'weights_init': 'gaussian',
            'weights_mean': 0.0,
            'weights_std': 0.005,
            'bias_init': 0.0
        }
        log.debug("softmax classification layer (model layer 8)...")
        softmax_layer8 = SoftmaxLayer(inputs_hook=(4096,
                                                   fc_layer7.get_outputs()),
                                      output_size=1000,
                                      config=softmax_config)
        softmax_layer8_train = SoftmaxLayer(
            inputs_hook=(4096, dropout_layer7),
            output_size=1000,
            params_hook=softmax_layer8.get_params(),
            config=softmax_config)
        # Add this layer's parameters!
        self.params += softmax_layer8.get_params()

        # finally the softmax output from the whole thing!
        self.output = softmax_layer8.get_outputs()

        #####################
        # Cost and monitors #
        #####################
        self.train_cost = softmax_layer8_train.negative_log_likelihood(self.y)
        cost = softmax_layer8.negative_log_likelihood(self.y)
        errors = softmax_layer8.errors(self.y)
        train_errors = softmax_layer8_train.errors(self.y)

        self.monitors = OrderedDict([('cost', cost), ('errors', errors),
                                     ('dropout_errors', train_errors)])

        #########################
        # Compile the functions #
        #########################
        log.debug("Compiling functions!")
        t = time.time()
        log.debug("f_predict...")
        # use the actual argmax from the classification
        self.f_predict = function(
            inputs=[self.x], outputs=softmax_layer8.get_argmax_prediction())
        log.debug("f_monitors")
        self.f_monitors = function(inputs=[self.x, self.y],
                                   outputs=self.monitors.values())
        log.debug("compilation took %s" %
                  make_time_units_string(time.time() - t))

    def get_inputs(self):
        """
        This should return the input(s) to the model's computation graph. This is called by the Optimizer when creating
        the theano train function on the cost expression returned by get_train_cost().

        This should normally return the same theano variable list that is used in the inputs= argument to the f_predict
        function.
        ------------------

        :return: Theano variables representing the input(s) to the training function.
        :rtype: List(theano variable)
        """
        return [self.x]

    def get_outputs(self):
        """
        This method will return the model's output variable expression from the computational graph.
        This should be what is given for the outputs= part of the 'f_predict' function from self.predict().

        This will be used for creating hooks to link models together, where these outputs can be strung as the inputs
        or hiddens to another model :)
        ------------------

        :return: theano expression of the outputs from this model's computation
        :rtype: theano tensor (expression)
        """
        return self.output

    def predict(self, input):
        """
        This method will return the model's output (run through the function), given an input. In the case that
        input_hooks or hidden_hooks are used, the function should use them appropriately and assume they are the input.

        Try to avoid re-compiling the theano function created for predict - check a hasattr(self, 'f_predict') or
        something similar first. I recommend creating your theano f_predict in a create_computation_graph method
        to be called after the class initializes.
        ------------------

        :param input: Theano/numpy tensor-like object that is the input into the model's computation graph.
        :type input: tensor

        :return: Theano/numpy tensor-like object that is the output of the model's computation graph.
        :rtype: tensor
        """
        if not hasattr(self, 'f_predict'):
            log.error(
                "Missing self.f_predict - make sure you ran self.build_computation_graph()! "
                "This should have run during initialization....")
            raise NotImplementedError()
        return self.f_predict(*input)

    def get_train_cost(self):
        """
        This returns the expression that represents the cost given an input, which is used for the Optimizer during
        training. The reason we can't just compile a f_train theano function is because updates need to be calculated
        for the parameters during gradient descent - and these updates are created in the Optimizer object.
        ------------------

        :return: theano expression of the model's training cost, from which parameter gradients will be computed.
        :rtype: theano tensor
        """
        return self.train_cost

    def get_monitors(self):
        """
        This returns a dictionary of (monitor_name: monitor_function) of variables (monitors) whose values we care
        about during training. For every monitor returned by this method, the function will be run on the
        train/validation/test dataset and its value will be reported.

        Again, please avoid recompiling the monitor functions every time - check your hasattr to see if they already
        exist!
        ------------------

        :return: Dictionary of String: theano_function for each monitor variable we care about in the model.
        :rtype: Dictionary
        """
        if not hasattr(self, 'f_monitors'):
            log.error(
                "Missing self.f_monitors - make sure you ran self.build_computation_graph()! "
                "This should have run during initialization....")
            raise NotImplementedError()
        names = ', '.join(self.monitors.keys())
        return {names: self.f_monitors}

    def get_params(self):
        """
        This returns the list of theano shared variables that will be trained by the Optimizer.
        These parameters are used in the gradient.
        ------------------

        :return: flattened list of theano shared variables to be trained
        :rtype: List(shared_variables)
        """
        return self.params
Beispiel #34
0
    def get_gradients(self, model, data):
        """
        PCD approximation to the gradient of the bound.
        Keep in mind this is a cost, so we are upper bounding
        the negative log likelihood.
        """
        self.get_data_specs(model)[0].validate(data)

        if self.supervised:
            (X, Y) = data
            # note: if the Y layer changes to something without linear energy,
            # we'll need to make the expected energy clamp Y in the positive phase
            assert isinstance(model.hidden_layers[-1], dbm.Softmax)
        else:
            X = data
            Y = None

        q = model.mf(X, Y)


        """
            Use the non-negativity of the KL divergence to construct a lower bound
            on the log likelihood. We can drop all terms that are constant with
            repsect to the model parameters:

            log P(v) = L(v, q) + KL(q || P(h|v))
            L(v, q) = log P(v) - KL(q || P(h|v))
            L(v, q) = log P(v) - sum_h q(h) log q(h) + q(h) log P(h | v)
            L(v, q) = log P(v) + sum_h q(h) log P(h | v) + const
            L(v, q) = log P(v) + sum_h q(h) log P(h, v) - sum_h q(h) log P(v) + const
            L(v, q) = sum_h q(h) log P(h, v) + const
            L(v, q) = sum_h q(h) -E(h, v) - log Z + const

            so the cost we want to minimize is
            expected_energy + log Z + const


            Note: for the RBM, this bound is exact, since the KL divergence goes to 0.
        """

        variational_params = flatten(q)

        # The gradients of the expected energy under q are easy, we can just do that in theano
        expected_energy_q = model.expected_energy(X, q).mean()
        params = list(model.get_params())
        gradients = OrderedDict(safe_zip(params, T.grad(expected_energy_q, params,
            consider_constant = variational_params,
            disconnected_inputs = 'ignore')))

        """
        d/d theta log Z = (d/d theta Z) / Z
                        = (d/d theta sum_h sum_v exp(-E(v,h)) ) / Z
                        = (sum_h sum_v - exp(-E(v,h)) d/d theta E(v,h) ) / Z
                        = - sum_h sum_v P(v,h)  d/d theta E(v,h)
        """

        layer_to_chains = model.make_layer_to_state(self.num_chains)

        def recurse_check(l):
            if isinstance(l, (list, tuple)):
                for elem in l:
                    recurse_check(elem)
            else:
                assert l.get_value().shape[0] == self.num_chains

        recurse_check(layer_to_chains.values())

        model.layer_to_chains = layer_to_chains

        # Note that we replace layer_to_chains with a dict mapping to the new
        # state of the chains
        updates, layer_to_chains = model.get_sampling_updates(layer_to_chains,
                self.theano_rng, num_steps=self.num_gibbs_steps,
                return_layer_to_updated = True)


        if self.toronto_neg:
            # Ruslan Salakhutdinov's undocumented negative phase from
            # http://www.mit.edu/~rsalakhu/code_DBM/dbm_mf.m
            # IG copied it here without fully understanding it, so it
            # only applies to exactly the same model structure as
            # in that code.

            assert isinstance(model.visible_layer, dbm.BinaryVector)
            assert isinstance(model.hidden_layers[0], dbm.BinaryVectorMaxPool)
            assert model.hidden_layers[0].pool_size == 1
            assert isinstance(model.hidden_layers[1], dbm.BinaryVectorMaxPool)
            assert model.hidden_layers[1].pool_size == 1
            assert isinstance(model.hidden_layers[2], dbm.Softmax)
            assert len(model.hidden_layers) == 3

            V_samples = layer_to_chains[model.visible_layer]
            H1_samples, H2_samples, Y_samples = [layer_to_chains[layer] for layer in model.hidden_layers]

            H1_mf = model.hidden_layers[0].mf_update(state_below=model.visible_layer.upward_state(V_samples),
                                                    state_above=model.hidden_layers[1].downward_state(H2_samples),
                                                    layer_above=model.hidden_layers[1])
            Y_mf = model.hidden_layers[2].mf_update(state_below=model.hidden_layers[1].upward_state(H2_samples))
            H2_mf = model.hidden_layers[1].mf_update(state_below=model.hidden_layers[0].upward_state(H1_mf),
                                                    state_above=model.hidden_layers[2].downward_state(Y_mf),
                                                    layer_above=model.hidden_layers[2])

            expected_energy_p = model.energy(V_samples, [H1_mf, H2_mf, Y_samples]).mean()

            constants = flatten([V_samples, H1_mf, H2_mf, Y_samples])

            neg_phase_grads = OrderedDict(safe_zip(params, T.grad(-expected_energy_p, params, consider_constant = constants)))
        else:
            warnings.warn("""TODO: reduce variance of negative phase by integrating out
                    the even-numbered layers. The Rao-Blackwellize method can do this
                    for you when expected gradient = gradient of expectation, but doing
                    this in general is trickier.""")
            #layer_to_chains = model.rao_blackwellize(layer_to_chains)
            expected_energy_p = model.energy(layer_to_chains[model.visible_layer],
                    [layer_to_chains[layer] for layer in model.hidden_layers]).mean()

            samples = flatten(layer_to_chains.values())
            for i, sample in enumerate(samples):
                if sample.name is None:
                    sample.name = 'sample_'+str(i)

            neg_phase_grads = OrderedDict(safe_zip(params, T.grad(-expected_energy_p, params, consider_constant
                = samples, disconnected_inputs='ignore')))


        for param in list(gradients.keys()):
            gradients[param] = neg_phase_grads[param] + gradients[param]

        return gradients, updates
Beispiel #35
0
class Monitor(object):
    """
    A class for monitoring Models while they are being trained.

    A monitor object records the number of minibatches and number of examples
    the model has trained, as well as any number of "channels" that track
    quantities of interest (examples: the objective function, measures of
    hidden unit activity, reconstruction error, sum of squared second
    derivatives, average norm of the weight vectors,  etc.)
    """
    def __init__(self, model):
        """
        Makes a monitor for `model`. Assumes the model has not been
        trained at all yet.

        Parameters
        ----------
        model : pylearn2.models.model.Model instance
        """
        self.training_succeeded = False
        self.model = model
        self.channels = OrderedDict()
        self._num_batches_seen = 0
        self._examples_seen = 0
        self._epochs_seen = 0
        self._datasets = []
        self._iteration_mode = []
        self._batch_size = []
        self._num_batches = []
        self._dirty = True
        self._rng_seed = []
        self.names_to_del = ['theano_function_mode']
        self.t0 = time.time()
        # Determine whether the model should use topological or vector form of
        # examples. If the model acts on a space with more than the batch index
        # and channel dimension, the model has topological dimensions, so the
        # topological view of the data should be used.
        vector = model.get_input_space().make_theano_batch(name='monitoring_input')
        if isinstance(vector.type, theano.sparse.SparseType):
            self.topo = False
        else:
            self.topo = len(vector.type.broadcastable) > 2

        self.require_label = False
        self.theano_function_mode = None

    def set_theano_function_mode(self, mode):
        if self.theano_function_mode != mode:
            self._dirty = True
            self.theano_function_mode = mode

    def add_dataset(self, dataset, mode='sequential', batch_size=None,
                    num_batches=None, seed = None):
        """
        Determines the data used to calculate the values of each channel.

        Parameters
        ----------
        dataset : object
            A `pylearn2.datasets.Dataset` object.
        mode : str or object, optional
            Iteration mode; see the docstring of the `iterator` method
            on `pylearn2.datasets.Dataset` for details.
        batch_size : int, optional
            The size of an individual batch. Optional if `mode` is
            'sequential' and `num_batches` is specified (batch size
            will be calculated based on full dataset size).
        num_batches : int, optional
            The total number of batches. Unnecessary if `mode` is
            'sequential' and `batch_size` is specified (number of
            batches will be calculated based on full dataset size).
        """
        # The user can ommit using lists if only one dataset is set
        if not isinstance(dataset, list):
            dataset = [dataset]
        if not isinstance(mode, list):
            mode = [mode]
        if not isinstance(batch_size, list):
            batch_size = [batch_size]
        if not isinstance(num_batches, list):
            num_batches = [num_batches]
        if seed is None:
            seed = [ None ] * len(dataset)
        if not isinstance(seed, list):
            seed = [ seed ]
        if len(mode) != len(dataset):
            raise ValueError("Received "+str(len(dataset))+" dataset but " + str(len(mode)) + " modes.")
        if any([len(l) != len(dataset) for l in [batch_size, seed]]):
            raise ValueError("make sure each dataset has its iteration " + \
                        "batch size and number of batches.")
        for (d, m, b, n, sd) in safe_izip(dataset, mode, batch_size, num_batches, seed):
            try:
                it = d.iterator(mode=m, batch_size=b,
                                      num_batches=n,
                                      topo=self.topo,
                                      targets=self.require_label,
                                      rng = sd)
            except ValueError as exc:
                raise ValueError("invalid iteration parameters in "
                                 "Monitor.add_dataset: " + str(exc))
            if it.stochastic:
                # must be a seed, not a random number generator
                # if it were a random number generator, different iterators using
                # it would update its state, so we would not get the same iterator
                # each time
                # Also, must not be None, because this makes the iterator pick
                # a seed based on the clock
                if sd is None:
                    raise TypeError("Monitor requires a seed when using stochastic iteration modes.")
                if not isinstance(sd, (list, tuple, int)):
                    raise TypeError("Monitor requires a seed (not a random number generator) when using stochastic iteration modes.")
            else:
                assert sd is None # the iterator should catch this, but let's double-check

            if not d in self._datasets:
                self._datasets.append(d)
                self._iteration_mode.append(m)
                self._batch_size.append(b)
                self._num_batches.append(n)
                self._rng_seed.append(sd)

    def __call__(self):
        """
        Runs the model on the monitoring dataset in order to add one
        data point to each of the channels.
        """

        # If the channels have changed at all, we need to recompile the theano
        # functions used to compute them
        if self._dirty:
            self.redo_theano()

        model = self.model
        datasets = self._datasets

        # Set all channels' val_shared to 0
        self.begin_record_entry()

        for d, i, b, n, a, sd, ne in safe_izip(datasets, self._iteration_mode, self._batch_size,
                                 self._num_batches, self.accum, self._rng_seed, self.num_examples):
            if isinstance(d, basestring):
                d = yaml_parse.load(d)
                raise NotImplementedError()
                # need to put d back into self._datasets
            myiterator = d.iterator(mode=i,
                                    batch_size=b,
                                    num_batches=n,
                                    topo=self.topo,
                                    targets=self.require_label,
                                    rng=sd)

            actual_ne = 0
            for X in myiterator:
                if self.require_label:
                    X, y = X
                    self.run_prereqs(X,y,d)
                    a(X, y)
                else:
                    self.run_prereqs(X, None, d)
                    a(X)
                if X.ndim == 2:
                    actual_batch_size = X.shape[0]
                else:
                    actual_batch_size = X.shape[d.get_topo_batch_axis()]
                actual_ne += actual_batch_size
            # end for X
            if actual_ne != ne:
                raise RuntimeError("At compile time, your iterator said it had "
                        + str(ne) + " examples total, but at runtime it gave us "
                        + str(actual_ne) + ".")
        # end for d


        log.info("Monitoring step:")
        log.info("\tEpochs seen: %d" % self._epochs_seen)
        log.info("\tBatches seen: %d" % self._num_batches_seen)
        log.info("\tExamples seen: %d" % self._examples_seen)
        t = time.time() - self.t0
        for channel_name in sorted(self.channels.keys(), key=number_aware_alphabetical_key):
            channel = self.channels[channel_name]
            channel.time_record.append(t)
            channel.batch_record.append(self._num_batches_seen)
            channel.example_record.append(self._examples_seen)
            channel.epoch_record.append(self._epochs_seen)
            val = channel.val_shared.get_value()
            channel.val_record.append(val)
            # TODO: use logging infrastructure so that user can configure
            # formatting
            if abs(val) < 1e4:
                val_str = str(val)
            else:
                val_str = '%.3e' % val

            log.info("\t%s: %s" % (channel_name, val_str))

    def run_prereqs(self, X, y, dataset):
        if dataset not in self.prereqs:
            return
        for prereq in self.prereqs[dataset]:
            prereq(X,y)

    def get_batches_seen(self):
        """ Returns the number of batches the model has learned on (assuming
        that the learning code has been calling Monitor.report_batch correctly)
        """
        return self._num_batches_seen

    def get_epochs_seen(self):
        return self._epochs_seen

    def get_examples_seen(self):
        """ Returns the number of examples the model has learned on (assuming
        that the learning code has been calling Monitor.report_batch correctly)
        """
        return self._examples_seen

    def report_batch(self, num_examples):
        """ Call this whenever the model has learned on another batch of examples.
        Report how many examples were learned on. """
        self._examples_seen += num_examples
        self._num_batches_seen += 1

    def report_epoch(self):
        self._epochs_seen += 1

    def redo_theano(self):
        """
        Recompiles Theano functions used by this monitor.

        This is needed so that if new channels are added, Theano's
        optimizations make sure (to the extent that they can) that the new
        channels and old channels don't have any redundant calculations.

        It is also needed to regenerate Theano functions after pickling and
        unpickling, since Theano functions should not be pickled.
        """
        self._dirty = False

        init_names = dir(self)
        self.prereqs = OrderedDict()
        for channel in self.channels.values():
            if channel.prereqs is not None:
                dataset = channel.dataset
                if dataset not in self.prereqs:
                    self.prereqs[dataset] = []
                prereqs = self.prereqs[dataset]
                for prereq in channel.prereqs:
                    if prereq not in prereqs:
                        prereqs.append(prereq)

        updates = OrderedDict()
        for channel in self.channels.values():
            updates[channel.val_shared] = np.cast[config.floatX](0.0)
        with log_timing(log, "compiling begin_record_entry"):
            self.begin_record_entry = function(inputs=[], updates=updates, mode=self.theano_function_mode,
                    name = 'Monitor.begin_record_entry')
        updates = OrderedDict()
        givens = OrderedDict()
        # Get the appropriate kind of theano variable to represent the data the model
        # acts on
        X = self.model.get_input_space().make_theano_batch(name = "monitoring_X")
        if config.compute_test_value != 'off':
            m = self.model.get_test_batch_size()
            test_value = self.model.get_input_space().get_origin_batch(m)
            X.tag.test_value = np.cast[X.type.dtype](test_value)
        if self.require_label:
            Y = self.model.get_output_space().make_theano_batch(name = "monitoring_Y")

        log.info('Monitored channels: ')
        for key in sorted(self.channels.keys()):
            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                mode.record.handle_line('compiling monitor including channel '+key+'\n')
            log.info('\t%s' % key)
        it = [d.iterator(mode=i, num_batches=n, batch_size=b, topo=self.topo) \
              for d, i, n, b in safe_izip(self._datasets, self._iteration_mode,
                                    self._num_batches, self._batch_size)]
        self.num_examples = [np.cast[config.floatX](float(i.num_examples)) for i in it]
        givens = [OrderedDict() for d in self._datasets]
        updates = [OrderedDict() for d in self._datasets]
        for channel in self.channels.values():
            index = self._datasets.index(channel.dataset)
            d = self._datasets[index]
            g = givens[index]
            cur_num_examples = self.num_examples[index]
            u = updates[index]
            if isinstance(channel.graph_input, (list, tuple)):
                channel_X, channel_Y = channel.graph_input
                assert channel_X not in g or g[channel_X] is X
                assert channel_Y not in g or g[channel_Y] is Y
                g[channel_X] = X
                g[channel_Y] = Y
            else:
                channel_X = channel.graph_input
                assert channel_X not in g or g[channel_X] is X
                g[channel_X] = X
            if n == 0:
                raise ValueError("Iterating over 0 examples results in divide by 0")
            if self.topo:
                batch_index = d.get_topo_batch_axis()
            else:
                batch_index = 0
            val = channel.val * T.cast(X.shape[batch_index], config.floatX) / cur_num_examples
            u[channel.val_shared] = channel.val_shared + val

        with log_timing(log, "Compiling accum"):
            # Check type of update expressions
            for up in updates:
                for key in up:
                    if key.dtype != up[key].dtype:
                        raise TypeError('Monitoring channel shared variable ' \
                                + key.name + ' has dtype ' + key.dtype + \
                                ' but is driven by an expression with type ' + \
                                up[key].dtype)

            self.accum = []
            for idx, packed in enumerate(safe_izip(givens, updates)):
                g, u = packed
                mode = self.theano_function_mode
                if mode is not None and hasattr(mode, 'record'):
                    for elem in g:
                        mode.record.handle_line('g key '+var_descriptor(elem)+'\n')
                        mode.record.handle_line('g val '+var_descriptor(g[elem])+'\n')
                    for elem in u:
                        mode.record.handle_line('u key '+var_descriptor(elem)+'\n')
                        mode.record.handle_line('u val '+var_descriptor(u[elem])+'\n')
                function_name = 'Monitor.accum[%d]' % idx
                if self.require_label:
                    if mode is not None and hasattr(mode, 'record'):
                        mode.record.handle_line('compiling supervised accum\n')
                    # Some channels may not depend on the data, ie, they might just monitor the model
                    # parameters, or some shared variable updated by the training algorithm, so we
                    # need to ignore the unused input error
                    self.accum.append(function([X, Y], givens=g, updates=u, mode=self.theano_function_mode,
                            name=function_name))
                else:
                    if mode is not None and hasattr(mode, 'record'):
                        mode.record.handle_line('compiling unsupervised accum\n')
                    self.accum.append(function([X], givens=g, updates=u, mode=self.theano_function_mode,
                            name=function_name))
            for a in self.accum:
                if mode is not None and hasattr(mode, 'record'):
                    for elem in a.maker.fgraph.outputs:
                        mode.record.handle_line('accum output '+var_descriptor(elem)+'\n')
                log.info("graph size: %d" % len(a.maker.fgraph.toposort()))
        final_names = dir(self)
        self.register_names_to_del([name for name in final_names
                                    if name not in init_names])

    def register_names_to_del(self, names):
        """
        Register names of fields that should be deleted before pickling.

        Parameters
        ----------
        names : list
            A list of attribute names as strings.
        """
        for name in names:
            if name not in self.names_to_del:
                self.names_to_del.append(name)

    def __getstate__(self):
        """
        In order to avoid pickling a copy of the dataset whenever a monitor
        is saved, the __getstate__ method replaces the dataset field with the
        dataset's yaml source. This is not a perfect solution because it won't
        work with job resuming, which would require saving the state of the
        dataset's random number generator.

        Like in the Model class, we also need to avoid saving any Theano
        functions, so we delete everything that can be regenerated with
        `redo_theano` by deleting the fields in `self.names_to_del`
        """

        # Patch old pickled monitors
        if not hasattr(self, '_datasets'):
            self._datasets = [ self._dataset ]
            del self._dataset

        temp = self._datasets

        if self._datasets:
            self._datasets = []
            for dataset in temp:
                if isinstance(dataset, basestring):
                    self._datasets.append(dataset)
                else:
                    try:
                        self._datasets.append(dataset.yaml_src)
                    except AttributeError:
                        warnings.warn('Trained model saved without indicating yaml_src')
        d = copy.copy(self.__dict__)
        self._datasets = temp
        for name in self.names_to_del:
            if name in d:
                del d[name]


        return d

    def __setstate__(self, d):

        # patch old pkl files
        if '_dataset' in d:
            d['_datasets'] = [ d['_dataset'] ]
            del d['_dataset']

        self.__dict__.update(d)

    def add_channel(self, name, ipt, val, dataset=None, prereqs=None):
        """
        Asks the monitor to start tracking a new value.  Can be called even
        after the monitor is already in use.

        Parameters
        ----------
        name: str
            The display name in the monitor.
        ipt: tensor_like
            The symbolic tensor which should be clamped to the data.
            (or a (features,targets) list/tuple containing two symbolic tensors)
        val: tensor_like
            The value (function of `ipt`) to be tracked.
        dataset: A Dataset instance specifying which dataset to compute
            this channel on.
        prereqs: list of callables that take two numpy tensors
            (X and y, where y will be None if no labels are used)
            each prereq must be called exactly once per each new
            batch of data drawn *from dataset* before the channel
            value is computed
            if two channels provide a prereq with exactly the same
            id, that prereq will only be called once
        """

        if isinstance(val, (float, int, long)):
            val = np.cast[theano.config.floatX](val)

        val = T.as_tensor_variable(val)

        if not isinstance(ipt, (list, tuple)):
            tmp = [ ipt ]
        else:
            tmp = ipt
        inputs = theano.gof.graph.inputs([val])
        for elem in inputs:
            if not hasattr(elem, 'get_value') and not isinstance(elem, theano.gof.graph.Constant):
                if elem not in tmp:
                    raise ValueError("Unspecified input: "+str(elem))



        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            mode.record.handle_line('Adding monitor channel '+name+'\n')
            if isinstance(ipt, (list, tuple)):
                for elem in ipt:
                    mode.record.handle_line('Includes input var '+var_descriptor(elem)+'\n')
            else:
                mode.record.handle_line(name+' input var is '+var_descriptor(ipt)+'\n')
            mode.record.handle_line('channel '+name+' is '+var_descriptor(val)+'\n')

        if dataset is None:
            if len(self._datasets) == 1:
                dataset = self._datasets[0]
            elif len(self._datasets) == 0:
                raise ValueError(_err_no_data)
            else:
                raise ValueError(_err_ambig_data)

        try:
            self._datasets.index(dataset)
        except ValueError:
            raise ValueError("The dataset specified is not " + \
                "one of the monitor's datasets")

        if name in self.channels:
            raise ValueError("Tried to create the same channel twice (%s)" %
                             name)
        if isinstance(ipt, (list, tuple)):
            if dataset is not None:
                if not dataset.has_targets():
                    raise ValueError("Tried to create a channel ("+name \
                            +") that uses targets, but monitoring dataset has no targets")
            self.require_label = True
            assert len(ipt) == 2
        self.channels[name] = MonitorChannel(ipt, val, name, dataset, prereqs)
        self._dirty = True

    def _sanity_check(self):
        """
            Sometimes we serialize models and then load them somewhere else
            but still try to use their Monitor, and the Monitor is in a mangled
            state. I've added some calls to _sanity_check to try to catch when
            that happens. Not sure what to do for a long term fix. I think it
            requires making theano graphs serializable first.
        """
        for name in self.channels:
            channel = self.channels[name]
            assert hasattr(channel, 'prereqs')

    @classmethod
    def get_monitor(cls, model):
        """
        Returns a model's monitor. If the model doesn't have a monitor yet,
        installs one and returns that.

        Parameters
        ----------
        model : object
            An object that implements the `Model` interface specified in
            `pylearn2.models`.
        """

        if hasattr(model, 'monitor'):
            rval = model.monitor
            rval._sanity_check()
        else:
            rval = Monitor(model)
            model.monitor = rval

        return rval

    # TODO: find out if monitor.foo below are used anywhere, remove if not.
    @property
    def batch_size(self):
        return self._batch_size

    @property
    def num_batches(self):
        return self._num_batches

    def setup(self, dataset, cost, batch_size, num_batches = None, extra_costs=None,
            mode='sequential'):
        """
        Sets up the monitor for a cost minimization problem.
        Adds channels defined by both the model and the cost for
        the specified dataset(s), as well as a channel called 'objective'
        defined by the costs' __call__ method.

        dataset: a Dataset or dictionary mapping string names to Datasets
                    If string names are used, then for every dataset,
                    each channel defined by the model or cost will be
                    replicated with that dataset's name followed by an
                    underscore as the prefix.
                    For example, if your cost defines a channel called
                    'misclass', and datasets is {'train' : train_dataset,
                    'valid' : valid_dataset} you will get channels called
                    'train_misclass' and 'valid_misclass'.

        cost: a Cost

        """
        if dataset is None:
            return
        if isinstance(dataset, Dataset):
            dataset = {'': dataset}
        else:
            assert isinstance(dataset, dict)
            assert all(isinstance(key, str) for key in dataset)
            assert all(isinstance(dataset[key], Dataset) for key in dataset)

        if extra_costs is None:
            costs = {}
        else:
            costs = extra_costs
        assert '' not in costs
        costs[''] = cost

        supervised = any(cost.supervised for cost in costs.values())
        model = self.model

        X_space = model.get_input_space()
        X = X_space.make_theano_batch(name='monitor_X')

        if config.compute_test_value != 'off':
            X.tag.test_value = X_space.get_origin_batch(batch_size).astype(X.dtype)

        if supervised:
            Y_space = model.get_output_space()
            Y = Y_space.make_theano_batch(name='monitor_Y')

            if config.compute_test_value != 'off':
                Y.tag.test_value = Y_space.get_origin_batch(batch_size).astype(Y.dtype)

            ipt = (X, Y)
        else:
            Y = None
            ipt = X
        custom_channels = {}
        for cost_name in costs:
            if cost_name == '':
                prefix = ''
            else:
                prefix = cost_name + '_'
            cost = costs[cost_name]
            raw_channels = cost.get_monitoring_channels(model, X, Y)
            channels = {}
            for name in raw_channels:
                channels[prefix+name] = raw_channels[name]
            custom_channels.update(channels)
        model_channels = model.get_monitoring_channels(X, Y)
        custom_channels.update(model_channels)

        if is_stochastic(mode):
            seed = [[2013, 02, 22]]
        else:
            seed = None

        for dataset_name in dataset:
            cur_dataset = dataset[dataset_name]
            self.add_dataset(dataset=cur_dataset,
                                 mode=mode,
                                 batch_size=batch_size,
                                 num_batches=num_batches,
                                 seed=seed)
            if dataset_name == '':
                dprefix = ''
            else:
                dprefix = dataset_name + '_'
            # These channel name 'objective' must not vary, since callbacks that respond to the
            # values in the monitor use the name to find it.
            for cost_name in costs:
                cost = costs[cost_name]
                cost_value = cost(model, X, Y)
                if cost_value is not None:
                    if cost_name == '':
                        name = dprefix + 'objective'
                    else:
                        name = dprefix + cost_name
                    self.add_channel(name=name, ipt=ipt,
                        val=cost_value, dataset=cur_dataset)
            for key in custom_channels:
                self.add_channel(name=dprefix + key, ipt=ipt,
                        val=custom_channels[key], dataset=cur_dataset)
Beispiel #36
0
class Monitor(object):
    """
    A class for monitoring Models while they are being trained.

    A monitor object records the number of minibatches and number of examples
    the model has trained, as well as any number of "channels" that track
    quantities of interest (examples: the objective function, measures of
    hidden unit activity, reconstruction error, sum of squared second
    derivatives, average norm of the weight vectors,  etc.)
    """
    def __init__(self, model):
        """
        Makes a monitor for `model`. Assumes the model has not been trained at
        all yet.

        Parameters
        ----------
        model : pylearn2.models.model.Model instance
        """
        self.training_succeeded = False
        self.model = model
        self.channels = OrderedDict()
        self._num_batches_seen = 0
        self._examples_seen = 0
        self._epochs_seen = 0
        self._datasets = []
        self._iteration_mode = []
        self._batch_size = []
        self._num_batches = []
        self._dirty = True
        self._rng_seed = []
        self.names_to_del = ['theano_function_mode']
        self.t0 = time.time()
        self.theano_function_mode = None

        # Initialize self._nested_data_specs, self._data_specs_mapping,
        # and self._flat_data_specs
        self._build_data_specs()

    def _build_data_specs(self):
        """
        Computes a nested data_specs for input and all channels

        Also computes the mapping to flatten it. This function is called from
        redo_theano.
        """
        # Ask the model what it needs
        m_space, m_source = self.model.get_monitoring_data_specs()
        input_spaces = [m_space]
        input_sources = [m_source]
        for channel in self.channels.values():
            space = channel.data_specs[0]
            assert isinstance(space, Space)
            input_spaces.append(space)
            input_sources.append(channel.data_specs[1])

        nested_space = CompositeSpace(input_spaces)
        nested_source = tuple(input_sources)

        self._nested_data_specs = (nested_space, nested_source)
        self._data_specs_mapping = DataSpecsMapping(self._nested_data_specs)

        flat_space = self._data_specs_mapping.flatten(nested_space,
                                                      return_tuple=True)
        flat_source = self._data_specs_mapping.flatten(nested_source,
                                                       return_tuple=True)
        self._flat_data_specs = (CompositeSpace(flat_space), flat_source)

    def set_theano_function_mode(self, mode):
        """
        Parameters
        ----------
        mode : theano.compile.Mode
            Theano functions for the monitoring channels will be compiled and
            run using this mode.
        """
        if self.theano_function_mode != mode:
            self._dirty = True
            self.theano_function_mode = mode

    def add_dataset(self,
                    dataset,
                    mode='sequential',
                    batch_size=None,
                    num_batches=None,
                    seed=None):
        """
        Determines the data used to calculate the values of each channel.

        Parameters
        ----------
        dataset : object
            A `pylearn2.datasets.Dataset` object.
        mode : str or object, optional
            Iteration mode; see the docstring of the `iterator` method \
            on `pylearn2.datasets.Dataset` for details.
        batch_size : int, optional
            The size of an individual batch. Optional if `mode` is \
            'sequential' and `num_batches` is specified (batch size \
            will be calculated based on full dataset size).
        num_batches : int, optional
            The total number of batches. Unnecessary if `mode` is \
            'sequential' and `batch_size` is specified (number of \
            batches will be calculated based on full dataset size).
        seed : int, optional
            Optional. The seed to be used for random iteration modes.
        """
        # The user can ommit using lists if only one dataset is set
        if not isinstance(dataset, list):
            dataset = [dataset]
        if not isinstance(mode, list):
            mode = [mode]
        if not isinstance(batch_size, list):
            batch_size = [batch_size]
        if not isinstance(num_batches, list):
            num_batches = [num_batches]
        if seed is None:
            seed = [None] * len(dataset)
        if not isinstance(seed, list):
            seed = [seed]
        if len(mode) != len(dataset):
            raise ValueError("Received " + str(len(dataset)) +
                             " dataset but " + str(len(mode)) + " modes.")
        if any([len(l) != len(dataset) for l in [batch_size, seed]]):
            raise ValueError("make sure each dataset has its iteration " +
                             "batch size and number of batches.")
        for (d, m, b, n, sd) in safe_izip(dataset, mode, batch_size,
                                          num_batches, seed):
            try:
                it = d.iterator(mode=m,
                                batch_size=b,
                                num_batches=n,
                                data_specs=self._flat_data_specs,
                                return_tuple=True,
                                rng=sd)
            except ValueError as exc:
                raise ValueError("invalid iteration parameters in " +
                                 "Monitor.add_dataset: " + str(exc))
            if it.stochastic:
                # Must be a seed, not a random number generator. If it were a
                # random number generator, different iterators using it would
                # update its state, so we would not get the same iterator
                # each time. Also, must not be None, because this makes the
                # iterator pick a seed based on the clock
                if sd is None:
                    raise TypeError("Monitor requires a seed when using " +
                                    "stochastic iteration modes.")
                if not isinstance(sd, (list, tuple, int)):
                    raise TypeError("Monitor requires a seed (not a random " +
                                    "number generator) when using " +
                                    "stochastic iteration modes.")
            else:
                # The iterator should catch this, but let's double-check
                assert sd is None

            if not d in self._datasets:
                self._datasets.append(d)
                self._iteration_mode.append(m)
                self._batch_size.append(b)
                self._num_batches.append(n)
                self._rng_seed.append(sd)

    def __call__(self):
        """
        Runs the model on the monitoring dataset in order to add one data point
        to each of the channels.
        """

        # If the channels have changed at all, we need to recompile the theano
        # functions used to compute them
        if self._dirty:
            self.redo_theano()

        datasets = self._datasets

        # Set all channels' val_shared to 0
        self.begin_record_entry()
        for d, i, b, n, a, sd, ne in safe_izip(datasets, self._iteration_mode,
                                               self._batch_size,
                                               self._num_batches, self.accum,
                                               self._rng_seed,
                                               self.num_examples):
            if isinstance(d, basestring):
                d = yaml_parse.load(d)
                raise NotImplementedError()

            # need to put d back into self._datasets
            myiterator = d.iterator(mode=i,
                                    batch_size=b,
                                    num_batches=n,
                                    data_specs=self._flat_data_specs,
                                    return_tuple=True,
                                    rng=sd)

            # If self._flat_data_specs is empty, no channel needs data,
            # so we do not need to call the iterator in order to average
            # the monitored values across different batches, we only
            # have to call them once.
            if len(self._flat_data_specs[1]) == 0:
                X = ()
                self.run_prereqs(X, d)
                a(*X)

            else:
                actual_ne = 0
                for X in myiterator:
                    # X is a flat (not nested) tuple
                    self.run_prereqs(X, d)
                    a(*X)
                    actual_ne += self._flat_data_specs[0].np_batch_size(X)
                # end for X
                if actual_ne != ne:
                    raise RuntimeError("At compile time, your iterator said "
                                       "it had %d examples total, but at "
                                       "runtime it gave us %d." %
                                       (ne, actual_ne))
        # end for d

        log.info("Monitoring step:")
        log.info("\tEpochs seen: %d" % self._epochs_seen)
        log.info("\tBatches seen: %d" % self._num_batches_seen)
        log.info("\tExamples seen: %d" % self._examples_seen)
        t = time.time() - self.t0
        for channel_name in sorted(self.channels.keys(),
                                   key=number_aware_alphabetical_key):
            channel = self.channels[channel_name]
            channel.time_record.append(t)
            channel.batch_record.append(self._num_batches_seen)
            channel.example_record.append(self._examples_seen)
            channel.epoch_record.append(self._epochs_seen)
            val = channel.val_shared.get_value()
            channel.val_record.append(val)
            # TODO: use logging infrastructure so that user can configure
            # formatting
            if abs(val) < 1e4:
                val_str = str(val)
            else:
                val_str = '%.3e' % val

            log.info("\t%s: %s" % (channel_name, val_str))

    def run_prereqs(self, data, dataset):
        """
        Runs all "prerequistie functions" on a batch of data. Always called
        right before computing the monitoring channels on that batch.

        Parameters
        ----------
        data : tuple or Variable
            a member of the Space used as input to the monitoring functions
        dataset : Dataset
            the Dataset the data was drawn from
        """
        if dataset not in self.prereqs:
            return
        for prereq in self.prereqs[dataset]:
            prereq(*data)

    def get_batches_seen(self):
        """
        Returns the number of batches the model has learned on (assuming that
        the learning code has been calling Monitor.report_batch correctly).
        """
        return self._num_batches_seen

    def get_epochs_seen(self):
        """
        Returns
        -------
        epochs_seen : int
            The number of epochs the model has been trained on.
            One "epoch" is one pass through Dataset.iterator.
        """
        return self._epochs_seen

    def get_examples_seen(self):
        """
        Returns
        -------
        examples_seen : int
            The number of examples the model has learned on (assuming that
            the learning code has been calling Monitor.report_batch correctly)
        """
        return self._examples_seen

    def report_batch(self, num_examples):
        """
        Call this whenever the model has learned on another batch of examples.
        Report how many examples were learned on.

        Parameters
        ----------
        num_examples : int
            The number of examples learned on in this minibatch.
        """
        self._examples_seen += num_examples
        self._num_batches_seen += 1

    def report_epoch(self):
        """
        Call this whenever the model has completed another "epoch" of learning.
        We regard one pass through Dataset.iterator as one epoch.
        """
        self._epochs_seen += 1

    def redo_theano(self):
        """
        Recompiles Theano functions used by this monitor.

        This is called any time we need to evaluate the channels and the
        channel definitions have changed since last we called it, or if the
        theano functions are unavailable for any other reason (first time they
        are needed after construction or deserialization, etc.)

        All channels are compiled as part of the same theano function so that
        the theano optimizations can eliminate subexpressions that are shared
        between multiple channels.
        """
        self._dirty = False

        # Recompute the data specs, since the channels may have changed.
        self._build_data_specs()

        init_names = dir(self)
        self.prereqs = OrderedDict()
        for channel in self.channels.values():
            if channel.prereqs is not None:
                dataset = channel.dataset
                if dataset not in self.prereqs:
                    self.prereqs[dataset] = []
                prereqs = self.prereqs[dataset]
                for prereq in channel.prereqs:
                    if prereq not in prereqs:
                        prereqs.append(prereq)

        updates = OrderedDict()
        for channel in self.channels.values():
            updates[channel.val_shared] = np.cast[config.floatX](0.0)
        with log_timing(log, "compiling begin_record_entry"):
            self.begin_record_entry = function(
                inputs=[],
                updates=updates,
                mode=self.theano_function_mode,
                name='Monitor.begin_record_entry')
        updates = OrderedDict()
        givens = OrderedDict()
        # Get the appropriate kind of theano variable to represent the data
        # the model acts on
        batch_names = ['monitoring_%s' % s for s in self._flat_data_specs[1]]
        theano_args = self._flat_data_specs[0].make_theano_batch(batch_names)

        # Get a symbolic expression of the batch size
        # We do it here, rather than for each channel, because channels with an
        # empty data_specs do not use data, and are unable to extract the batch
        # size. The case where the whole data specs is empty is not supported.
        batch_size = self._flat_data_specs[0].batch_size(theano_args)

        # Also get a nested representation, for joint iteration
        # with each of channel.graph_input
        nested_theano_args = self._data_specs_mapping.nest(theano_args)
        if not isinstance(nested_theano_args, tuple):
            nested_theano_args = (nested_theano_args, )
        assert len(nested_theano_args) == (len(self.channels) + 1)

        log.info('Monitored channels: ')
        for key in sorted(self.channels.keys()):
            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                mode.record.handle_line('compiling monitor including ' +
                                        'channel ' + key + '\n')
            log.info('\t%s' % key)
        it = [
            d.iterator(mode=i,
                       num_batches=n,
                       batch_size=b,
                       data_specs=self._flat_data_specs,
                       return_tuple=True)
            for d, i, n, b in safe_izip(self._datasets, self._iteration_mode,
                                        self._num_batches, self._batch_size)
        ]
        self.num_examples = [
            np.cast[config.floatX](float(i.num_examples)) for i in it
        ]
        givens = [OrderedDict() for d in self._datasets]
        updates = [OrderedDict() for d in self._datasets]
        for i, channel in enumerate(self.channels.values()):
            index = self._datasets.index(channel.dataset)
            d = self._datasets[index]
            g = givens[index]
            cur_num_examples = self.num_examples[index]
            u = updates[index]

            # Flatten channel.graph_input and the appropriate part of
            # nested_theano_args, to iterate jointly over them.
            c_mapping = DataSpecsMapping(channel.data_specs)
            channel_inputs = c_mapping.flatten(channel.graph_input,
                                               return_tuple=True)
            inputs = c_mapping.flatten(nested_theano_args[i + 1],
                                       return_tuple=True)

            for (channel_X, X) in safe_izip(channel_inputs, inputs):
                assert channel_X not in g or g[channel_X] is X
                assert channel_X.type == X.type, (channel_X.type, X.type)
                g[channel_X] = X

            if batch_size == 0:
                # No channel does need any data, so there is not need to
                # average results, and we will call the accum functions only
                # once.
                # TODO: better handling of channels not needing data when
                # some other channels need data.
                assert len(self._flat_data_specs[1]) == 0
                val = channel.val
            else:
                if n == 0:
                    raise ValueError("Iterating over 0 examples results in " +
                                     "divide by 0")
                val = (channel.val * T.cast(batch_size, config.floatX) /
                       cur_num_examples)
            u[channel.val_shared] = channel.val_shared + val

        with log_timing(log, "Compiling accum"):
            # Check type of update expressions
            for up in updates:
                for key in up:
                    if key.dtype != up[key].dtype:
                        raise TypeError('Monitoring channel shared variable ' +
                                        key.name + ' has dtype ' + key.dtype +
                                        ' but is driven by an expression ' +
                                        'with type ' + up[key].dtype)

            self.accum = []
            for idx, packed in enumerate(safe_izip(givens, updates)):
                g, u = packed
                mode = self.theano_function_mode
                if mode is not None and hasattr(mode, 'record'):
                    for elem in g:
                        mode.record.handle_line('g key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('g val ' +
                                                var_descriptor(g[elem]) + '\n')
                    for elem in u:
                        mode.record.handle_line('u key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('u val ' +
                                                var_descriptor(u[elem]) + '\n')
                function_name = 'Monitor.accum[%d]' % idx
                if mode is not None and hasattr(mode, 'record'):
                    mode.record.handle_line('compiling supervised accum\n')
                # Some channels may not depend on the data, ie, they might just
                # monitor the model parameters, or some shared variable updated
                # by the training algorithm, so we need to ignore the unused
                # input error
                self.accum.append(
                    function(theano_args,
                             givens=g,
                             updates=u,
                             mode=self.theano_function_mode,
                             name=function_name))
            for a in self.accum:
                if mode is not None and hasattr(mode, 'record'):
                    for elem in a.maker.fgraph.outputs:
                        mode.record.handle_line('accum output ' +
                                                var_descriptor(elem) + '\n')
                log.info("graph size: %d" % len(a.maker.fgraph.toposort()))
        final_names = dir(self)
        self.register_names_to_del(
            [name for name in final_names if name not in init_names])

    def register_names_to_del(self, names):
        """
        Register names of fields that should be deleted before pickling.

        Parameters
        ----------
        names : list
            A list of attribute names as strings.
        """
        for name in names:
            if name not in self.names_to_del:
                self.names_to_del.append(name)

    def __getstate__(self):
        """
        In order to avoid pickling a copy of the dataset whenever a monitor
        is saved, the __getstate__ method replaces the dataset field with the
        dataset's yaml source. This is not a perfect solution because it won't
        work with job resuming, which would require saving the state of the
        dataset's random number generator.

        Like in the Model class, we also need to avoid saving any Theano
        functions, so we delete everything that can be regenerated with
        `redo_theano` by deleting the fields in `self.names_to_del`
        """

        # Patch old pickled monitors
        if not hasattr(self, '_datasets'):
            self._datasets = [self._dataset]
            del self._dataset

        temp = self._datasets

        if self._datasets:
            self._datasets = []
            for dataset in temp:
                if isinstance(dataset, basestring):
                    self._datasets.append(dataset)
                else:
                    try:
                        self._datasets.append(dataset.yaml_src)
                    except AttributeError:
                        warnings.warn('Trained model saved without ' +
                                      'indicating yaml_src')
        d = copy.copy(self.__dict__)
        self._datasets = temp
        for name in self.names_to_del:
            if name in d:
                del d[name]

        return d

    def __setstate__(self, d):
        """
        Sets the object to have the state described by `d`.

        Parameters
        ----------
        d : dict
            A dictionary mapping string names of fields to values for
            these fields.
        """
        # patch old pkl files
        if '_dataset' in d:
            d['_datasets'] = [d['_dataset']]
            del d['_dataset']

        self.__dict__.update(d)

    def add_channel(self,
                    name,
                    ipt,
                    val,
                    dataset=None,
                    prereqs=None,
                    data_specs=None):
        """
        Asks the monitor to start tracking a new value.  Can be called even
        after the monitor is already in use.

        Parameters
        ----------
        name : str
            The display name in the monitor.
        ipt : tensor_like
            The symbolic tensor which should be clamped to the data. \
            (or a list/tuple containing symbolic tensors, following the \
            data_specs)
        val : tensor_like
            The value (function of `ipt`) to be tracked.
        dataset : pylearn2.datasets.Dataset
            Which dataset to compute this channel on
        prereqs : list of callables that take a list of numpy tensors
            Each prereq must be called exactly once per each new batch of \
            data drawn *from dataset* before the channel value is computed \
            if two channels provide a prereq with exactly the same id, that \
            prereq will only be called once
        data_specs : (space, source) pair
            Identifies the order, format and semantics of ipt
        """
        if isinstance(val, (float, int, long)):
            val = np.cast[theano.config.floatX](val)

        val = T.as_tensor_variable(val)

        if data_specs is None:
            warnings.warn("parameter 'data_specs' should be provided when " +
                          "calling add_channel. We will build a default one.",
                          stacklevel=2)
            if isinstance(ipt, list):
                ipt = tuple(ipt)
            if ipt is not None and not isinstance(ipt, tuple):
                ipt = (ipt, )

            if ipt is None:
                data_specs = (NullSpace(), '')
            elif len(ipt) == 0:
                data_specs = (CompositeSpace([]), ())
            elif hasattr(dataset, 'get_data_specs'):
                dataset_space, dataset_source = dataset.get_data_specs()
                if (len(ipt) == 1 and dataset_source is not None
                        and (not isinstance(dataset_source, tuple)
                             or len(dataset_source) == 1)
                        and 'features' in dataset_source):
                    data_specs = (dataset_space, dataset_source)
                elif (len(ipt) == 2
                      and dataset_source == ('features', 'targets')):
                    data_specs = (dataset_space, dataset_source)
                else:
                    raise ValueError("Cannot infer default data_specs for " +
                                     "the following input points and " +
                                     "dataset: ipt = %s, dataset = %s" %
                                     (ipt, dataset))

        data_specs[0].validate(ipt)

        mapping = DataSpecsMapping(data_specs)
        flat_ipt = mapping.flatten(ipt)
        if not isinstance(flat_ipt, tuple):
            flat_ipt = (flat_ipt, )
        inputs = theano.gof.graph.inputs([val])
        for elem in inputs:
            if not hasattr(elem, 'get_value') and \
               not isinstance(elem, theano.gof.graph.Constant):
                if elem not in flat_ipt:
                    raise ValueError("Unspecified input: " + str(elem) +
                                     ". This may be due to an incorrect " +
                                     "implementation of a cost's " +
                                     "get_data_specs() method, or of a " +
                                     "model's get_monitoring_data_specs() " +
                                     "method.")

        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            mode.record.handle_line('Adding monitor channel ' + name + '\n')
            assert isinstance(flat_ipt, tuple)
            if len(flat_ipt) != 1:
                for elem in flat_ipt:
                    mode.record.handle_line('Includes input var ' +
                                            var_descriptor(elem) + '\n')
            else:
                mode.record.handle_line(name + ' input var is ' +
                                        var_descriptor(flat_ipt[0]) + '\n')
            mode.record.handle_line('channel ' + name + ' is ' +
                                    var_descriptor(val) + '\n')

        if dataset is None:
            if len(self._datasets) == 1:
                dataset = self._datasets[0]
            elif len(self._datasets) == 0:
                raise ValueError(_err_no_data)
            else:
                raise ValueError(_err_ambig_data)

        try:
            self._datasets.index(dataset)
        except ValueError:
            raise ValueError("The dataset specified is not one of the " +
                             "monitor's datasets")

        if name in self.channels:
            raise ValueError("Tried to create the same channel twice (%s)" %
                             name)

        self.channels[name] = MonitorChannel(ipt, val, name, data_specs,
                                             dataset, prereqs)
        self._dirty = True

    def _sanity_check(self):
        """
        Sometimes we serialize models and then load them somewhere else
        but still try to use their Monitor, and the Monitor is in a mangled
        state. I've added some calls to _sanity_check to try to catch when
        that happens. Not sure what to do for a long term fix. I think it
        requires making theano graphs serializable first.
        """
        for name in self.channels:
            channel = self.channels[name]
            assert hasattr(channel, 'prereqs')

    @classmethod
    def get_monitor(cls, model):
        """
        Returns a model's monitor. If the model doesn't have a monitor yet,
        installs one and returns that.

        Parameters
        ----------
        model : object
            An object that implements the `Model` interface specified in \
            `pylearn2.models`.
        """

        if hasattr(model, 'monitor'):
            rval = model.monitor
            rval._sanity_check()
        else:
            rval = Monitor(model)
            model.monitor = rval

        return rval

    # TODO: find out if this method is used anywhere, remove if not.
    @property
    def batch_size(self):
        """
        Returns
        -------
        batch_size : int
            The size of the batches used for monitoring
        """
        return self._batch_size

    # TODO: find out if this method is used anywhere, remove if not.
    @property
    def num_batches(self):
        """
        Returns
        -------
        num_batches : int
            The number of batches used for monitoring
        """
        return self._num_batches

    def setup(self,
              dataset,
              cost,
              batch_size,
              num_batches=None,
              extra_costs=None,
              mode='sequential',
              obj_prereqs=None,
              cost_monitoring_args=None):
        """
        Sets up the monitor for a cost minimization problem.
        Adds channels defined by both the model and the cost for
        the specified dataset(s), as well as a channel called 'objective'
        defined by the costs' __call__ method.

        Parameters
        ----------
        dataset : pylearn2.datasets.Dataset
            Dataset or dictionary mapping string names to Datasets.  If \
            string names are used, then for every dataset, each channel \
            defined by the model or cost will be replicated with that \
            dataset's name followed by an underscore as the prefix. For \
            example, if your cost defines a channel called 'misclass', and \
            datasets is {'train' : train_dataset, 'valid' : valid_dataset} \
            you will get channels called 'train_misclass' and 'valid_misclass'.
        cost : pylearn2.costs.Cost
            The cost being optimized by training. The value of the cost will
            appear as the `objective` channel. Its `get_monitoring_channels`
            method will also be used to supply other channels.
        extra_costs : OrderedDict, optional
            A dictionary mapping channel names to Cost objects.
            Their value will appear as the specified channel name.
            They will also provide more monitoring channels via their
            `get_monitoring_channels` method.
        obj_prereqs : None, or list of functions
            Functions to pass as prerequisites to the `objective` channel.
        cost_monitoring_args : dict
            Dictionary of kwargs that will be passed to \
            `cost.get_monitoring_channels()` (but not for the extra_costs).
        """

        if dataset is None:
            return
        if isinstance(dataset, Dataset):
            dataset = {'': dataset}
        else:
            assert isinstance(dataset, dict)
            assert all(isinstance(key, str) for key in dataset)
            assert all(isinstance(dataset[key], Dataset) for key in dataset)

        if extra_costs is None:
            costs = {}
        else:
            costs = extra_costs
        assert '' not in costs
        costs[''] = cost

        if cost_monitoring_args is None:
            cost_monitoring_args = {}

        model = self.model

        # Build a composite data_specs containing the specs for all costs,
        # then the specs of the model
        cost_names = sorted(costs.keys())
        spaces = []
        sources = []
        for c in cost_names:
            c_space, c_source = costs[c].get_data_specs(model)
            spaces.append(c_space)
            sources.append(c_source)

        # Ask the model for the data_specs needed
        m_space, m_source = model.get_monitoring_data_specs()
        spaces.append(m_space)
        sources.append(m_source)

        nested_space = CompositeSpace(spaces)
        nested_sources = tuple(sources)

        # Flatten this data_specs, so we build only one symbolic Theano
        # variable for each of the unique (space, source) pairs.
        mapping = DataSpecsMapping((nested_space, nested_sources))
        space_tuple = mapping.flatten(nested_space, return_tuple=True)
        source_tuple = mapping.flatten(nested_sources, return_tuple=True)
        ipt = tuple(
            space.make_theano_batch(name='monitor_%s' % source,
                                    batch_size=None)
            for (space, source) in safe_zip(space_tuple, source_tuple))

        # Build a nested tuple from ipt, to dispatch the appropriate parts
        # of the ipt batch to each cost
        nested_ipt = mapping.nest(ipt)

        custom_channels = {}
        for i, cost_name in enumerate(cost_names):
            if cost_name == '':
                prefix = ''
            else:
                prefix = cost_name + '_'
            cost = costs[cost_name]
            cost_ipt = nested_ipt[i]
            raw_channels = cost.get_monitoring_channels(model, cost_ipt)
            channels = {}
            for name in raw_channels:
                # We need three things: the value itself (raw_channels[name]),
                # the input variables (cost_ipt), and the data_specs for
                # these input variables ((spaces[i], sources[i]))
                channels[prefix + name] = (raw_channels[name], cost_ipt,
                                           (spaces[i], sources[i]))
            custom_channels.update(channels)

        # Use the last inputs from nested_ipt for the model
        model_channels = model.get_monitoring_channels(nested_ipt[-1])
        channels = {}
        for name in model_channels:
            # Note: some code used to consider that model_channels[name]
            # could be a a (channel, prereqs) pair, this is not supported.
            channels[name] = (model_channels[name], nested_ipt[-1],
                              (spaces[-1], sources[-1]))
        custom_channels.update(channels)

        if is_stochastic(mode):
            seed = [[2013, 02, 22]]
        else:
            seed = None

        for dataset_name in dataset:
            cur_dataset = dataset[dataset_name]
            self.add_dataset(dataset=cur_dataset,
                             mode=mode,
                             batch_size=batch_size,
                             num_batches=num_batches,
                             seed=seed)
            if dataset_name == '':
                dprefix = ''
            else:
                dprefix = dataset_name + '_'
            # These channel name 'objective' must not vary, since callbacks
            # that respond to the values in the monitor use the name to find
            # it.
            for i, cost_name in enumerate(cost_names):
                cost = costs[cost_name]
                cost_ipt = nested_ipt[i]
                cost_value = cost.expr(model, cost_ipt)
                if cost_value is not None:
                    if cost_name == '':
                        name = dprefix + 'objective'
                        prereqs = obj_prereqs
                    else:
                        name = dprefix + cost_name
                        prereqs = None

                    cost.get_data_specs(model)[0].validate(cost_ipt)
                    self.add_channel(name=name,
                                     ipt=cost_ipt,
                                     val=cost_value,
                                     data_specs=cost.get_data_specs(model),
                                     dataset=cur_dataset,
                                     prereqs=prereqs)

            for key in custom_channels:
                val, ipt, data_specs = custom_channels[key]
                data_specs[0].validate(ipt)
                self.add_channel(name=dprefix + key,
                                 ipt=ipt,
                                 val=val,
                                 data_specs=data_specs,
                                 dataset=cur_dataset)
Beispiel #37
0
class Optimizer(object):
    """
    Default interface for an optimizer implementation - this provides the necessary parameter updates when
    training a model on a dataset using an online stochastic process.
    """
    def __init__(self, model, dataset,
                 n_epoch=1000, batch_size=100, minimum_batch_size=1,
                 save_frequency=10, early_stop_threshold=.9995, early_stop_length=30,
                 learning_rate=1e-3, lr_decay='exponential', lr_factor=1,
                 **kwargs):
        """
        Initialize the Optimizer.

        Parameters
        ----------
        model : Model
            The Model to train.
        dataset : Dataset
            The Dataset to use when training the Model.
        n_epoch : int
            how many training iterations over the dataset to go.
        batch_size : int
            How many examples from the training dataset to use in parallel.
        minimum_batch_size : int
            The minimum number of examples required at a time (for things like time series, this would be > 1).
        save_frequency : int
            How many epochs to train between each new save of the Model's parameters.
        early_stop_threshold : float
            The factor by how much the best validation training score needs to improve to determine early stopping.
        early_stop_length : int
            The patience or number of epochs to wait after the early_stop_threshold has been reached before stopping.
        learning_rate : float
            The multiplicative amount to adjust parameters based on their gradient values.
        lr_decay : str
            The type of decay function to use for changing the learning rate over epochs. See
            `opendeep.utils.decay` for options.
        lr_factor : float
            The amount to use for the decay function when changing the learning rate over epochs. See
            `opendeep.utils.decay` for its effect for given decay functions.
        """
        log.info("Initializing optimizer %s", str(type(self)))

        if early_stop_threshold is None:
            early_stop_threshold = 1.
        if save_frequency is None:
            save_frequency = 1000000
        if early_stop_length is None:
            early_stop_length = 100

        self.args = locals().copy()
        self.args.pop('self')
        kwargs = self.args.pop('kwargs')
        self.args = add_kwargs_to_dict(kwargs, self.args)
        # log the arguments
        log.info("optimizer config args: %s", str(self.args))

        assert isinstance(model, Model), "Optimizer input model needs to be an opendeep Model class!"
        assert isinstance(dataset, Dataset), "Optimizer input dataset needs to be an opendeep Dataset class!"
        self.model = model
        self.dataset = dataset

        # Learning rate - how drastic of a step do the parameters change
        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        self.lr_scalers = self.model.get_lr_scalers()
        if lr_decay:
            self.learning_rate_decay = get_decay_function(lr_decay,
                                                          self.learning_rate,
                                                          self.learning_rate.get_value(),
                                                          lr_factor)
        else:
            self.learning_rate_decay = False

        self.noise_switches = raise_to_list(self.model.get_noise_switch())
        self.batch_size = batch_size
        self.minimum_batch_size = minimum_batch_size
        self.n_epoch = n_epoch
        self.save_frequency = save_frequency
        self.early_stop_threshold = early_stop_threshold
        self.early_stop_length = early_stop_length

    def _get_batch_indices(self, data_lengths):
        """
        Computes the tuples of (start_index, end_index) that represent the appropriate slices of the concatenated
        dataset with regards to the given data_lengths. This allows for lists of data lengths to represent sequences,
        so that the concatenated batches returned do not overstep the start of a new sequence.

        Parameters
        ----------
        data_lengths : list(int) or int
            List of num_examples for each dataset (the length of the datasets - this is a list in the case of
            sequences).

        Returns
        -------
        list((int, int))
            List of tuples (start, end) representing the batch slices for the total dataset if it were concatenated.
        """
        batch_indices = []
        start_idx = 0
        for len in raise_to_list(data_lengths):
            # integer division to determine number of whole batches for this length
            n_batches = len / int(self.batch_size)
            # add the (start_idx, end_idx) tuple to the list
            for i in range(n_batches):
                end_idx = start_idx + self.batch_size
                batch_indices.append((start_idx, end_idx))
                start_idx = end_idx
            # remainder to find number of leftover examples
            remainder = numpy.remainder(len, self.batch_size)
            end_idx = start_idx + remainder
            # check if it is bigger than the minimum allowed size
            if remainder >= self.minimum_batch_size:
                batch_indices.append((start_idx, end_idx))
            start_idx = end_idx
        return batch_indices

    def _get_givens_subset(self, subset, batch_slice):
        """
        This translates a batch slice of start and end indices into the actual data from the given subset.

        Parameters
        ----------
        subset : int
            The subset to use - determined in opendeep.data.datasets as TRAIN, VALID, or TEST attributes.
        batch_slice : symbolic slice
            The symbolic slice to grab from the data.

        Returns
        -------
        OrderedDict
            The givens to provide to a function where it sets the input variable to the actual batch representation
            of data from the dataset: (input_variable: data[batch])
        """
        # translate the data_idx into the givens for the model
        # first get the lists of input variables the model requires - inputs and targets
        model_inputs = raise_to_list(self.model.get_inputs())
        model_targets = raise_to_list(self.model.get_targets())
        givens = None
        if self.dataset.getSubset(subset)[0] is not None:
            # grab the data and labels
            data, labels = self.dataset.getSubset(subset)
            # create the givens for the input function as pairs of (input_variable: sliced_data)
            givens = OrderedDict(zip(model_inputs, [data[batch_slice]]))
            # include labels as well if they are required by the model
            if model_targets is not None and len(model_targets) > 0:
                if labels is None:
                    log.error("No labels in the dataset!")
                    raise AssertionError, "No lables in the dataset!"
                givens.update(OrderedDict(zip(model_targets, [labels[batch_slice]])))
        else:
            log.warning("Dataset doesn't have subset %s" % get_subset_strings(subset))

        return givens

    def get_updates(self, gradients):
        """
        This returns the parameter updates to use during training. It defaults to only using (annealed) learning rate.

        Parameters
        ----------
        gradients : dict
            A dictionary mapping from the model's parameters to their
            gradients.

        Returns
        -------
        updates : OrderdDict
            A dictionary mapping from the old model parameters, to their new
            values after a single iteration of the learning rule.
        """
        log.debug('Setting up Stochastic Gradient Descent for optimizer...')
        updates = OrderedDict()
        for (param, gradient) in six.iteritems(gradients):
            scaled_lr = self.learning_rate * self.lr_scalers.get(param, 1.)
            updates[param] = param - scaled_lr * gradient
        return updates

    def get_lr_monitor(self):
        """
        Returns a monitor dictionary to the Optimizer's learning rate.

        Returns
        -------
        dict
            Mapping 'learning_rate' to `self.learning_rate` shared variable.
        """
        return {'learning_rate': self.learning_rate}

    def train(self, monitor_channels=None, train_outservice=None, plot=None, continue_training=False):
        """
        This method performs the training!!!
        It is an online training method that goes over minibatches from the dataset for a number of epochs,
        updating parameters after each minibatch.

        You can disrupt training with a KeyBoardInterrupt and it should exit/save parameters gracefully.

        Parameters
        ----------
        monitor_channels : list(MonitorsChannel or Monitor), optional
            The list of channels or monitors containing monitor expressions/variables to compile and evaluate
            on the data.
        train_outservice : OutService, optional
            The OutService to use for the automatically created train_cost monitor. Default of None just outputs
            to logs.
        plot : Plot, optional
            The Plot object to use if we want to graph the outputs (uses bokeh server).
        continue_training : bool
            Whether to continue training from a previous point.
        """
        ###############################################
        # theano index variable to use on the dataset #
        ###############################################
        # index to a [mini]batch - both start and end
        data_idx = T.iscalar('data_index')
        data_end_idx = T.iscalar('data_end_index')
        function_input = [data_idx, data_end_idx]
        batch_slice = slice(data_idx, data_end_idx)

        # compute number of minibatches for training, validation and testing
        # shapes is list of list - input list of datasets to optimizer (for multiple inputs), and each dataset
        # could be a list of shared variables (like multiple sequences from files)
        train_data_shapes = raise_to_list(self.dataset.getDataShape(TRAIN))
        valid_data_shapes = raise_to_list(self.dataset.getDataShape(VALID))
        test_data_shapes = raise_to_list(self.dataset.getDataShape(TEST))

        # train_batches is going to be lists of tuples that contain the start and end indices for train data.
        # this is more useful in the case of datasets that are lists of sequences, so that the start and end
        # indices can make sure a batch does not cross the sequence boundary on the concatenated data
        train_data_lens = [shape[0] for shape in train_data_shapes]
        self.train_batches = self._get_batch_indices(train_data_lens)

        if valid_data_shapes is not None:
            valid_data_lens = [shape[0] for shape in valid_data_shapes]
            self.valid_batches = self._get_batch_indices(valid_data_lens)
        else:
            self.valid_batches = None
        if test_data_shapes is not None:
            test_data_lens = [shape[0] for shape in test_data_shapes]
            self.test_batches = self._get_batch_indices(test_data_lens)
        else:
            self.test_batches = None

        # create the givens for the input function as pairs of (input_variable: sliced_data)
        train_givens = self._get_givens_subset(TRAIN, batch_slice)
        valid_givens = self._get_givens_subset(VALID, batch_slice)
        test_givens = self._get_givens_subset(TEST, batch_slice)

        # Now time to create the gradient updates for the model - make sure to handle the possible
        # list of costs used for pretraining of certain parts of the model.
        train_costs = raise_to_list(self.model.get_train_cost())
        train_updates = []
        self.gradients = []
        for i, train_cost in enumerate(train_costs):
            # Now create the training cost function for the model to use while training - update parameters
            # gradient!
            gradients, _ = self.model.get_gradient(cost=train_cost)
            self.gradients.append(gradients)

            # Calculate the optimizer updates each run
            # This is where the magic happens for a lot of sub-implementations of SGD!
            # It tells how to update the params each training epoch
            gradient_updates = self.get_updates(gradients)

            # Combine the updates from the model also if applicable
            updates = self.model.get_updates()
            if updates:
                updates.update(gradient_updates)
            else:
                updates = gradient_updates
            train_updates.append(updates)

        # grab the model parameters to use during training
        self.params = self.model.get_params()
        log.info("%s params: %s", str(type(self.model)), str(self.params))

        # deal with the monitor channels if they were given (or take them from the plot)
        if monitor_channels is None and plot is not None and len(plot.channels) > 0:
            monitor_channels = plot.channels
        self.train_monitors_dict = {}
        self.valid_monitors_dict = {}
        self.test_monitors_dict = {}
        self.train_monitors_outservice_dict = {}
        self.valid_monitors_outservice_dict = {}
        self.test_monitors_outservice_dict = {}
        if monitor_channels:
            # collapse the appropriate monitors into their (name, expression, out_service) tuples
            train_collapsed = collapse_channels(monitor_channels, train=True)
            valid_collapsed = collapse_channels(monitor_channels, valid=True)
            test_collapsed  = collapse_channels(monitor_channels, test=True)
            # get name: expression dictionary
            self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
            self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
            self.test_monitors_dict  = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
            # get name: outservice dictionary
            self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
            self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
            self.test_monitors_outservice_dict  = OrderedDict([(name, out) for name, _, out in test_collapsed])
        # finally deal with an outservice provided to monitor training cost
        self.train_outservice = train_outservice
        # remove redundant files made by the fileservice for the train monitor.
        # TODO: THIS FEELS LIKE A HACK. I don't like it.
        if isinstance(self.train_outservice, FileService):
            os.remove(self.train_outservice.valid_filename)
            os.remove(self.train_outservice.test_filename)

        #######################################
        # compile train and monitor functions #
        #######################################
        train_functions = []
        for i in range(len(train_costs)):
            updates = train_updates[i]
            train_cost = train_costs[i]
            # Compile the training function!
            log.info('Compiling f_learn %d/%d function for model %s...', i + 1, len(train_updates),
                     str(type(self.model)))
            t = time.time()

            f_learn = function(inputs=function_input,
                               updates=updates,
                               outputs=[train_cost] + self.train_monitors_dict.values(),
                               givens=train_givens,
                               name='f_learn_%d' % i)

            log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))
            train_functions.append(f_learn)

        # figure out if we want valid and test
        self.valid_flag = (self.dataset.getSubset(VALID)[0] is not None) and (len(self.valid_monitors_dict) > 0)
        self.test_flag = (self.dataset.getSubset(TEST)[0] is not None) and (len(self.test_monitors_dict) > 0)
        # Now compile the monitor functions!
        log.debug("Compiling monitor functions...")
        monitor_t = time.time()
        # valid monitors
        if self.valid_flag:
            self.valid_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=self.valid_monitors_dict.values(),
                givens=valid_givens,
                name='valid_monitor_function'
            )
        else:
            self.valid_monitor_function = None

        # test monitors
        if self.test_flag:
            self.test_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=self.test_monitors_dict.values(),
                givens=test_givens,
                name='test_monitor_function'
            )
        else:
            self.test_monitor_function = None

        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

        ##################
        # start training #
        ##################
        # make sure to deal with a list of train_cost functions - for layer-wise pretraining!
        # this list of training functions was created during __init__()
        start_time = time.time()
        for func_i, train_function in enumerate(train_functions):
            log.info("-----------TRAINING %s function %d/%d FOR %d EPOCHS (continue_training=%s)-----------",
                     str(type(self.model)), func_i + 1, len(train_functions), self.n_epoch, str(continue_training))

            log.debug("Train dataset size is: %s", self.dataset.getDataShape(TRAIN))
            if self.dataset.getSubset(VALID)[0] is not None:
                log.debug("Valid dataset size is: %s", self.dataset.getDataShape(VALID))
            if self.dataset.getSubset(TEST)[0] is not None:
                log.debug("Test dataset size is: %s", self.dataset.getDataShape(TEST))

            self.STOP = False
            self.epoch_counter = 0
            if not continue_training:
                # reset any decay params
                for decay_param in self.get_decay_params():
                    decay_param.reset()

            self.times = []
            self.best_cost = numpy.inf
            self.best_params = None
            self.patience = 0

            t = time.time()

            while not self.STOP:
                try:
                    self.STOP = self._perform_one_epoch(train_function, plot)
                except KeyboardInterrupt:
                    log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                    self.STOP = True

            # save params
            if self.best_params is not None:
                log.debug("Restoring best model parameters...")
                set_shared_values(self.params, self.best_params)
            log.debug("Saving model parameters...")
            self.model.save_params('trained_epoch_' + str(self.epoch_counter) + '.pkl')

            log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))

        log.info("------------TOTAL %s TRAIN TIME TOOK %s---------",
                 str(type(self.model)), make_time_units_string(time.time() - start_time))


    def _perform_one_epoch(self, f_learn, plot=None):
        """
        Performs a single training iteration with the given learn function.
        """
        self.epoch_counter += 1
        t = time.time()
        log.info('EPOCH %s', str(self.epoch_counter))

        # set the noise switches on for training function! (this is where things like dropout happen)
        switch_vals = []
        if len(self.noise_switches) > 0 and (self.valid_flag or self.test_flag or self.epoch_counter == 1):
            log.debug("Turning on %s noise switches", str(len(self.noise_switches)))
            switch_vals = [switch.get_value() for switch in self.noise_switches]
            [switch.set_value(1.) for switch in self.noise_switches]

        # train
        train_costs = []
        train_monitors = {key: [] for key in self.train_monitors_dict.keys()}
        for batch_start, batch_end in self.train_batches:
            _outs = raise_to_list(f_learn(batch_start, batch_end))
            train_costs.append(_outs[0])
            # handle any user defined monitors
            if len(train_monitors) > 0:
                current_monitors = zip(self.train_monitors_dict.keys(), _outs[1:])
                for name, val in current_monitors:
                    train_monitors[name].append(val)

        # get the mean values for the batches
        mean_train = numpy.mean(train_costs, 0)
        current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in train_monitors.items()}
        # log the mean values!
        log.info('Train cost: %s', trunc(mean_train))
        if len(current_mean_monitors) > 0:
            log.info('Train monitors: %s', str(current_mean_monitors))
        # send the values to their outservices
        if self.train_outservice:
            self.train_outservice.write(mean_train, TRAIN)
        for name, service in self.train_monitors_outservice_dict.items():
            if name in current_mean_monitors and service:
                service.write(current_mean_monitors[name], TRAIN)
        # if there is a plot, also send them over!
        if plot:
            current_mean_monitors.update({TRAIN_COST_KEY: mean_train})
            plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)

        # set the noise switches off for valid and test sets! we assume unseen data is noisy anyway :)
        if len(self.noise_switches) > 0 and (self.valid_flag or self.test_flag):
            log.debug("Turning off %s noise switches", str(len(self.noise_switches)))
            [switch.set_value(0.) for switch in self.noise_switches]

        # valid
        if self.valid_flag:
            valid_monitors = {key: [] for key in self.valid_monitors_dict.keys()}
            for batch_start, batch_end in self.valid_batches:
                _outs = raise_to_list(self.valid_monitor_function(batch_start, batch_end))
                current_monitors = zip(self.valid_monitors_dict.keys(), _outs)
                for name, val in current_monitors:
                    valid_monitors[name].append(val)

            # get the mean values for the batches
            current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in valid_monitors.items()}
            # log the mean values!
            log.info('Valid monitors: %s', str(current_mean_monitors))
            # send the values to their outservices
            for name, service in self.valid_monitors_outservice_dict.items():
                if name in current_mean_monitors and service:
                    service.write(current_mean_monitors[name], VALID)
            # if there is a plot, also send them over!
            if plot:
                plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)

        #test
        if self.test_flag:
            test_monitors = {key: [] for key in self.test_monitors_dict.keys()}
            for batch_start, batch_end in self.test_batches:
                _outs = raise_to_list(self.test_monitor_function(batch_start, batch_end))
                current_monitors = zip(self.test_monitors_dict.keys(), _outs)
                for name, val in current_monitors:
                    test_monitors[name].append(val)

            # get the mean values for the batches
            current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in test_monitors.items()}
            # log the mean values!
            log.info('Test monitors: %s', str(current_mean_monitors))
            # send the values to their outservices
            for name, service in self.test_monitors_outservice_dict.items():
                if name in current_mean_monitors and service:
                    service.write(current_mean_monitors[name], TEST)
            # if there is a plot, also send them over!
            if plot:
                plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)

        # check for early stopping on train costs
        cost = numpy.sum(train_costs)
        if cost < self.best_cost * self.early_stop_threshold:
            self.patience = 0
            self.best_cost = cost
            # save the parameters that made it the best
            self.best_params = get_shared_values(self.params)
        else:
            self.patience += 1

        # check for stopping either from n_epochs or from threshold/patience
        stop = False
        if self.epoch_counter >= self.n_epoch:
            log.info("Stopping (reached max number of epochs)...")
            stop = True
        if self.patience >= self.early_stop_length:
            log.info("Stopping early (reached stop threshold)...")
            stop = True

        timing = time.time() - t
        self.times.append(timing)

        log.info('time: ' + make_time_units_string(timing))

        log.debug('remaining time: ' +
                 make_time_units_string((self.n_epoch - self.epoch_counter) * numpy.mean(self.times)))

        if (self.epoch_counter % self.save_frequency) == 0:
            #save params
            self.model.save_params('trained_epoch_' + str(self.epoch_counter) + '.pkl')

        # ANNEAL!
        if not stop:
            # perform the appropriate decay on the decay functions/parameters for this optimizer and model
            for decay_param in self.get_decay_params():
                decay_param.decay()

        # reset the switches
        if len(self.noise_switches) > 0:
            [switch.set_value(val) for switch, val in zip(self.noise_switches, switch_vals)]

        # return whether or not to stop this epoch
        return stop

    def get_decay_params(self):
        """
        Returns a list of all the Decay objects to decay during training.

        Returns
        -------
        list
            List of Decay objects to use after each training epoch - in this case the
            learning rate decay.
        """
        decay_params = self.model.get_decay_params()
        if hasattr(self, 'learning_rate_decay') and self.learning_rate_decay:
            decay_params.append(self.learning_rate_decay)
        return decay_params