def main():
    var = theano.shared(T.zeros(shape=(88, 100), dtype=theano.config.floatX).eval(), name='W')
    updates = [(var, add_uniform(input=var, noise_level=.02))]

    stats = get_stats(var)
    l1 = stats.pop('l1')
    l2 = stats.pop('l2')
    min = stats.pop('min')
    max = stats.pop('max')
    var = stats.pop('var')
    std = stats.pop('std')
    mean = stats.pop('mean')

    mean_monitor = Monitor('mean', mean, train=True, valid=True, out_service=FileService('outs/mean.txt'))
    var_monitor = Monitor('var', var, out_service=FileService('outs/var.txt'))

    w_channel = MonitorsChannel('W', monitors=mean_monitor)

    stat_channel = MonitorsChannel('stats', monitors=[var_monitor])

    monitors = [w_channel, stat_channel]

    train_collapsed_raw = collapse_channels(monitors, train=True)
    train_collapsed = OrderedDict([(item[0], item[1]) for item in train_collapsed_raw])
    train_services = OrderedDict([(item[0], item[2]) for item in train_collapsed_raw])
    valid_collapsed_raw = collapse_channels(monitors, valid=True)
    valid_collapsed = OrderedDict([(item[0], item[1]) for item in valid_collapsed_raw])
    valid_services = OrderedDict([(item[0], item[2]) for item in valid_collapsed_raw])

    log.debug('compiling...')
    f = theano.function(inputs=[], outputs=list(train_collapsed.values()), updates=updates)
    f2 = theano.function(inputs=[], outputs=list(valid_collapsed.values()), updates=updates)
    log.debug('done')

    t1=time.time()

    for epoch in range(10):
        t=time.time()
        log.debug(epoch)
        vals = f()
        m = OrderedDict(zip(train_collapsed.keys(), vals))
        for name, service in train_services.items():
            if name in m:
                service.write(m[name], "train")
        log.debug('----- '+make_time_units_string(time.time()-t))

    for epoch in range(10):
        t = time.time()
        log.debug(epoch)
        vals = f2()
        m = OrderedDict(zip(valid_collapsed.keys(), vals))
        for name, service in valid_services.items():
            if name in m:
                service.write(m[name], "valid")
        log.debug('----- ' + make_time_units_string(time.time() - t))

    log.debug("TOTAL TIME "+make_time_units_string(time.time()-t1))
Exemple #2
0
def main():
    var = theano.shared(T.zeros(shape=(88, 100), dtype=theano.config.floatX).eval(), name='W')
    updates = [(var, add_uniform(input=var, noise_level=.02))]

    stats = get_stats(var)
    l1 = stats.pop('l1')
    l2 = stats.pop('l2')
    min = stats.pop('min')
    max = stats.pop('max')
    var = stats.pop('var')
    std = stats.pop('std')
    mean = stats.pop('mean')

    mean_monitor = Monitor('mean', mean, train=True, valid=True, out_service=FileService('outs/mean.txt'))
    var_monitor = Monitor('var', var, out_service=FileService('outs/var.txt'))

    w_channel = MonitorsChannel('W', monitors=mean_monitor)

    stat_channel = MonitorsChannel('stats', monitors=[var_monitor])

    monitors = [w_channel, stat_channel]

    train_collapsed_raw = collapse_channels(monitors, train=True)
    train_collapsed = OrderedDict([(item[0], item[1]) for item in train_collapsed_raw])
    train_services = OrderedDict([(item[0], item[2]) for item in train_collapsed_raw])
    valid_collapsed_raw = collapse_channels(monitors, valid=True)
    valid_collapsed = OrderedDict([(item[0], item[1]) for item in valid_collapsed_raw])
    valid_services = OrderedDict([(item[0], item[2]) for item in valid_collapsed_raw])

    log.debug('compiling...')
    f = theano.function(inputs=[], outputs=train_collapsed.values(), updates=updates)
    f2 = theano.function(inputs=[], outputs=valid_collapsed.values(), updates=updates)
    log.debug('done')

    t1=time.time()

    for epoch in range(10):
        t=time.time()
        log.debug(epoch)
        vals = f()
        m = OrderedDict(zip(train_collapsed.keys(), vals))
        for name, service in train_services.items():
            if name in m:
                service.write(m[name], TRAIN)
        log.debug('----- '+make_time_units_string(time.time()-t))

    for epoch in range(10):
        t = time.time()
        log.debug(epoch)
        vals = f2()
        m = OrderedDict(zip(valid_collapsed.keys(), vals))
        for name, service in valid_services.items():
            if name in m:
                service.write(m[name], VALID)
        log.debug('----- ' + make_time_units_string(time.time() - t))

    log.debug("TOTAL TIME "+make_time_units_string(time.time()-t1))
def main():
    w = theano.shared(T.zeros(shape=(88, 100), dtype=theano.config.floatX).eval(), name='W')
    updates = [(w, add_uniform(input=w, noise_level=.02))]

    stats = get_stats(w)
    l1 = stats.pop('l1')
    l2 = stats.pop('l2')
    min = stats.pop('min')
    max = stats.pop('max')
    var = stats.pop('var')
    std = stats.pop('std')
    mean = stats.pop('mean')

    mean_monitor = Monitor('mean', mean, train=True, valid=True)
    stat_monitor = Monitor('max', max)

    w_channel = MonitorsChannel('W', monitors=mean_monitor)

    stat_channel = MonitorsChannel('stats', monitors=[stat_monitor])

    monitors = [w_channel, stat_channel]

    train_collapsed = collapse_channels(monitors, train=True)
    train_collapsed = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
    valid_collapsed = collapse_channels(monitors, valid=True)
    valid_collapsed = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])

    plot = Plot(bokeh_doc_name='test_plots', monitor_channels=monitors, open_browser=True)

    log.debug('compiling...')
    f = theano.function(inputs=[], outputs=list(train_collapsed.values()), updates=updates)
    f2 = theano.function(inputs=[], outputs=list(valid_collapsed.values()), updates=updates)
    log.debug('done')

    t1=time.time()

    for epoch in range(100):
        t=time.time()
        log.debug(epoch)
        vals = f()
        m = OrderedDict(zip(train_collapsed.keys(), vals))
        plot.update_plots(epoch, m)
        time.sleep(0.02)
        log.debug('----- '+make_time_units_string(time.time()-t))

    for epoch in range(100):
        t = time.time()
        log.debug(epoch)
        vals = f2()
        m = OrderedDict(zip(valid_collapsed.keys(), vals))
        plot.update_plots(epoch, m)
        time.sleep(0.02)
        log.debug('----- ' + make_time_units_string(time.time() - t))

    log.debug("TOTAL TIME "+make_time_units_string(time.time()-t1))
Exemple #4
0
    def compile_run_fn(self):
        """
        This is a helper function to compile the f_run function for computing the model's outputs given inputs.
        Compile and set the f_run function used for `run()`.

        It sets the `self.f_run` attribute to the f_run function.

        .. note::
            The run function defaults like so::

                self.f_run = function(inputs  = raise_to_list(self.get_inputs()),
                                      outputs = raise_to_list(self.get_outputs()),
                                      updates = self.get_updates(),
                                      name    = 'f_run')
        """
        if not hasattr(self, 'f_run'):
            log.debug("Compiling f_run...")
            t = time.time()
            outputs = raise_to_list(self.get_outputs())
            if outputs is not None and len(outputs) == 1:
                outputs = outputs[0]
            self.f_run = function(inputs  = raise_to_list(self.get_inputs()),
                                  outputs = outputs,
                                  updates = self.get_updates(),
                                  name    = 'f_run')
            log.debug("Compilation done. Took %s", make_time_units_string(time.time() - t))
        else:
            log.warn('f_run already exists!')
Exemple #5
0
    def compile_run_fn(self):
        """
        This is a helper function to compile the f_run function for computing the model's outputs given inputs.
        Compile and set the f_run function used for `run()`.

        It sets the `self.f_run` attribute to the f_run function.

        .. note::
            The run function defaults like so::

                self.f_run = function(inputs  = raise_to_list(self.get_inputs()),
                                      outputs = self.get_outputs(),
                                      updates = self.get_updates(),
                                      name    = 'f_run')

        Returns
        -------
        Theano function
            The compiled theano function for running the model.
        """
        if not getattr(self, 'f_run', None):
            log.debug("Compiling f_run...")
            t = time.time()
            self.f_run = function(inputs  = raise_to_list(self.get_inputs()),
                                  outputs = self.get_outputs(),
                                  updates = self.get_updates(),
                                  name    = 'f_run')
            log.debug("Compilation done. Took %s", make_time_units_string(time.time() - t))
        else:
            log.debug('f_run already exists!')

        return self.f_run
Exemple #6
0
    def compile_vocab(self, iters):
        """
        Creates a dictionary mapping tokens (words or characters) to integers given the level and preprocessing.

        Parameters
        ----------
        iters : iterable
            The iterable to go through when creating the vocaublary dictionary.

        Returns
        -------
        vocab
            The dictionary mapping token: integer for all tokens in the `iters` iterable.
        """
        log.debug("Creating vocabulary...")
        t = time.time()
        vocab = {self.unk_token: 0}
        i = 1
        for token in iters:
            if token not in vocab:
                vocab[token] = i
                i += 1
        log.debug("Vocab took %s to create." %
                  make_time_units_string(time.time() - t))
        return vocab
def compute_CSL_with_minibatches_one_chain(fn, minibatches):
    """
    Computes the CSL over minibatches with a single chain (no parallel chains to average computation over).

    Parameters
    ----------
    fn : theano function
        The CSL function to use.
    minibatches : tensor
        The minibatches of data as a 3D tensor with shape (num_minibatches, batch_size, input_dimensionality).

    Returns
    -------
    float
        The mean LL value over minibatches.
    """
    LLs = []
    t = time.time()
    mean = None
    for i, minibatch in enumerate(minibatches):
        # loop through one minibatch
        LL = fn(minibatch)
        LLs.append(LL)
        mean = numpy.mean(LLs)
        log.info('%d  /  %d batches, LL mean so far %.4f' % (i + 1, minibatches.shape[0], mean))
    log.info('mean LL %s' % mean)
    log.info('--- took %s ---' % make_time_units_string(time.time() - t))
    return mean
Exemple #8
0
    def run(self, input):
        """
        This method will return the Prototype's output (run through the `f_run` function), given an input. The input
        comes from all unique inputs to the models in the Prototype as calculated from `get_inputs()` and the outputs
        computed similarly from `get_outputs`.

        Try to avoid re-compiling the theano function created for run - check a `hasattr(self, 'f_run')` or
        something similar first.

        Parameters
        ----------
        input: array_like
            Theano/numpy tensor-like object that is the input into the model's computation graph.

        Returns
        -------
        array_like
            Theano/numpy tensor-like object that is the output of the model's computation graph.
        """
        # make sure the input is raised to a list - we are going to splat it!
        input = raise_to_list(input)
        # first check if we already made an f_run function
        if hasattr(self, 'f_run'):
            return self.f_run(*input)
        # otherwise, compile it!
        else:
            inputs = self.get_inputs()
            outputs = self.get_outputs()
            updates = self.get_updates()
            t = time.time()
            log.info("Compiling f_run...")
            self.f_run = function(inputs=inputs, outputs=outputs, updates=updates, name="f_run")
            log.info("Compilation done! Took %s", make_time_units_string(time.time() - t))
            return self.f_run(*input)
Exemple #9
0
    def compile_run_fn(self):
        """
        This is a helper function to compile the f_run function for computing the model's outputs given inputs.
        Compile and set the f_run function used for `run()`.

        It sets the `self.f_run` attribute to the f_run function.

        .. note::
            The run function defaults like so::

                self.f_run = function(inputs  = raise_to_list(self.get_inputs()),
                                      outputs = raise_to_list(self.get_outputs()),
                                      updates = self.get_updates(),
                                      name    = 'f_run')
        """
        if not hasattr(self, 'f_run'):
            log.debug("Compiling f_run...")
            t = time.time()
            outputs = raise_to_list(self.get_outputs())
            if outputs is not None and len(outputs) == 1:
                outputs = outputs[0]
            self.f_run = function(inputs=raise_to_list(self.get_inputs()),
                                  outputs=outputs,
                                  updates=self.get_updates(),
                                  name='f_run')
            log.debug("Compilation done. Took %s",
                      make_time_units_string(time.time() - t))
        else:
            log.warn('f_run already exists!')
def compute_CSL_with_minibatches_one_chain(fn, minibatches):
    """
    Computes the CSL over minibatches with a single chain (no parallel chains to average computation over).

    Parameters
    ----------
    fn : theano function
        The CSL function to use.
    minibatches : tensor
        The minibatches of data as a 3D tensor with shape (num_minibatches, batch_size, input_dimensionality).

    Returns
    -------
    float
        The mean LL value over minibatches.
    """
    LLs = []
    t = time.time()
    mean = None
    for i, minibatch in enumerate(minibatches):
        # loop through one minibatch
        LL = fn(minibatch)
        LLs.append(LL)
        mean = numpy.mean(LLs)
        log.info('%d  /  %d batches, LL mean so far %.4f' %
                 (i + 1, minibatches.shape[0], mean))
    log.info('mean LL %s' % mean)
    log.info('--- took %s ---' % make_time_units_string(time.time() - t))
    return mean
Exemple #11
0
    def compile_run_fn(self):
        """
        This is a helper function to compile the f_run function for computing the model's outputs given inputs.
        Compile and set the f_run function used for `run()`.

        It sets the `self.f_run` attribute to the f_run function.

        .. note::
            The run function defaults like so::

                self.f_run = function(inputs  = raise_to_list(self.get_inputs()),
                                      outputs = self.get_outputs(),
                                      updates = self.get_updates(),
                                      name    = 'f_run')

        Returns
        -------
        Theano function
            The compiled theano function for running the model.
        """
        if not getattr(self, 'f_run', None):
            log.debug("Compiling f_run...")
            t = time.time()
            self.f_run = function(inputs=raise_to_list(self.get_inputs()),
                                  outputs=self.get_outputs(),
                                  updates=self.get_updates(),
                                  name='f_run')
            log.debug("Compilation done. Took %s",
                      make_time_units_string(time.time() - t))
        else:
            log.debug('f_run already exists!')

        return self.f_run
Exemple #12
0
 def __init__(self,
              dataset,
              subset=datasets.TRAIN,
              batch_size=1,
              minimum_batch_size=1,
              rng=None):
     _t = time.time()
     log.debug('Initializing a %s sequential iterator over %s',
               str(type(dataset)), datasets.get_subset_strings(subset))
     super(self.__class__, self).__init__(dataset, subset, batch_size,
                                          minimum_batch_size, rng)
     log.debug('iterator took %s to make' %
               make_time_units_string(time.time() - _t))
Exemple #13
0
    def run(self, input):
        """
        This method will return the Prototype's output (run through the `f_run` function), given an input. The input
        comes from all unique inputs to the models in the Prototype as calculated from `get_inputs()` and the outputs
        computed similarly from `get_outputs`.

        Try to avoid re-compiling the theano function created for run - check a `hasattr(self, 'f_run')` or
        something similar first.

        Parameters
        ----------
        input: array_like
            Theano/numpy tensor-like object that is the input into the model's computation graph.

        Returns
        -------
        array_like
            Theano/numpy tensor-like object that is the output of the model's computation graph.
        """
        # set the noise switches off for running! we assume unseen data is noisy anyway :)
        old_switch_vals = []
        if len(self.get_switches()) > 0:
            log.debug("Turning off %s noise switches, resetting them after run!", str(len(self.get_switches())))
            old_switch_vals = [switch.get_value() for switch in self.get_switches()]
            [switch.set_value(0.) for switch in self.get_switches()]

        # make sure the input is raised to a list - we are going to splat it!
        input = raise_to_list(input)
        # first check if we already made an f_run function
        if hasattr(self, 'f_run'):
            output = self.f_run(*input)
        # otherwise, compile it!
        else:
            inputs = raise_to_list(self.get_inputs())
            outputs = raise_to_list(self.get_outputs())
            if outputs is not None and len(outputs) == 1:
                outputs = outputs[0]
            updates = self.get_updates()
            t = time.time()
            log.info("Compiling f_run...")
            self.f_run = function(inputs=inputs, outputs=outputs, updates=updates, name="f_run")
            log.info("Compilation done! Took %s", make_time_units_string(time.time() - t))
            output = self.f_run(*input)

        # reset any switches to how they were!
        if len(self.get_switches()) > 0:
            [switch.set_value(val) for switch, val in zip(self.get_switches(), old_switch_vals)]

        return output
Exemple #14
0
    def train(self, continue_training=False):
        log.info(
            "-----------TRAINING %s FOR %s EPOCHS (continue_training=%s)-----------",
            str(type(self.model)), str(self.n_epoch), str(continue_training))
        log.debug("Train dataset size is: %s",
                  self.dataset.getDataShape(datasets.TRAIN))
        if self.dataset.hasSubset(datasets.VALID):
            log.debug("Valid dataset size is: %s",
                      self.dataset.getDataShape(datasets.VALID))
        if self.dataset.hasSubset(datasets.TEST):
            log.debug("Test dataset size is: %s",
                      self.dataset.getDataShape(datasets.TEST))

        self.STOP = False
        self.epoch_counter = 0
        if not continue_training:
            # reset the learning rate
            if hasattr(self, 'learning_rate_decay'):
                self.learning_rate_decay.reset()
            # reset the other model decaying functions
            for decay_param in self.model.get_decay_params():
                decay_param.reset()

        self.times = []
        self.best_cost = float('inf')
        self.best_params = None
        self.patience = 0

        start_time = time.time()

        while not self.STOP:
            try:
                self.STOP = self._perform_one_epoch()
            except KeyboardInterrupt:
                log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                self.STOP = True

        #save params
        if self.best_params is not None:
            log.debug("Restoring best model parameters...")
            set_shared_values(self.params, self.best_params)
        log.debug("Saving model parameters...")
        self.model.save_params('trained_epoch_' + str(self.epoch_counter) +
                               '.pkl')

        log.info("------------TOTAL %s TRAIN TIME TOOK %s---------",
                 str(type(self.model)),
                 make_time_units_string(time.time() - start_time))
Exemple #15
0
        def sample_some_numbers(n_samples):
            # The network's initial state
            init_vis = initial
            noisy_init_vis = self.f_noise(init_vis)

            network_state = [[noisy_init_vis] + [
                numpy.zeros(shape=(initial.shape[0], self.layer_sizes[i + 1]),
                            dtype=theano.config.floatX)
                for i in range(len(self.bias_list[1:]))
            ]]

            visible_chain = [init_vis]
            noisy_h0_chain = [noisy_init_vis]
            sampled_h = []

            times = []
            for i in xrange(n_samples - 1):
                _t = time.time()

                # feed the last state into the network, run new state, and obtain visible units expectation chain
                net_state_out, vis_pX_chain = sampling_wrapper(
                    network_state[-1])

                # append to the visible chain
                visible_chain += vis_pX_chain

                # append state output to the network state chain
                network_state.append(net_state_out)

                noisy_h0_chain.append(net_state_out[0])

                if i % k == 0:
                    sampled_h.append(T.stack(net_state_out[1:]))
                    if i == k:
                        log.debug("About " + make_time_units_string(
                            numpy.mean(times) * (n_samples - 1 - i)) +
                                  " remaining...")

                times.append(time.time() - _t)

            log.DEBUG("Sampling done.")
            return numpy.vstack(visible_chain), sampled_h
        def sample_some_numbers(n_samples):
            # The network's initial state
            init_vis       = initial
            noisy_init_vis = self.f_noise(init_vis)

            network_state  = [
                [noisy_init_vis] +
                [
                    numpy.zeros(shape=(initial.shape[0], self.layer_sizes[i+1]), dtype=theano.config.floatX)
                    for i in range(len(self.bias_list[1:]))
                ]
            ]

            visible_chain  = [init_vis]
            noisy_h0_chain = [noisy_init_vis]
            sampled_h = []

            times = []
            for i in xrange(n_samples-1):
                _t = time.time()

                # feed the last state into the network, run new state, and obtain visible units expectation chain
                net_state_out, vis_pX_chain = sampling_wrapper(network_state[-1])

                # append to the visible chain
                visible_chain += vis_pX_chain

                # append state output to the network state chain
                network_state.append(net_state_out)

                noisy_h0_chain.append(net_state_out[0])

                if i%k == 0:
                    sampled_h.append(T.stack(net_state_out[1:]))
                    if i == k:
                        log.debug("About "+make_time_units_string(numpy.mean(times)*(n_samples-1-i))+" remaining...")

                times.append(time.time() - _t)

            log.DEBUG("Sampling done.")
            return numpy.vstack(visible_chain), sampled_h
Exemple #17
0
    def run(self, input):
        """
        This method will return the model's output (run through the function), given an input. In the case that
        input_hooks or hidden_hooks are used, the function should use them appropriately and assume they are the input.

        Try to avoid re-compiling the theano function created for run - check a hasattr(self, 'f_run') or
        something similar first. I recommend creating your theano f_run in a create_computation_graph method
        to be called after the class initializes.
        ------------------

        :param input: Theano/numpy tensor-like object that is the input into the model's computation graph.
        :type input: tensor

        :return: Theano/numpy tensor-like object that is the output of the model's computation graph.
        :rtype: tensor
        """
        # set any noise switches to zero
        if len(self.get_noise_switch()) > 0:
            vals = [switch.get_value() for switch in self.get_noise_switch()]
            [switch.set_value(0.) for switch in self.get_noise_switch()]

        # check if the run function is already compiled, otherwise compile it!
        if not hasattr(self, 'f_run'):
            log.debug("Compiling f_run...")
            t = time.time()
            self.f_run = function(inputs  = raise_to_list(self.get_inputs()),
                                  outputs = self.get_outputs(),
                                  updates = self.get_updates())
            log.debug("Compilation done. Took %s", make_time_units_string(time.time() - t))

        # because we use the splat to account for multiple inputs to the function, make sure input is a list.
        input = raise_to_list(input)
        # return the results of the run function!
        output =  self.f_run(*input)

        # reset the noise switches
        if len(self.get_noise_switch()) > 0:
            [switch.set_value(val) for switch, val in zip(self.get_noise_switch(), vals)]

        return output
def compute_CSL_with_minibatches(fn, minibatches, chains):
    """
    Computes CSL over parallel chains of minibatches (means the input chains is a 4D tensor consisting of minibatches
    of shape (N, K, D)). When there are N chains, each chain having K samples of dimension D

    Parameters
    ----------
    fn : theano function
        The CSL function to use.
    minibatches : tensor
        The minibatches of data as a 3D tensor with shape (num_minibatches, batch_size, input_dimensionality).
    chains : tensor
        The chains of data as a 4D tensor with shape (n_minibatches, n_chains, batch_size, input_dimensionality).
    Returns
    -------
    float
        The mean LL value over minibatches.
    """
    # fn is the compiled theano fn
    LLs = []
    t = time.time()
    for i, minibatch in enumerate(minibatches):
        # loop through one minibatch
        LL_minibatch_all_chains = []
        for chain_minibatch in chains:
            # loop through a minibatch of chains
            LL = fn(minibatch, chain_minibatch)
            LL_minibatch_all_chains.append(LL)
        LL_minibatch_all_chains = numpy.concatenate(LL_minibatch_all_chains,
                                                    axis=1)
        # import ipdb; ipdb.set_trace()
        LLs.append(LL_minibatch_all_chains)
        mean = numpy.mean(LLs)
        log.info('%d  /  %d batches, LL mean so far %.4f' %
                 (i + 1, minibatches.shape[0], mean))
    LLs = numpy.concatenate(LLs, axis=0)
    mean_LLs = LLs.mean()
    log.info('mean LL %s' % str(mean_LLs))
    log.info('--- took %s ---' % make_time_units_string(time.time() - t))
    return mean_LLs
def compute_CSL_with_minibatches(fn, minibatches, chains):
    """
    Computes CSL over parallel chains of minibatches (means the input chains is a 4D tensor consisting of minibatches
    of shape (N, K, D)). When there are N chains, each chain having K samples of dimension D

    Parameters
    ----------
    fn : theano function
        The CSL function to use.
    minibatches : tensor
        The minibatches of data as a 3D tensor with shape (num_minibatches, batch_size, input_dimensionality).
    chains : tensor
        The chains of data as a 4D tensor with shape (n_minibatches, n_chains, batch_size, input_dimensionality).
    Returns
    -------
    float
        The mean LL value over minibatches.
    """
    # fn is the compiled theano fn
    LLs = []
    t = time.time()
    for i, minibatch in enumerate(minibatches):
        # loop through one minibatch
        LL_minibatch_all_chains = []
        for chain_minibatch in chains:
            # loop through a minibatch of chains
            LL = fn(minibatch, chain_minibatch)
            LL_minibatch_all_chains.append(LL)
        LL_minibatch_all_chains = numpy.concatenate(LL_minibatch_all_chains, axis=1)
        # import ipdb; ipdb.set_trace()
        LLs.append(LL_minibatch_all_chains)
        mean = numpy.mean(LLs)
        log.info('%d  /  %d batches, LL mean so far %.4f' % (i + 1, minibatches.shape[0], mean))
    LLs = numpy.concatenate(LLs, axis=0)
    mean_LLs = LLs.mean()
    log.info('mean LL %s' % str(mean_LLs))
    log.info('--- took %s ---' % make_time_units_string(time.time() - t))
    return mean_LLs
Exemple #20
0
    def run(self, input):
        """
        This method will return the Prototype's output (run through the `f_run` function), given an input. The input
        comes from all unique inputs to the models in the Prototype as calculated from `get_inputs()` and the outputs
        computed similarly from `get_outputs`.

        Try to avoid re-compiling the theano function created for run - check a `hasattr(self, 'f_run')` or
        something similar first.

        Parameters
        ----------
        input: array_like
            Theano/numpy tensor-like object that is the input into the model's computation graph.

        Returns
        -------
        array_like
            Theano/numpy tensor-like object that is the output of the model's computation graph.
        """
        # make sure the input is raised to a list - we are going to splat it!
        input = raise_to_list(input)
        # first check if we already made an f_run function
        if hasattr(self, 'f_run'):
            return self.f_run(*input)
        # otherwise, compile it!
        else:
            inputs = self.get_inputs()
            outputs = self.get_outputs()
            updates = self.get_updates()
            t = time.time()
            log.info("Compiling f_run...")
            self.f_run = function(inputs=inputs,
                                  outputs=outputs,
                                  updates=updates,
                                  name="f_run")
            log.info("Compilation done! Took %s",
                     make_time_units_string(time.time() - t))
            return self.f_run(*input)
Exemple #21
0
    def __init__(self,
                 dataset,
                 subset=datasets.TRAIN,
                 batch_size=1,
                 minimum_batch_size=1,
                 rng=None):
        # initialize a numpy rng if one is not provided
        if rng is None:
            random.seed(123)
            self.rng = random
        else:
            self.rng = rng

        _t = time.time()
        log.debug('Initializing a %s random iterator over %s',
                  str(type(dataset)), datasets.get_subset_strings(subset))
        super(self.__class__, self).__init__(dataset, subset, batch_size,
                                             minimum_batch_size)

        # randomize the indices to access
        self.indices = numpy.arange(self.data_len)
        self.rng.shuffle(self.indices)
        log.debug('iterator took %s to make' %
                  make_time_units_string(time.time() - _t))
Exemple #22
0
    def compile_vocab(self, iters):
        """
        Creates a dictionary mapping tokens (words or characters) to integers given the level and preprocessing.

        Parameters
        ----------
        iters : iterable
            The iterable to go through when creating the vocaublary dictionary.

        Returns
        -------
        vocab
            The dictionary mapping token: integer for all tokens in the `iters` iterable.
        """
        log.debug("Creating vocabulary...")
        t = time.time()
        vocab = {self.unk_token: 0}
        i = 1
        for token in iters:
            if token not in vocab:
                vocab[token] = i
                i += 1
        log.debug("Vocab took %s to create." % make_time_units_string(time.time() - t))
        return vocab
Exemple #23
0
    def train(self, continue_training=False):
        """
        This method performs the training!!!
        :param continue_training:
        :type continue_training:
        :return:
        :rtype:
        """
        # grab the model parameters to use during training
        self.params = self.model.get_params()
        log.info("%s params: %s", str(type(self.model)), str(self.params))

        ###############################################
        # theano index variable to use on the dataset #
        ###############################################
        # index to a [mini]batch - both start and end
        data_idx = T.iscalar('data_index')
        data_end_idx = T.iscalar('data_end_index')
        batch_slice = slice(data_idx, data_end_idx)

        # compute number of minibatches for training, validation and testing
        # shapes is list of list - input list of datasets to optimizer (for multiple inputs), and each dataset
        # could be a list of shared variables (like multiple sequences from files)
        train_data_shapes = raise_to_list(self.dataset.getDataShape(TRAIN))
        valid_data_shapes = raise_to_list(self.dataset.getDataShape(VALID))
        test_data_shapes  = raise_to_list(self.dataset.getDataShape(TEST))

        # train_batches is going to be lists of tuples that contain the start and end indices for train data
        train_data_lens = [shape[0] for shape in train_data_shapes]
        self.train_batches = self.get_batch_indices(train_data_lens)

        if valid_data_shapes is not None:
            valid_data_lens = [shape[0] for shape in valid_data_shapes]
            self.valid_batches = self.get_batch_indices(valid_data_lens)
        else:
            self.valid_batches = None
        if test_data_shapes is not None:
            test_data_lens = [shape[0] for shape in test_data_shapes]
            self.test_batches = self.get_batch_indices(test_data_lens)
        else:
            self.test_batches = None

        # translate the data_idx into the givens for the model
        model_inputs = raise_to_list(self.model.get_inputs())
        model_targets = raise_to_list(self.model.get_targets())

        train_data, train_labels = self.dataset.getSubset(TRAIN)
        train_givens = OrderedDict(zip(model_inputs, [train_data[batch_slice]]))
        if model_targets is not None and len(model_targets) > 0:
            train_givens.update(OrderedDict(zip(model_targets, [train_labels[batch_slice]])))

        valid_data, valid_labels = self.dataset.getSubset(VALID)
        valid_givens = OrderedDict(zip(model_inputs, [valid_data[batch_slice]]))
        if model_targets is not None and len(model_targets) > 0:
            valid_givens.update(OrderedDict(zip(model_targets, [valid_labels[batch_slice]])))

        test_data, test_labels = self.dataset.getSubset(TEST)
        test_givens = OrderedDict(zip(model_inputs, [test_data[batch_slice]]))
        if model_targets is not None and len(model_targets) > 0:
            test_givens.update(OrderedDict(zip(model_targets, [test_labels[batch_slice]])))

        # Now time to create the training cost functions for the model - make sure to handle the possible
        # list of costs used for pretraining of certain parts of the model.
        train_costs = raise_to_list(self.model.get_train_cost())
        self.train_functions = []
        for i, train_cost in enumerate(train_costs):
            # Now create the training cost function for the model to use while training - update parameters
            # gradient!
            gradients, _ = self.model.get_gradient(cost=train_cost)

            # Calculate the optimizer updates each run
            # This is where the magic happens for a lot of sub-implementations of SGD, including AdaDelta!
            # It tells how to update the params each training epoch
            gradient_updates = self.get_updates(gradients)

            # Combine the updates from the model also if applicable
            train_updates = self.model.get_updates()
            if train_updates:
                train_updates.update(gradient_updates)
            else:
                train_updates = gradient_updates

            # Compile the training function!
            log.info('Compiling f_learn %d/%d function for model %s...', i + 1, len(train_costs),
                     str(type(self.model)))
            t = time.time()

            f_learn = function(inputs=[data_idx, data_end_idx],
                               updates=train_updates,
                               outputs=train_cost,
                               givens=train_givens,
                               name='f_learn_%d' % i)

            log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))
            self.train_functions.append(f_learn)

        # grab the expression(s) to use to monitor different model values during training
        log.debug("Compiling monitor functions...")
        monitor_t = time.time()
        self.monitors = OrderedDict(self.model.get_monitors())
        self.monitor_names = self.monitors.keys()
        if len(self.monitors.keys()) > 0:
            self.train_monitor_function = function(
                inputs=[data_idx, data_end_idx],
                updates=self.model.get_updates(),
                outputs=self.monitors.values(),
                givens=train_givens,
                name="train_monitor_function"
            )
        if len(self.monitors.keys()) > 0:
            self.valid_monitor_function = function(
                inputs=[data_idx, data_end_idx],
                updates=self.model.get_updates(),
                outputs=self.monitors.values(),
                givens=valid_givens,
                name="valid_monitor_function"
            )
        if len(self.monitors.keys()) > 0:
            self.test_monitor_function = function(
                inputs=[data_idx, data_end_idx],
                updates=self.model.get_updates(),
                outputs=self.monitors.values(),
                givens=test_givens,
                name="test_monitor_function"
            )
        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

        self.noise_switches = raise_to_list(self.model.get_noise_switch())

        ##################
        # start training #
        ##################
        # make sure to deal with a list of train_cost functions - for layer-wise pretraining!
        # this list of training functions was created during __init__()
        start_time = time.time()
        for func_i, train_function in enumerate(self.train_functions):
            log.info("-----------TRAINING %s function %d/%d FOR %d EPOCHS (continue_training=%s)-----------",
                     str(type(self.model)), func_i + 1, len(self.train_functions), self.n_epoch, str(continue_training))

            log.debug("Train dataset size is: %s", self.dataset.getDataShape(TRAIN))
            if self.dataset.hasSubset(VALID):
                log.debug("Valid dataset size is: %s", self.dataset.getDataShape(VALID))
            if self.dataset.hasSubset(TEST):
                log.debug("Test dataset size is: %s", self.dataset.getDataShape(TEST))

            self.STOP = False
            self.epoch_counter = 0
            if not continue_training:
                # reset the learning rate
                if hasattr(self, 'learning_rate_decay') and self.learning_rate_decay:
                    self.learning_rate_decay.reset()
                # reset the other model decaying functions
                for decay_param in self.model.get_decay_params():
                    decay_param.reset()

            self.times = []
            self.best_cost = numpy.inf
            self.best_params = None
            self.patience = 0

            t = time.time()

            while not self.STOP:
                try:
                    self.STOP = self._perform_one_epoch(train_function)
                except KeyboardInterrupt:
                    log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                    self.STOP = True

            # save params
            if self.best_params is not None:
                log.debug("Restoring best model parameters...")
                set_shared_values(self.params, self.best_params)
            log.debug("Saving model parameters...")
            self.model.save_params('trained_epoch_' + str(self.epoch_counter) + '.pkl')

            log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))

        log.info("------------TOTAL %s TRAIN TIME TOOK %s---------",
                 str(type(self.model)), make_time_units_string(time.time() - start_time))
Exemple #24
0
    def _perform_one_epoch(self, f_learn, plot=None):
        """
        Performs a single training iteration with the given learn function.
        """
        self.epoch_counter += 1
        t = time.time()
        log.info('EPOCH %s', str(self.epoch_counter))

        # set the noise switches on for training function! (this is where things like dropout happen)
        if not self.model.switches_on:
            self.model.turn_on_switches()

        #########
        # train #
        #########
        train_costs = []
        train_monitors = {key: [] for key in self.train_monitors_dict.keys()}
        train_data = [
            minibatch(input_data, self.batch_size, self.min_batch_size)
            for input_data in raise_to_list(self.dataset.train_inputs)
            ]
        if self.dataset.train_targets is not None and not self.unsupervised:
            train_data += [
                minibatch(target, self.batch_size, self.min_batch_size)
                for target in raise_to_list(self.dataset.train_targets)
                ]

        for batch in min_normalized_izip(*train_data):
            _outs = raise_to_list(f_learn(*batch))
            train_costs.append(_outs[0])
            # handle any user defined monitors
            if len(train_monitors) > 0:
                current_monitors = zip(self.train_monitors_dict.keys(), _outs[1:])
                for name, val in current_monitors:
                    val = numpy.asarray(val)
                    train_monitors[name].append(val)

        # get the mean values for the batches
        mean_train = numpy.mean(train_costs, 0)
        current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in train_monitors.items()}
        # log the mean values!
        log.info('Train cost: %s', trunc(mean_train))
        if len(current_mean_monitors) > 0:
            log.info('Train monitors: %s', str(current_mean_monitors))
        # send the values to their outservices
        if self.train_outservice:
            self.train_outservice.write(mean_train, "train")
        for name, service in self.train_monitors_outservice_dict.items():
            if name in current_mean_monitors and service:
                service.write(current_mean_monitors[name], "train")
        # if there is a plot, also send them over!
        if plot:
            current_mean_monitors.update({TRAIN_COST_KEY: mean_train})
            plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)

        # set the noise switches off for valid and test sets! we assume unseen data is noisy anyway :)
        if self.model.switches_on:
            self.model.turn_off_switches()

        #########
        # valid #
        #########
        self._compute_over_subset("valid", self.dataset.valid_inputs, self.dataset.valid_targets,
                                  self.valid_monitors_dict, self.valid_monitor_function,
                                  self.valid_monitors_outservice_dict, plot)

        ########
        # test #
        ########
        self._compute_over_subset("test", self.dataset.test_inputs, self.dataset.test_targets,
                                  self.test_monitors_dict, self.test_monitor_function,
                                  self.test_monitors_outservice_dict, plot)

        ###########
        # cleanup #
        ###########
        # check for early stopping on train costs
        cost = numpy.sum(train_costs)
        # if the cost improved, reset the patience and record the best cost.
        if cost < self.best_cost * self.early_stop_threshold:
            self.patience = 0
            self.best_cost = cost
            # save the parameters that made it the best
            self.best_params = {key: param.get_value(borrow=False) for key, param in self.params.items()}
        elif not numpy.isnan(cost):
            self.patience += 1

        # check for stopping either from n_epochs or from threshold/patience
        stop = False
        if self.epoch_counter >= self.n_epoch:
            log.info("Stopping (reached max number of epochs)...")
            stop = True
        if self.patience >= self.early_stop_length:
            log.info("Stopping early (reached stop threshold)...")
            stop = True

        timing = time.time() - t
        self.times.append(timing)

        log.info('time: ' + make_time_units_string(timing))

        log.debug('remaining time: ' +
                 make_time_units_string((self.n_epoch - self.epoch_counter) * numpy.mean(self.times)))

        if (self.epoch_counter % self.save_frequency) == 0:
            #save params
            self.model.save_params('trained_epoch_' + str(self.epoch_counter))

        # ANNEAL!
        if not stop:
            # perform the appropriate decay on the decay functions/parameters for this optimizer and model
            for decay_param in self.get_decay_params():
                decay_param.decay()

        # return whether or not to stop this epoch
        return stop
Exemple #25
0
    def train(self, monitor_channels=None, train_outservice=None, plot=None):
        """
        This method performs the training!!!
        It is an online training method that goes over minibatches from the dataset for a number of epochs,
        updating parameters after each minibatch.

        You can disrupt training with a KeyBoardInterrupt and it should exit/save parameters gracefully.

        Parameters
        ----------
        monitor_channels : list(MonitorsChannel or Monitor), optional
            The list of channels or monitors containing monitor expressions/variables to compile and evaluate
            on the data.
        train_outservice : OutService, optional
            The OutService to use for the automatically created train_cost monitor. Default of None just outputs
            to logs.
        plot : Plot, optional
            The Plot object to use if we want to graph the outputs (uses bokeh server).
        """
        if not self.model:
            log.error("No self.model for the Optimizer!")
            raise AssertionError("Needs to be initialized with a Model! (Or something went wrong if train() "
                                 "was called from the Model. Try initializing the Optimizer with the model param "
                                 "and calling optimizer.train().")

        #########################
        # gradients and updates #
        #########################
        # grab the model parameters to use during training
        self.params = self.model.get_params()
        # Now create the training cost function for the model to use while training - update parameters
        # gradient!
        gradients = grad(cost=self.loss_expression, wrt=list(self.params.values()))
        # now create the dictionary mapping the parameter with its gradient
        gradients = OrderedDict(
            [(param, g) for param, g in zip(list(self.params.values()), gradients)]
        )
        # clip gradients if we want.
        gradients = clip_gradients(gradients, self.grad_clip, self.hard_clip)

        # Calculate the optimizer updates each run
        # This is where the magic happens for a lot of sub-implementations of SGD!
        # It tells how to update the params each training epoch
        gradient_updates = self.get_updates(gradients)

        # Combine the updates from the model also if applicable
        updates = self.model.get_updates()
        if updates:
            updates.update(gradient_updates)
        else:
            updates = gradient_updates

        log.info("%s params: %s", self.model._classname, str(list(self.params.keys())))

        ############
        # monitors #
        ############
        # deal with the monitor channels if they were given (or take them from the plot)
        if monitor_channels is None and plot is not None and len(plot.channels) > 0:
            monitor_channels = plot.channels
        self.train_monitors_dict = {}
        self.valid_monitors_dict = {}
        self.test_monitors_dict = {}
        self.train_monitors_outservice_dict = {}
        self.valid_monitors_outservice_dict = {}
        self.test_monitors_outservice_dict = {}
        if monitor_channels:
            # collapse the appropriate monitors into their (name, expression, out_service) tuples
            train_collapsed = collapse_channels(monitor_channels, train=True)
            valid_collapsed = collapse_channels(monitor_channels, valid=True)
            test_collapsed  = collapse_channels(monitor_channels, test=True)
            # get name: expression dictionary
            self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
            self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
            self.test_monitors_dict  = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
            # get name: outservice dictionary
            self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
            self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
            self.test_monitors_outservice_dict  = OrderedDict([(name, out) for name, _, out in test_collapsed])
        # finally deal with an outservice provided to monitor training cost
        self.train_outservice = train_outservice
        # remove redundant files made by the fileservice for the train monitor.
        # TODO: THIS FEELS LIKE A HACK. I don't like it.
        if isinstance(self.train_outservice, FileService):
            os.remove(self.train_outservice.valid_filename)
            os.remove(self.train_outservice.test_filename)

        #######################################
        # compile train and monitor functions #
        #######################################
        function_input = raise_to_list(self.model.get_inputs())
        if self.loss_targets is not None:
            function_input += self.loss_targets
        # Compile the training function!
        log.info('Compiling f_learn function for model %s...', self.model._classname)
        t = time.time()

        f_learn = function(inputs=function_input,
                           updates=updates,
                           outputs=[self.loss_expression] + list(self.train_monitors_dict.values()),
                           name='f_learn')

        log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))

        # figure out if we want valid and test (monitors)
        self.valid_flag = (self.dataset.valid_inputs is not None) and (len(self.valid_monitors_dict) > 0)
        self.test_flag = (self.dataset.test_inputs is not None) and (len(self.test_monitors_dict) > 0)
        # Now compile the monitor functions!
        log.debug("Compiling monitor functions...")
        monitor_t = time.time()
        # valid monitors
        if self.valid_flag:
            self.valid_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.valid_monitors_dict.values()),
                name='valid_monitor_function'
            )
        else:
            self.valid_monitor_function = None

        # test monitors
        if self.test_flag:
            self.test_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.test_monitors_dict.values()),
                name='test_monitor_function'
            )
        else:
            self.test_monitor_function = None

        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

        ##################
        # start training #
        ##################
        log.info("-----------TRAINING %s FOR %d EPOCHS-----------",
                 self.model._classname, self.n_epoch)

        self.STOP = False
        self.epoch_counter = 0
        # reset any decay params
        for decay_param in self.get_decay_params():
            decay_param.reset()

        self.times = []
        self.best_cost = numpy.inf
        self.best_params = None
        self.patience = 0

        t = time.time()

        while not self.STOP:
            try:
                self.STOP = self._perform_one_epoch(f_learn, plot)
            except KeyboardInterrupt:
                log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                self.STOP = True

        # save params
        if self.best_params is not None:
            log.debug("Restoring best model parameters...")
            for best_param, param_value in self.best_params.items():
                self.params[best_param].set_value(param_value, borrow=False)
        log.debug("Saving model parameters...")
        self.model.save_params('trained_epoch_' + str(self.epoch_counter))

        log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))
Exemple #26
0
    def _perform_one_epoch(self, f_learn, plot=None):
        """
        Performs a single training iteration with the given learn function.
        """
        self.epoch_counter += 1
        t = time.time()
        log.info('EPOCH %s', str(self.epoch_counter))

        # set the noise switches on for training function! (this is where things like dropout happen)
        switch_vals = []
        if len(self.noise_switches) > 0 and (self.valid_flag or self.test_flag or self.epoch_counter == 1):
            log.debug("Turning on %s noise switches", str(len(self.noise_switches)))
            switch_vals = [switch.get_value() for switch in self.noise_switches]
            [switch.set_value(1.) for switch in self.noise_switches]

        # train
        train_costs = []
        train_monitors = {key: [] for key in self.train_monitors_dict.keys()}
        for batch_start, batch_end in self.train_batches:
            _outs = raise_to_list(f_learn(batch_start, batch_end))
            train_costs.append(_outs[0])
            # handle any user defined monitors
            if len(train_monitors) > 0:
                current_monitors = zip(self.train_monitors_dict.keys(), _outs[1:])
                for name, val in current_monitors:
                    train_monitors[name].append(val)

        # get the mean values for the batches
        mean_train = numpy.mean(train_costs, 0)
        current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in train_monitors.items()}
        # log the mean values!
        log.info('Train cost: %s', trunc(mean_train))
        if len(current_mean_monitors) > 0:
            log.info('Train monitors: %s', str(current_mean_monitors))
        # send the values to their outservices
        if self.train_outservice:
            self.train_outservice.write(mean_train, TRAIN)
        for name, service in self.train_monitors_outservice_dict.items():
            if name in current_mean_monitors and service:
                service.write(current_mean_monitors[name], TRAIN)
        # if there is a plot, also send them over!
        if plot:
            current_mean_monitors.update({TRAIN_COST_KEY: mean_train})
            plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)

        # set the noise switches off for valid and test sets! we assume unseen data is noisy anyway :)
        if len(self.noise_switches) > 0 and (self.valid_flag or self.test_flag):
            log.debug("Turning off %s noise switches", str(len(self.noise_switches)))
            [switch.set_value(0.) for switch in self.noise_switches]

        # valid
        if self.valid_flag:
            valid_monitors = {key: [] for key in self.valid_monitors_dict.keys()}
            for batch_start, batch_end in self.valid_batches:
                _outs = raise_to_list(self.valid_monitor_function(batch_start, batch_end))
                current_monitors = zip(self.valid_monitors_dict.keys(), _outs)
                for name, val in current_monitors:
                    valid_monitors[name].append(val)

            # get the mean values for the batches
            current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in valid_monitors.items()}
            # log the mean values!
            log.info('Valid monitors: %s', str(current_mean_monitors))
            # send the values to their outservices
            for name, service in self.valid_monitors_outservice_dict.items():
                if name in current_mean_monitors and service:
                    service.write(current_mean_monitors[name], VALID)
            # if there is a plot, also send them over!
            if plot:
                plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)

        #test
        if self.test_flag:
            test_monitors = {key: [] for key in self.test_monitors_dict.keys()}
            for batch_start, batch_end in self.test_batches:
                _outs = raise_to_list(self.test_monitor_function(batch_start, batch_end))
                current_monitors = zip(self.test_monitors_dict.keys(), _outs)
                for name, val in current_monitors:
                    test_monitors[name].append(val)

            # get the mean values for the batches
            current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in test_monitors.items()}
            # log the mean values!
            log.info('Test monitors: %s', str(current_mean_monitors))
            # send the values to their outservices
            for name, service in self.test_monitors_outservice_dict.items():
                if name in current_mean_monitors and service:
                    service.write(current_mean_monitors[name], TEST)
            # if there is a plot, also send them over!
            if plot:
                plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)

        # check for early stopping on train costs
        cost = numpy.sum(train_costs)
        if cost < self.best_cost * self.early_stop_threshold:
            self.patience = 0
            self.best_cost = cost
            # save the parameters that made it the best
            self.best_params = get_shared_values(self.params)
        else:
            self.patience += 1

        # check for stopping either from n_epochs or from threshold/patience
        stop = False
        if self.epoch_counter >= self.n_epoch:
            log.info("Stopping (reached max number of epochs)...")
            stop = True
        if self.patience >= self.early_stop_length:
            log.info("Stopping early (reached stop threshold)...")
            stop = True

        timing = time.time() - t
        self.times.append(timing)

        log.info('time: ' + make_time_units_string(timing))

        log.debug('remaining time: ' +
                 make_time_units_string((self.n_epoch - self.epoch_counter) * numpy.mean(self.times)))

        if (self.epoch_counter % self.save_frequency) == 0:
            #save params
            self.model.save_params('trained_epoch_' + str(self.epoch_counter) + '.pkl')

        # ANNEAL!
        if not stop:
            # perform the appropriate decay on the decay functions/parameters for this optimizer and model
            for decay_param in self.get_decay_params():
                decay_param.decay()

        # reset the switches
        if len(self.noise_switches) > 0:
            [switch.set_value(val) for switch, val in zip(self.noise_switches, switch_vals)]

        # return whether or not to stop this epoch
        return stop
Exemple #27
0
    def train(self, monitor_channels=None, train_outservice=None, plot=None, additional_cost=None):
        """
        This method performs the training!!!
        It is an online training method that goes over minibatches from the dataset for a number of epochs,
        updating parameters after each minibatch.

        You can disrupt training with a KeyBoardInterrupt and it should exit/save parameters gracefully.

        Parameters
        ----------
        monitor_channels : list(MonitorsChannel or Monitor), optional
            The list of channels or monitors containing monitor expressions/variables to compile and evaluate
            on the data.
        train_outservice : OutService, optional
            The OutService to use for the automatically created train_cost monitor. Default of None just outputs
            to logs.
        plot : Plot, optional
            The Plot object to use if we want to graph the outputs (uses bokeh server).
        additional_cost : theano expression or list(theano expression), optional
            Any additional cost expressions to use during training (things like regularization). These will be summed
            with the existing cost.
        """
        if not self.model:
            log.error("No self.model for the Optimizer!")
            raise AssertionError("Needs to be initialized with a Model! (Or something went wrong if train() "
                                 "was called from the Model. Try initializing the Optimizer with the model param "
                                 "and calling optimizer.train().")

        #####################################################
        # handle additional costs (normally regularization) #
        #####################################################
        # Create the gradient updates for the model - make sure to handle the possible
        # list of costs used for pretraining of certain parts of the model.
        train_costs = raise_to_list(self.model.get_train_cost())
        # deal with any other additional costs (like regularization, etc.)
        if additional_cost is not None:
            additional_costs = raise_to_list(additional_cost)
            if len(additional_costs) > 1:
                additional_cost = T.sum(additional_costs)

        #########################
        # gradients and updates #
        #########################
        train_updates = []
        self.gradients = []
        for i, train_cost in enumerate(train_costs):
            # Now create the training cost function for the model to use while training - update parameters
            # gradient!
            if len(train_costs) > 1 and additional_cost is not None:
                log.warning("additional_cost will double count with gradients during layer-wise pretraining!")
                warnings.warn("additional_cost will double count with gradients during layer-wise pretraining!")
            # TODO: additional_cost will double count with gradients during layer-wise pretraining.
            # Need to somehow make w.r.t. params appropriate for the individual training costs.
            gradients, _ = self.model.get_gradient(cost=train_cost, additional_cost=additional_cost)
            # clip gradients if we want.
            gradients = clip_gradients(gradients, self.grad_clip, self.hard_clip)
            # append to list
            self.gradients.append(gradients)

            # Calculate the optimizer updates each run
            # This is where the magic happens for a lot of sub-implementations of SGD!
            # It tells how to update the params each training epoch
            gradient_updates = self.get_updates(gradients)

            # Combine the updates from the model also if applicable
            updates = self.model.get_updates()
            if updates:
                updates.update(gradient_updates)
            else:
                updates = gradient_updates
            train_updates.append(updates)

        # grab the model parameters to use during training
        self.params = self.model.get_params()
        log.info("%s params: %s", str(type(self.model)), str(self.params))

        ############
        # monitors #
        ############
        # deal with the monitor channels if they were given (or take them from the plot)
        if monitor_channels is None and plot is not None and len(plot.channels) > 0:
            monitor_channels = plot.channels
        self.train_monitors_dict = {}
        self.valid_monitors_dict = {}
        self.test_monitors_dict = {}
        self.train_monitors_outservice_dict = {}
        self.valid_monitors_outservice_dict = {}
        self.test_monitors_outservice_dict = {}
        if monitor_channels:
            # collapse the appropriate monitors into their (name, expression, out_service) tuples
            train_collapsed = collapse_channels(monitor_channels, train=True)
            valid_collapsed = collapse_channels(monitor_channels, valid=True)
            test_collapsed  = collapse_channels(monitor_channels, test=True)
            # get name: expression dictionary
            self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
            self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
            self.test_monitors_dict  = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
            # get name: outservice dictionary
            self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
            self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
            self.test_monitors_outservice_dict  = OrderedDict([(name, out) for name, _, out in test_collapsed])
        # finally deal with an outservice provided to monitor training cost
        self.train_outservice = train_outservice
        # remove redundant files made by the fileservice for the train monitor.
        # TODO: THIS FEELS LIKE A HACK. I don't like it.
        if isinstance(self.train_outservice, FileService):
            os.remove(self.train_outservice.valid_filename)
            os.remove(self.train_outservice.test_filename)

        #######################################
        # compile train and monitor functions #
        #######################################
        function_input = raise_to_list(self.model.get_inputs()) + raise_to_list(self.model.get_targets())
        train_functions = []
        for i, (updates, train_cost) in enumerate(zip(train_updates, train_costs)):
            # Compile the training function!
            log.info('Compiling f_learn %d/%d function for model %s...', i + 1, len(train_updates),
                     str(type(self.model)))
            t = time.time()

            f_learn = function(inputs=function_input,
                               updates=updates,
                               outputs=[train_cost] + list(self.train_monitors_dict.values()),
                               name='f_learn_%d' % i)

            log.info('f_learn %d compilation took %s', i + 1, make_time_units_string(time.time() - t))
            train_functions.append(f_learn)

        # figure out if we want valid and test (monitors)
        self.valid_flag = (self.dataset.valid_inputs is not None) and (len(self.valid_monitors_dict) > 0)
        self.test_flag = (self.dataset.test_inputs is not None) and (len(self.test_monitors_dict) > 0)
        # Now compile the monitor functions!
        log.debug("Compiling monitor functions...")
        monitor_t = time.time()
        # valid monitors
        if self.valid_flag:
            self.valid_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.valid_monitors_dict.values()),
                name='valid_monitor_function'
            )
        else:
            self.valid_monitor_function = None

        # test monitors
        if self.test_flag:
            self.test_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.test_monitors_dict.values()),
                name='test_monitor_function'
            )
        else:
            self.test_monitor_function = None

        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

        ##################
        # start training #
        ##################
        # make sure to deal with a list of train_cost functions - for layer-wise pretraining!
        # this list of training functions was created during __init__()
        start_time = time.time()
        for func_i, train_function in enumerate(train_functions):
            log.info("-----------TRAINING %s function %d/%d FOR %d EPOCHS-----------",
                     str(type(self.model)), func_i + 1, len(train_functions), self.n_epoch)

            self.STOP = False
            self.epoch_counter = 0
            # reset any decay params
            for decay_param in self.get_decay_params():
                decay_param.reset()

            self.times = []
            self.best_cost = numpy.inf
            self.best_params = None
            self.patience = 0

            t = time.time()

            while not self.STOP:
                try:
                    self.STOP = self._perform_one_epoch(train_function, plot)
                except KeyboardInterrupt:
                    log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                    self.STOP = True

            # save params
            if self.best_params is not None:
                log.debug("Restoring best model parameters...")
                set_shared_values(self.params, self.best_params)
            log.debug("Saving model parameters...")
            self.model.save_params('trained_epoch_' + str(self.epoch_counter) + '.pkl')

            log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))

        log.info("------------TOTAL %s TRAIN TIME TOOK %s---------",
                 str(type(self.model)), make_time_units_string(time.time() - start_time))
Exemple #28
0
    def _perform_one_epoch(self):
        self.epoch_counter += 1
        t = time.time()
        log.info('EPOCH %s', str(self.epoch_counter))

        #train
        train_costs = []
        train_monitors = {key: [] for key in self.monitors.keys()}
        for x, y in self.iterator(self.dataset, datasets.TRAIN,
                                  self.batch_size, self.minimum_batch_size,
                                  self.rng):
            if self.unsupervised:
                train_costs.append(self.f_learn(x))
                for key in self.monitors.keys():
                    monitor_function = self.monitors[key]
                    train_monitors[key].append(monitor_function(x))
            else:
                train_costs.append(self.f_learn(x, y))
                for key in self.monitors.keys():
                    monitor_function = self.monitors[key]
                    train_monitors[key].append(monitor_function(x, y))
        log.info('Train cost: %s', trunc(numpy.mean(train_costs, 0)))
        if len(self.monitors.keys()) > 0:
            log.info(
                'Train monitors: %s',
                str({
                    key: numpy.mean(value, 0)
                    for key, value in train_monitors.items()
                }))

        #valid
        if self.dataset.hasSubset(
                datasets.VALID) and len(self.monitors.keys()) > 0:
            valid_monitors = {key: [] for key in self.monitors.keys()}
            for x, y in self.iterator(self.dataset, datasets.VALID,
                                      self.batch_size, self.minimum_batch_size,
                                      self.rng):
                if self.unsupervised:
                    for key in self.monitors.keys():
                        monitor_function = self.monitors[key]
                        valid_monitors[key].append(monitor_function(x))
                else:
                    for key in self.monitors.keys():
                        monitor_function = self.monitors[key]
                        valid_monitors[key].append(monitor_function(x, y))
            log.info(
                'Valid monitors: %s',
                str({
                    key: numpy.mean(value, 0)
                    for key, value in valid_monitors.items()
                }))

        #test
        if self.dataset.hasSubset(
                datasets.TEST) and len(self.monitors.keys()) > 0:
            test_monitors = {key: [] for key in self.monitors.keys()}
            for x, y in self.iterator(self.dataset, datasets.TEST,
                                      self.batch_size, self.minimum_batch_size,
                                      self.rng):
                if self.unsupervised:
                    for key in self.monitors.keys():
                        monitor_function = self.monitors[key]
                        test_monitors[key].append(monitor_function(x))
                else:
                    for key in self.monitors.keys():
                        monitor_function = self.monitors[key]
                        test_monitors[key].append(monitor_function(x, y))
            log.info(
                'Test monitors: %s',
                str({
                    key: numpy.mean(value, 0)
                    for key, value in test_monitors.items()
                }))

        # check for early stopping on train costs
        cost = numpy.sum(train_costs)
        if cost < self.best_cost * self.early_stop_threshold:
            self.patience = 0
            self.best_cost = cost
            # save the parameters that made it the best
            self.best_params = get_shared_values(self.params)
        else:
            self.patience += 1

        if self.epoch_counter >= self.n_epoch or self.patience >= self.early_stop_length:
            log.info("Stopping early...")
            self.STOP = True

        timing = time.time() - t
        self.times.append(timing)

        log.info('time: ' + make_time_units_string(timing))

        log.info('remaining time: ' +
                 make_time_units_string((self.n_epoch - self.epoch_counter) *
                                        numpy.mean(self.times)))

        if (self.epoch_counter % self.save_frequency) == 0:
            #save params
            self.model.save_params('trained_epoch_' + str(self.epoch_counter) +
                                   '.pkl')

        # ANNEAL!
        if hasattr(self, 'learning_rate_decay'):
            self.learning_rate_decay.decay()
        if hasattr(self, 'momentum_decay'):
            self.momentum_decay.decay()
        for decay_param in self.model.get_decay_params():
            decay_param.decay()
Exemple #29
0
    def train(self, monitor_channels=None, train_outservice=None, plot=None, continue_training=False):
        """
        This method performs the training!!!
        It is an online training method that goes over minibatches from the dataset for a number of epochs,
        updating parameters after each minibatch.

        You can disrupt training with a KeyBoardInterrupt and it should exit/save parameters gracefully.

        Parameters
        ----------
        monitor_channels : list(MonitorsChannel or Monitor), optional
            The list of channels or monitors containing monitor expressions/variables to compile and evaluate
            on the data.
        train_outservice : OutService, optional
            The OutService to use for the automatically created train_cost monitor. Default of None just outputs
            to logs.
        plot : Plot, optional
            The Plot object to use if we want to graph the outputs (uses bokeh server).
        continue_training : bool
            Whether to continue training from a previous point.
        """
        ###############################################
        # theano index variable to use on the dataset #
        ###############################################
        # index to a [mini]batch - both start and end
        data_idx = T.iscalar('data_index')
        data_end_idx = T.iscalar('data_end_index')
        function_input = [data_idx, data_end_idx]
        batch_slice = slice(data_idx, data_end_idx)

        # compute number of minibatches for training, validation and testing
        # shapes is list of list - input list of datasets to optimizer (for multiple inputs), and each dataset
        # could be a list of shared variables (like multiple sequences from files)
        train_data_shapes = raise_to_list(self.dataset.getDataShape(TRAIN))
        valid_data_shapes = raise_to_list(self.dataset.getDataShape(VALID))
        test_data_shapes = raise_to_list(self.dataset.getDataShape(TEST))

        # train_batches is going to be lists of tuples that contain the start and end indices for train data.
        # this is more useful in the case of datasets that are lists of sequences, so that the start and end
        # indices can make sure a batch does not cross the sequence boundary on the concatenated data
        train_data_lens = [shape[0] for shape in train_data_shapes]
        self.train_batches = self._get_batch_indices(train_data_lens)

        if valid_data_shapes is not None:
            valid_data_lens = [shape[0] for shape in valid_data_shapes]
            self.valid_batches = self._get_batch_indices(valid_data_lens)
        else:
            self.valid_batches = None
        if test_data_shapes is not None:
            test_data_lens = [shape[0] for shape in test_data_shapes]
            self.test_batches = self._get_batch_indices(test_data_lens)
        else:
            self.test_batches = None

        # create the givens for the input function as pairs of (input_variable: sliced_data)
        train_givens = self._get_givens_subset(TRAIN, batch_slice)
        valid_givens = self._get_givens_subset(VALID, batch_slice)
        test_givens = self._get_givens_subset(TEST, batch_slice)

        # Now time to create the gradient updates for the model - make sure to handle the possible
        # list of costs used for pretraining of certain parts of the model.
        train_costs = raise_to_list(self.model.get_train_cost())
        train_updates = []
        self.gradients = []
        for i, train_cost in enumerate(train_costs):
            # Now create the training cost function for the model to use while training - update parameters
            # gradient!
            gradients, _ = self.model.get_gradient(cost=train_cost)
            self.gradients.append(gradients)

            # Calculate the optimizer updates each run
            # This is where the magic happens for a lot of sub-implementations of SGD!
            # It tells how to update the params each training epoch
            gradient_updates = self.get_updates(gradients)

            # Combine the updates from the model also if applicable
            updates = self.model.get_updates()
            if updates:
                updates.update(gradient_updates)
            else:
                updates = gradient_updates
            train_updates.append(updates)

        # grab the model parameters to use during training
        self.params = self.model.get_params()
        log.info("%s params: %s", str(type(self.model)), str(self.params))

        # deal with the monitor channels if they were given (or take them from the plot)
        if monitor_channels is None and plot is not None and len(plot.channels) > 0:
            monitor_channels = plot.channels
        self.train_monitors_dict = {}
        self.valid_monitors_dict = {}
        self.test_monitors_dict = {}
        self.train_monitors_outservice_dict = {}
        self.valid_monitors_outservice_dict = {}
        self.test_monitors_outservice_dict = {}
        if monitor_channels:
            # collapse the appropriate monitors into their (name, expression, out_service) tuples
            train_collapsed = collapse_channels(monitor_channels, train=True)
            valid_collapsed = collapse_channels(monitor_channels, valid=True)
            test_collapsed  = collapse_channels(monitor_channels, test=True)
            # get name: expression dictionary
            self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
            self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
            self.test_monitors_dict  = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
            # get name: outservice dictionary
            self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
            self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
            self.test_monitors_outservice_dict  = OrderedDict([(name, out) for name, _, out in test_collapsed])
        # finally deal with an outservice provided to monitor training cost
        self.train_outservice = train_outservice
        # remove redundant files made by the fileservice for the train monitor.
        # TODO: THIS FEELS LIKE A HACK. I don't like it.
        if isinstance(self.train_outservice, FileService):
            os.remove(self.train_outservice.valid_filename)
            os.remove(self.train_outservice.test_filename)

        #######################################
        # compile train and monitor functions #
        #######################################
        train_functions = []
        for i in range(len(train_costs)):
            updates = train_updates[i]
            train_cost = train_costs[i]
            # Compile the training function!
            log.info('Compiling f_learn %d/%d function for model %s...', i + 1, len(train_updates),
                     str(type(self.model)))
            t = time.time()

            f_learn = function(inputs=function_input,
                               updates=updates,
                               outputs=[train_cost] + self.train_monitors_dict.values(),
                               givens=train_givens,
                               name='f_learn_%d' % i)

            log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))
            train_functions.append(f_learn)

        # figure out if we want valid and test
        self.valid_flag = (self.dataset.getSubset(VALID)[0] is not None) and (len(self.valid_monitors_dict) > 0)
        self.test_flag = (self.dataset.getSubset(TEST)[0] is not None) and (len(self.test_monitors_dict) > 0)
        # Now compile the monitor functions!
        log.debug("Compiling monitor functions...")
        monitor_t = time.time()
        # valid monitors
        if self.valid_flag:
            self.valid_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=self.valid_monitors_dict.values(),
                givens=valid_givens,
                name='valid_monitor_function'
            )
        else:
            self.valid_monitor_function = None

        # test monitors
        if self.test_flag:
            self.test_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=self.test_monitors_dict.values(),
                givens=test_givens,
                name='test_monitor_function'
            )
        else:
            self.test_monitor_function = None

        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

        ##################
        # start training #
        ##################
        # make sure to deal with a list of train_cost functions - for layer-wise pretraining!
        # this list of training functions was created during __init__()
        start_time = time.time()
        for func_i, train_function in enumerate(train_functions):
            log.info("-----------TRAINING %s function %d/%d FOR %d EPOCHS (continue_training=%s)-----------",
                     str(type(self.model)), func_i + 1, len(train_functions), self.n_epoch, str(continue_training))

            log.debug("Train dataset size is: %s", self.dataset.getDataShape(TRAIN))
            if self.dataset.getSubset(VALID)[0] is not None:
                log.debug("Valid dataset size is: %s", self.dataset.getDataShape(VALID))
            if self.dataset.getSubset(TEST)[0] is not None:
                log.debug("Test dataset size is: %s", self.dataset.getDataShape(TEST))

            self.STOP = False
            self.epoch_counter = 0
            if not continue_training:
                # reset any decay params
                for decay_param in self.get_decay_params():
                    decay_param.reset()

            self.times = []
            self.best_cost = numpy.inf
            self.best_params = None
            self.patience = 0

            t = time.time()

            while not self.STOP:
                try:
                    self.STOP = self._perform_one_epoch(train_function, plot)
                except KeyboardInterrupt:
                    log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                    self.STOP = True

            # save params
            if self.best_params is not None:
                log.debug("Restoring best model parameters...")
                set_shared_values(self.params, self.best_params)
            log.debug("Saving model parameters...")
            self.model.save_params('trained_epoch_' + str(self.epoch_counter) + '.pkl')

            log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))

        log.info("------------TOTAL %s TRAIN TIME TOOK %s---------",
                 str(type(self.model)), make_time_units_string(time.time() - start_time))
Exemple #30
0
    def __init__(self,
                 model,
                 dataset,
                 iterator_class=SequentialIterator,
                 config=None,
                 defaults=_defaults,
                 rng=None,
                 n_epoch=None,
                 batch_size=None,
                 minimum_batch_size=None,
                 save_frequency=None,
                 early_stop_threshold=None,
                 early_stop_length=None,
                 learning_rate=None,
                 lr_decay=None,
                 lr_factor=None,
                 momentum=None,
                 momentum_decay=None,
                 momentum_factor=None,
                 nesterov_momentum=None,
                 flag_para_load=None):
        # superclass init
        super(SGD, self).__init__(config=config, defaults=defaults)
        # config and defaults are now combined in self.args! yay!

        self.model = model
        self.dataset = dataset
        self.iterator = iterator_class

        # Training epochs - how many times to iterate over the whole dataset
        self.n_epoch = n_epoch or self.args.get('n_epoch')

        # Dataset iteration batch sizes - number of examples in each calculation
        self.batch_size = batch_size or self.args.get('batch_size')
        self.minimum_batch_size = minimum_batch_size or self.args.get(
            'minimum_batch_size')

        # Number of epochs between saving model parameters
        self.save_frequency = save_frequency or self.args.get('save_frequency')

        # Early stopping threshold and patience - by how much does the cost have to improve over a number of epochs
        self.early_stop_threshold = early_stop_threshold or self.args.get(
            'early_stop_threshold')
        self.early_stop_length = early_stop_length or self.args.get(
            'early_stop_length')

        # Learning rate - how drastic of a step do the parameters change
        lr = learning_rate or self.args.get('learning_rate')
        self.learning_rate = sharedX(lr, 'learning_rate')
        self.lr_scalers = self.model.get_lr_scalers()
        if lr_decay or self.args.get('lr_decay'):
            self.learning_rate_decay = get_decay_function(
                lr_decay or self.args.get('lr_decay'), self.learning_rate,
                self.learning_rate.get_value(), lr_factor
                or self.args.get('lr_factor'))

        # Momentum - smoothing over the parameter changes (see Hinton)
        self.momentum = sharedX(momentum or self.args.get('momentum'),
                                'momentum')
        if self.args.get('momentum_decay'):
            self.momentum_decay = get_decay_function(
                momentum_decay or self.args.get('momentum_decay'),
                self.momentum, self.momentum.get_value(), momentum_factor
                or self.args.get('momentum_factor'))
        self.nesterov_momentum = nesterov_momentum or self.args.get(
            'nesterov_momentum')

        # RNG for working on random iterator
        if rng is None:
            random.seed(123)
            self.rng = random
        else:
            self.rng = rng

        self.params = self.model.get_params()

        # Now create the training cost function for the model to use while training - update parameters
        log.info("%s params: %s", str(type(self.model)), str(self.params))
        # gradient!
        gradient = grad(self.model.get_train_cost(), self.params)
        grads = OrderedDict(zip(self.params, gradient))

        # Calculate the optimizer updates each run
        # This is where the magic happens for a lot of sub-implementations of SGD, including AdaDelta!
        # It tells how to update the params each training epoch
        gradient_updates = self.get_updates(grads)

        # Combine the updates from the model also if applicable
        train_updates = model.get_updates()
        if train_updates:
            train_updates.update(gradient_updates)
        else:
            train_updates = gradient_updates

        # Compile the training function!
        log.info('Compiling f_learn function for model %s...',
                 str(type(self.model)))
        t = time.time()
        self.f_learn = function(inputs=model.get_inputs(),
                                updates=train_updates,
                                outputs=self.model.get_train_cost(),
                                name='f_learn')
        log.info('f_learn compilation took %s',
                 make_time_units_string(time.time() - t))

        # Determine if this function is unsupervised or not by looking at the number of inputs to the f_learn function.
        # If there is only one input, it is unsupervised, otherwise, it is supervised.
        # This workaround was provided by Pascal Lamblin on the theano-users google group
        num_inputs = len(
            [i for i in self.f_learn.maker.inputs if not i.shared])
        if num_inputs == 1:
            log.debug("Model is unsupervised: 1 input to f_learn.")
            self.unsupervised = True
        elif num_inputs == 2:
            log.debug("Model is supervised: 2 inputs to f_learn.")
            self.unsupervised = False
        else:
            log.error(
                "Number of inputs to f_learn on model %s was %s. Needs to be 1 for unsupervised or 2 for supervised.",
                str(type(self.model)), str(num_inputs))
            raise AssertionError(
                "Number of inputs to f_learn on model %s was %s. Needs to be 1 for unsupervised or 2 for supervised."
                % str(type(self.model)), str(num_inputs))

        # grab the function(s) to use to monitor different model values during training
        self.monitors = self.model.get_monitors()
Exemple #31
0
    def _build_computation_graph(self):
        ###################### BUILD NETWORK ##########################
        # whether or not to mirror the input images before feeding them into the network
        if self.flag_datalayer:
            layer_1_input = mirror_images(
                input=self.x,
                image_shape=(self.batch_size, 3, 256, 256),  # bc01 format
                cropsize=227,
                rand=self.rand,
                flag_rand=self.rand_crop)
        else:
            layer_1_input = self.x  # 4D tensor (going to be in bc01 format)

        # Start with 5 convolutional pooling layers
        log.debug("convpool layer 1...")
        convpool_layer1 = ConvPoolLayer(inputs_hook=((self.batch_size, 3, 227,
                                                      227), layer_1_input),
                                        filter_shape=(96, 3, 11, 11),
                                        convstride=4,
                                        padsize=0,
                                        group=1,
                                        poolsize=3,
                                        poolstride=2,
                                        bias_init=0.0,
                                        local_response_normalization=True)
        # Add this layer's parameters!
        self.params += convpool_layer1.get_params()

        log.debug("convpool layer 2...")
        convpool_layer2 = ConvPoolLayer(inputs_hook=((
            self.batch_size,
            96,
            27,
            27,
        ), convpool_layer1.get_outputs()),
                                        filter_shape=(256, 96, 5, 5),
                                        convstride=1,
                                        padsize=2,
                                        group=2,
                                        poolsize=3,
                                        poolstride=2,
                                        bias_init=0.1,
                                        local_response_normalization=True)
        # Add this layer's parameters!
        self.params += convpool_layer2.get_params()

        log.debug("convpool layer 3...")
        convpool_layer3 = ConvPoolLayer(
            inputs_hook=((self.batch_size, 256, 13, 13),
                         convpool_layer2.get_outputs()),
            filter_shape=(384, 256, 3, 3),
            convstride=1,
            padsize=1,
            group=1,
            poolsize=1,
            poolstride=0,
            bias_init=0.0,
            local_response_normalization=False)
        # Add this layer's parameters!
        self.params += convpool_layer3.get_params()

        log.debug("convpool layer 4...")
        convpool_layer4 = ConvPoolLayer(
            inputs_hook=((self.batch_size, 384, 13, 13),
                         convpool_layer3.get_outputs()),
            filter_shape=(384, 384, 3, 3),
            convstride=1,
            padsize=1,
            group=2,
            poolsize=1,
            poolstride=0,
            bias_init=0.1,
            local_response_normalization=False)
        # Add this layer's parameters!
        self.params += convpool_layer4.get_params()

        log.debug("convpool layer 5...")
        convpool_layer5 = ConvPoolLayer(
            inputs_hook=((self.batch_size, 384, 13, 13),
                         convpool_layer4.get_outputs()),
            filter_shape=(256, 384, 3, 3),
            convstride=1,
            padsize=1,
            group=2,
            poolsize=3,
            poolstride=2,
            bias_init=0.0,
            local_response_normalization=False)
        # Add this layer's parameters!
        self.params += convpool_layer5.get_params()

        # Now onto the fully-connected layers!
        fc_config = {
            'activation':
            'rectifier',  # type of activation function to use for output
            'weights_init':
            'gaussian',  # either 'gaussian' or 'uniform' - how to initialize weights
            'weights_mean': 0.0,  # mean for gaussian weights init
            'weights_std':
            0.005,  # standard deviation for gaussian weights init
            'bias_init': 0.0  # how to initialize the bias parameter
        }
        log.debug("fully connected layer 1 (model layer 6)...")
        # we want to have dropout applied to the training version, but not the test version.
        fc_layer6_input = T.flatten(convpool_layer5.get_outputs(), 2)
        fc_layer6 = BasicLayer(inputs_hook=(9216, fc_layer6_input),
                               output_size=4096,
                               noise='dropout',
                               noise_level=0.5,
                               **fc_config)
        # Add this layer's parameters!
        self.params += fc_layer6.get_params()
        # Add the dropout noise switch
        self.noise_switches += fc_layer6.get_noise_switch()

        log.debug("fully connected layer 2 (model layer 7)...")
        fc_layer7 = BasicLayer(inputs_hook=(4096, fc_layer6.get_outputs()),
                               output_size=4096,
                               noise='dropout',
                               noise_level=0.5,
                               **fc_config)

        # Add this layer's parameters!
        self.params += fc_layer7.get_params()
        # Add the dropout noise switch
        self.noise_switches += fc_layer7.get_noise_switch()

        # last layer is a softmax prediction output layer
        softmax_config = {
            'weights_init': 'gaussian',
            'weights_mean': 0.0,
            'weights_std': 0.005,
            'bias_init': 0.0
        }
        log.debug("softmax classification layer (model layer 8)...")
        softmax_layer8 = SoftmaxLayer(inputs_hook=(4096,
                                                   fc_layer7.get_outputs()),
                                      output_size=1000,
                                      **softmax_config)

        # Add this layer's parameters!
        self.params += softmax_layer8.get_params()

        # finally the softmax output from the whole thing!
        self.output = softmax_layer8.get_outputs()
        self.targets = softmax_layer8.get_targets()

        #####################
        # Cost and monitors #
        #####################
        self.train_cost = softmax_layer8.negative_log_likelihood()
        cost = softmax_layer8.negative_log_likelihood()
        errors = softmax_layer8.errors()
        train_errors = softmax_layer8.errors()

        self.monitors = OrderedDict([('cost', cost), ('errors', errors),
                                     ('dropout_errors', train_errors)])

        #########################
        # Compile the functions #
        #########################
        log.debug("Compiling functions!")
        t = time.time()
        log.debug("f_run...")
        # use the actual argmax from the classification
        self.f_run = function(inputs=[self.x],
                              outputs=softmax_layer8.get_argmax_prediction())
        log.debug("compilation took %s",
                  make_time_units_string(time.time() - t))
    def train(self, monitor_channels=None, plot=None):
        """
        This method performs the training!!!
        It is an online training method that goes over minibatches from the dataset for a number of epochs,
        updating parameters after each minibatch.

        You can disrupt training with a KeyBoardInterrupt and it should exit/save parameters gracefully.

        Parameters
        ----------
        monitor_channels : list(MonitorsChannel or Monitor), optional
            The list of channels or monitors containing monitor expressions/variables to compile and evaluate
            on the data.
        plot : Plot, optional
            The Plot object to use if we want to graph the outputs (uses bokeh server).
        """
        if not self.model:
            log.error("No self.model for the Optimizer!")
            raise AssertionError("Needs to be initialized with a Model! (Or something went wrong if train() "
                                 "was called from the Model. Try initializing the Optimizer with the model param "
                                 "and calling optimizer.train().")

        #########################
        # gradients and updates #
        #########################
        # grab the model parameters to use during training
        self.params = self.model.get_params()
        # Now create the training cost function for the model to use while training - update parameters
        # gradient!
        # First find the basic variables that will be updated
        params = set()
        for param in self.params.values():
            params.update(base_variables(param))
        params = list(params)
        gradients = grad(cost=self.loss_expression, wrt=params)
        # now create the dictionary mapping the parameter with its gradient
        gradients = OrderedDict(
            [(param, g) for param, g in zip(params, gradients)]
        )
        # clip gradients if we want.
        gradients = clip_gradients(gradients, self.grad_clip, self.hard_clip)

        # Calculate the optimizer updates each run
        # This is where the magic happens for a lot of sub-implementations of SGD!
        # It tells how to update the params each training epoch
        gradient_updates = self.get_updates(gradients)

        # Combine the updates from the model also if applicable
        updates = self.model.get_updates()
        if updates:
            updates.update(gradient_updates)
        else:
            updates = gradient_updates

        log.info("%s params: %s", self.model._classname, str(list(self.params.keys())))

        ############
        # monitors #
        ############
        # deal with the monitor channels if they were given (or take them from the plot)
        if monitor_channels is None and plot is not None and len(plot.channels) > 0:
            monitor_channels = plot.channels
        self.train_monitors_dict = {}
        self.valid_monitors_dict = {}
        self.test_monitors_dict = {}
        self.train_monitors_outservice_dict = {}
        self.valid_monitors_outservice_dict = {}
        self.test_monitors_outservice_dict = {}
        if monitor_channels:
            # collapse the appropriate monitors into their (name, expression, out_service) tuples
            train_collapsed = collapse_channels(monitor_channels, train=True)
            valid_collapsed = collapse_channels(monitor_channels, valid=True)
            test_collapsed  = collapse_channels(monitor_channels, test=True)
            # get name: expression dictionary
            self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
            self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
            self.test_monitors_dict  = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
            # get name: outservice dictionary
            self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
            self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
            self.test_monitors_outservice_dict  = OrderedDict([(name, out) for name, _, out in test_collapsed])

        #######################################
        # compile train and monitor functions #
        #######################################
        function_input = raise_to_list(self.model.get_inputs())
        if self.loss_targets is not None:
            function_input += self.loss_targets
        # Compile the training function!
        log.info('Compiling f_learn function for model %s...', self.model._classname)
        t = time.time()

        f_learn = function(inputs=function_input,
                           updates=updates,
                           outputs=[self.loss_expression] + list(self.train_monitors_dict.values()),
                           name='f_learn')

        log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))

        # figure out if we want valid and test (monitors)
        self.valid_flag = (self.dataset.valid_inputs is not None) and (len(self.valid_monitors_dict) > 0)
        self.test_flag = (self.dataset.test_inputs is not None) and (len(self.test_monitors_dict) > 0)
        # Now compile the monitor functions!
        log.debug("Compiling monitor functions...")
        monitor_t = time.time()
        # valid monitors
        if self.valid_flag:
            self.valid_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.valid_monitors_dict.values()),
                name='valid_monitor_function'
            )
        else:
            self.valid_monitor_function = None

        # test monitors
        if self.test_flag:
            self.test_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.test_monitors_dict.values()),
                name='test_monitor_function'
            )
        else:
            self.test_monitor_function = None

        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

        ##################
        # start training #
        ##################
        log.info("-----------TRAINING %s FOR %d EPOCHS-----------",
                 self.model._classname, self.n_epoch)

        self.STOP = False
        self.epoch_counter = 0
        # reset any decay params
        for decay_param in self.get_decay_params():
            decay_param.reset()

        self.times = []
        self.best_cost = numpy.inf
        self.best_params = None
        self.patience = 0

        t = time.time()

        while not self.STOP:
            try:
                self.STOP = self._perform_one_epoch(f_learn, plot)
            except KeyboardInterrupt:
                log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                self.STOP = True

        # save params
        if self.best_params is not None:
            log.debug("Restoring best model parameters...")
            self.model.set_param_values(self.best_params, borrow=False)
        log.debug("Saving model parameters...")
        self.model.save_params('trained_epoch_' + str(self.epoch_counter))

        log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))
    def _perform_one_epoch(self, f_learn, plot=None):
        """
        Performs a single training iteration with the given learn function.
        """
        self.epoch_counter += 1
        t = time.time()
        log.info('EPOCH %s', str(self.epoch_counter))

        # set the noise switches on for training function! (this is where things like dropout happen)
        if not self.model.switches_on:
            self.model.turn_on_switches()

        #########
        # train #
        #########
        train_costs = []
        train_monitors = {key: [] for key in self.train_monitors_dict.keys()}
        train_data = [
            minibatch(input_data, self.batch_size, self.min_batch_size)
            for input_data in raise_to_list(self.dataset.train_inputs)
            ]
        if self.dataset.train_targets is not None and not self.unsupervised:
            train_data += [
                minibatch(target, self.batch_size, self.min_batch_size)
                for target in raise_to_list(self.dataset.train_targets)
                ]

        for batch in min_normalized_izip(*train_data):
            _outs = raise_to_list(f_learn(*batch))
            train_costs.append(_outs[0])
            # handle any user defined monitors (if different from the train cost)
            if len(train_monitors) > 0:
                current_monitors = zip(self.train_monitors_dict.keys(), _outs[1:])
                for name, val in current_monitors:
                    val = numpy.asarray(val)
                    train_monitors[name].append(val)

        # get the mean values for the batches
        mean_train = numpy.mean(train_costs, 0)
        current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in train_monitors.items()}
        # log the mean values!
        log.info('Train cost: %s', trunc(mean_train))
        if len(current_mean_monitors) > 0:
            log.info('Train monitors: %s', str(current_mean_monitors))
        # send the values to their outservices
        for name, service in self.train_monitors_outservice_dict.items():
            if name in current_mean_monitors and service:
                service.write(current_mean_monitors[name], "train")
        # if there is a plot, also send them over!
        if plot:
            plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)

        # set the noise switches off for valid and test sets! we assume unseen data is noisy anyway :)
        if self.model.switches_on:
            self.model.turn_off_switches()

        #########
        # valid #
        #########
        self._compute_over_subset("valid", self.dataset.valid_inputs, self.dataset.valid_targets,
                                  self.valid_monitors_dict, self.valid_monitor_function,
                                  self.valid_monitors_outservice_dict, plot)

        ########
        # test #
        ########
        self._compute_over_subset("test", self.dataset.test_inputs, self.dataset.test_targets,
                                  self.test_monitors_dict, self.test_monitor_function,
                                  self.test_monitors_outservice_dict, plot)

        ###########
        # cleanup #
        ###########
        # check for early stopping on train costs
        cost = numpy.sum(train_costs)
        # if the cost improved, reset the patience and record the best cost.
        if cost < self.best_cost * self.early_stop_threshold:
            self.patience = 0
            self.best_cost = cost
            # save the parameters that made it the best
            self.best_params = self.model.get_param_values(borrow=False)
        elif not numpy.isnan(cost):
            self.patience += 1

        # check for stopping either from n_epochs or from threshold/patience
        stop = False
        if self.epoch_counter >= self.n_epoch:
            log.info("Stopping (reached max number of epochs)...")
            stop = True
        if self.patience >= self.early_stop_length:
            log.info("Stopping early (reached stop threshold)...")
            stop = True

        timing = time.time() - t
        self.times.append(timing)

        log.info('time: ' + make_time_units_string(timing))

        log.debug('remaining time: ' +
                 make_time_units_string((self.n_epoch - self.epoch_counter) * numpy.mean(self.times)))

        if (self.epoch_counter % self.save_frequency) == 0:
            #save params
            self.model.save_params('trained_epoch_' + str(self.epoch_counter))

        # ANNEAL!
        if not stop:
            # perform the appropriate decay on the decay functions/parameters for this optimizer and model
            for decay_param in self.get_decay_params():
                decay_param.decay()

        # return whether or not to stop this epoch
        return stop
Exemple #34
0
    def train(self, monitor_channels=None, train_outservice=None, plot=None, additional_cost=None):
        """
        This method performs the training!!!
        It is an online training method that goes over minibatches from the dataset for a number of epochs,
        updating parameters after each minibatch.

        You can disrupt training with a KeyBoardInterrupt and it should exit/save parameters gracefully.

        Parameters
        ----------
        monitor_channels : list(MonitorsChannel or Monitor), optional
            The list of channels or monitors containing monitor expressions/variables to compile and evaluate
            on the data.
        train_outservice : OutService, optional
            The OutService to use for the automatically created train_cost monitor. Default of None just outputs
            to logs.
        plot : Plot, optional
            The Plot object to use if we want to graph the outputs (uses bokeh server).
        additional_cost : theano expression or list(theano expression), optional
            Any additional cost expressions to use during training (things like regularization). These will be summed
            with the existing cost.
        """
        if not self.model:
            log.error("No self.model for the Optimizer!")
            raise AssertionError("Needs to be initialized with a Model! (Or something went wrong if train() "
                                 "was called from the Model. Try initializing the Optimizer with the model param "
                                 "and calling optimizer.train().")

        #####################################################
        # handle additional costs (normally regularization) #
        #####################################################
        # Create the gradient updates for the model - make sure to handle the possible
        # list of costs used for pretraining of certain parts of the model.
        train_costs = raise_to_list(self.model.get_train_cost())
        # deal with any other additional costs (like regularization, etc.)
        if additional_cost is not None:
            additional_costs = raise_to_list(additional_cost)
            if len(additional_costs) > 1:
                additional_cost = T.sum(additional_costs)

        #########################
        # gradients and updates #
        #########################
        train_updates = []
        self.gradients = []
        for i, train_cost in enumerate(train_costs):
            # Now create the training cost function for the model to use while training - update parameters
            # gradient!
            if len(train_costs) > 1 and additional_cost is not None:
                log.warning("additional_cost will double count with gradients during layer-wise pretraining!")
                warnings.warn("additional_cost will double count with gradients during layer-wise pretraining!")
            # TODO: additional_cost will double count with gradients during layer-wise pretraining.
            # Need to somehow make w.r.t. params appropriate for the individual training costs.
            gradients, _ = self.model.get_gradient(cost=train_cost, additional_cost=additional_cost)
            # clip gradients if we want.
            gradients = clip_gradients(gradients, self.grad_clip, self.hard_clip)
            # append to list
            self.gradients.append(gradients)

            # Calculate the optimizer updates each run
            # This is where the magic happens for a lot of sub-implementations of SGD!
            # It tells how to update the params each training epoch
            gradient_updates = self.get_updates(gradients)

            # Combine the updates from the model also if applicable
            updates = self.model.get_updates()
            if updates:
                updates.update(gradient_updates)
            else:
                updates = gradient_updates
            train_updates.append(updates)

        # grab the model parameters to use during training
        self.params = self.model.get_params()
        log.info("%s params: %s", str(type(self.model)), str(self.params))

        ############
        # monitors #
        ############
        # deal with the monitor channels if they were given (or take them from the plot)
        if monitor_channels is None and plot is not None and len(plot.channels) > 0:
            monitor_channels = plot.channels
        self.train_monitors_dict = {}
        self.valid_monitors_dict = {}
        self.test_monitors_dict = {}
        self.train_monitors_outservice_dict = {}
        self.valid_monitors_outservice_dict = {}
        self.test_monitors_outservice_dict = {}
        if monitor_channels:
            # collapse the appropriate monitors into their (name, expression, out_service) tuples
            train_collapsed = collapse_channels(monitor_channels, train=True)
            valid_collapsed = collapse_channels(monitor_channels, valid=True)
            test_collapsed  = collapse_channels(monitor_channels, test=True)
            # get name: expression dictionary
            self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
            self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
            self.test_monitors_dict  = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
            # get name: outservice dictionary
            self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
            self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
            self.test_monitors_outservice_dict  = OrderedDict([(name, out) for name, _, out in test_collapsed])
        # finally deal with an outservice provided to monitor training cost
        self.train_outservice = train_outservice
        # remove redundant files made by the fileservice for the train monitor.
        # TODO: THIS FEELS LIKE A HACK. I don't like it.
        if isinstance(self.train_outservice, FileService):
            os.remove(self.train_outservice.valid_filename)
            os.remove(self.train_outservice.test_filename)

        #######################################
        # compile train and monitor functions #
        #######################################
        function_input = raise_to_list(self.model.get_inputs()) + raise_to_list(self.model.get_targets())
        train_functions = []
        for i, (updates, train_cost) in enumerate(zip(train_updates, train_costs)):
            # Compile the training function!
            log.info('Compiling f_learn %d/%d function for model %s...', i + 1, len(train_updates),
                     str(type(self.model)))
            t = time.time()

            f_learn = function(inputs=function_input,
                               updates=updates,
                               outputs=[train_cost] + list(self.train_monitors_dict.values()),
                               name='f_learn_%d' % i)

            log.info('f_learn %d compilation took %s', i + 1, make_time_units_string(time.time() - t))
            train_functions.append(f_learn)

        # figure out if we want valid and test (monitors)
        self.valid_flag = (self.dataset.valid_inputs is not None) and (len(self.valid_monitors_dict) > 0)
        self.test_flag = (self.dataset.test_inputs is not None) and (len(self.test_monitors_dict) > 0)
        # Now compile the monitor functions!
        log.debug("Compiling monitor functions...")
        monitor_t = time.time()
        # valid monitors
        if self.valid_flag:
            self.valid_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.valid_monitors_dict.values()),
                name='valid_monitor_function'
            )
        else:
            self.valid_monitor_function = None

        # test monitors
        if self.test_flag:
            self.test_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.test_monitors_dict.values()),
                name='test_monitor_function'
            )
        else:
            self.test_monitor_function = None

        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

        ##################
        # start training #
        ##################
        # make sure to deal with a list of train_cost functions - for layer-wise pretraining!
        # this list of training functions was created during __init__()
        start_time = time.time()
        for func_i, train_function in enumerate(train_functions):
            log.info("-----------TRAINING %s function %d/%d FOR %d EPOCHS-----------",
                     str(type(self.model)), func_i + 1, len(train_functions), self.n_epoch)

            self.STOP = False
            self.epoch_counter = 0
            # reset any decay params
            for decay_param in self.get_decay_params():
                decay_param.reset()

            self.times = []
            self.best_cost = numpy.inf
            self.best_params = None
            self.patience = 0

            t = time.time()

            while not self.STOP:
                try:
                    self.STOP = self._perform_one_epoch(train_function, plot)
                except KeyboardInterrupt:
                    log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                    self.STOP = True

            # save params
            if self.best_params is not None:
                log.debug("Restoring best model parameters...")
                set_shared_values(self.params, self.best_params)
            log.debug("Saving model parameters...")
            self.model.save_params('trained_epoch_' + str(self.epoch_counter))

            log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))

        log.info("------------TOTAL %s TRAIN TIME TOOK %s---------",
                 str(type(self.model)), make_time_units_string(time.time() - start_time))
    def build_computation_graph(self):
        #################
        # Build the GSN #
        #################
        log.debug("Building GSN graphs...")

        # GSN for training - with noise specified in initialization
        # if there is no hiddens_hook, build the GSN normally using the input X
        if not self.hiddens_flag:
            p_X_chain, _ = self.build_gsn(add_noise=self.add_noise)

        # if there is a hiddens_hook, we want to change the order layers are updated and make this purely
        # generative from the hiddens
        else:
            p_X_chain, _,  = self.build_gsn(hiddens=self.hiddens, add_noise=self.add_noise, reverse=True)

        # GSN for prediction - same as above but no noise
        # deal with hiddens_hook exactly as above.
        if not self.hiddens_flag:
            p_X_chain_recon, recon_hiddens = self.build_gsn(add_noise=False)
        else:
            p_X_chain_recon, recon_hiddens = self.build_gsn(hiddens=self.hiddens, add_noise=False, reverse=True)

        ####################
        # Costs and output #
        ####################
        log.debug('Cost w.r.t p(X|...) at every step in the graph for the GSN')
        # use the noisy ones for training cost
        costs          = [self.cost_function(output=rX, target=self.X, **self.cost_args) for rX in p_X_chain]
        self.show_cost = costs[-1]  # for a monitor to show progress
        cost           = numpy.sum(costs)  # THIS IS THE TRAINING COST - RECONSTRUCTION OF OUTPUT FROM NOISY GRAPH

        # use the non-noisy graph for prediction
        gsn_costs_recon = [self.cost_function(output=rX, target=self.X, **self.cost_args) for rX in p_X_chain_recon]
        # another monitor, same as self.show_cost but on the non-noisy graph.
        self.monitor    = gsn_costs_recon[-1]
        # this should be considered the main output of the computation, the sample after the
        # last walkback from the non-noisy graph.
        output     = p_X_chain_recon[-1]
        # these should be considered the model's hidden representation - the hidden representation after
        # the last walkback from the non-noisy graph.
        hiddens    = recon_hiddens

        train_mse = T.mean(T.sqr(p_X_chain[-1] - self.X), axis=0)
        train_mse = T.mean(train_mse)

        mse = T.mean(T.sqr(p_X_chain_recon[-1] - self.X), axis=0)
        mse = T.mean(mse)

        monitors = OrderedDict([('noisy_recon_cost', self.show_cost),
                                ('recon_cost', self.monitor),
                                ('mse', mse),
                                ('train_mse', train_mse)])

        ############
        # Sampling #
        ############
        # the input to the sampling function
        X_sample = T.matrix("X_sampling")
        self.network_state_input = [X_sample] + [T.matrix("H_sampling_"+str(i+1)) for i in range(self.layers)]
       
        # "Output" state of the network (noisy)
        # initialized with input, then we apply updates
        self.network_state_output = [X_sample] + self.network_state_input[1:]
        visible_pX_chain = []
    
        # ONE update
        log.debug("Performing one walkback in network state sampling.")
        self.update_layers(self.network_state_output, visible_pX_chain, add_noise=True, reverse=False)

        #####################################################
        #     Create the run and monitor functions      #
        #####################################################
        log.debug("Compiling functions...")
        t = time.time()

        # doesn't make sense to have this if there is a hiddens_hook
        if not self.hiddens_flag:
            # THIS IS THE MAIN PREDICT FUNCTION - takes in a real matrix and produces the output from the non-noisy
            # computation graph
            log.debug("f_run...")
            self.f_run = function(inputs  = [self.X],
                                  outputs = output,
                                  name    = 'gsn_f_run')

        # this is a helper function - it corrupts inputs when testing the non-noisy graph (aka before feeding the
        # input to f_run)
        log.debug("f_noise...")
        self.f_noise = function(inputs  = [self.X],
                                outputs = self.input_noise(self.X),
                                name    = 'gsn_f_noise')

        # the sampling function, for creating lots of samples from the computational graph. (mostly for log-likelihood
        # or visualization)
        log.debug("f_sample...")
        if self.layers == 1: 
            self.f_sample = function(inputs  = [X_sample],
                                     outputs = visible_pX_chain[-1],
                                     name    = 'gsn_f_sample_single_layer')
        else:
            # WHY IS THERE A WARNING????
            # because the first odd layers are not used -> directly computed FROM THE EVEN layers
            # unused input = warn
            self.f_sample = function(inputs  = self.network_state_input,
                                     outputs = self.network_state_output + visible_pX_chain,
                                     name    = 'gsn_f_sample')

        log.debug("GSN compiling done. Took %s", make_time_units_string(time.time() - t))

        return cost, monitors, output, hiddens
Exemple #36
0
    def _perform_one_epoch(self, f_learn):
        self.epoch_counter += 1
        t = time.time()
        log.info('EPOCH %s', str(self.epoch_counter))

        # set the noise switches on for training function! (this is where things like dropout happen)
        if len(self.noise_switches) > 0:
            log.debug("Turning on %s noise switches", str(len(self.noise_switches)))
            switch_vals = [switch.get_value() for switch in self.noise_switches]
            [switch.set_value(0.) for switch in self.noise_switches]

        # train
        train_costs = []
        train_monitors = {key: [] for key in self.monitors.keys()}
        for batch_start, batch_end in self.train_batches:
            train_costs.append(f_learn(batch_start, batch_end))
            self.call_monitors(monitor_function=self.train_monitor_function,
                               monitors_dict=train_monitors,
                               inputs=[batch_start, batch_end])
        log.info('Train cost: %s', trunc(numpy.mean(train_costs, 0)))
        if len(self.monitors.keys()) > 0:
            log.info('Train monitors: %s',
                     str({key: numpy.mean(value, 0) for key, value in train_monitors.items()}))


        # set the noise switches off for valid and test sets! we assume unseen data is noisy anyway :)
        if len(self.noise_switches) > 0:
            log.debug("Turning off %s noise switches", str(len(self.noise_switches)))
            [switch.set_value(0.) for switch in self.noise_switches]
        # valid
        if self.dataset.hasSubset(VALID) and len(self.monitors.keys()) > 0:
            valid_monitors = {key: [] for key in self.monitors.keys()}
            for batch_start, batch_end in self.valid_batches:
                self.call_monitors(monitor_function=self.valid_monitor_function,
                                   monitors_dict=valid_monitors,
                                   inputs=[batch_start, batch_end])
            log.info('Valid monitors: %s',
                     str({key: numpy.mean(value, 0) for key, value in valid_monitors.items()}))

        #test
        if self.dataset.hasSubset(TEST) and len(self.monitors.keys()) > 0:
            test_monitors = {key: [] for key in self.monitors.keys()}
            for batch_start, batch_end in self.test_batches:
                self.call_monitors(monitor_function=self.test_monitor_function,
                                   monitors_dict=test_monitors,
                                   inputs=[batch_start, batch_end])
            log.info('Test monitors: %s', str({key: numpy.mean(value, 0) for key, value in test_monitors.items()}))

        # check for early stopping on train costs
        cost = numpy.sum(train_costs)
        if cost < self.best_cost * self.early_stop_threshold:
            self.patience = 0
            self.best_cost = cost
            # save the parameters that made it the best
            self.best_params = get_shared_values(self.params)
        else:
            self.patience += 1

        # check for stopping either from n_epochs or from threshold/patience
        stop = False
        if self.epoch_counter >= self.n_epoch:
            log.info("Stopping (reached max number of epochs)...")
            stop = True
        if self.patience >= self.early_stop_length:
            log.info("Stopping early (reached stop threshold)...")
            stop = True

        timing = time.time() - t
        self.times.append(timing)

        log.info('time: ' + make_time_units_string(timing))

        log.info('remaining time: ' +
                 make_time_units_string((self.n_epoch - self.epoch_counter) * numpy.mean(self.times)))

        if (self.epoch_counter % self.save_frequency) == 0:
            #save params
            self.model.save_params('trained_epoch_' + str(self.epoch_counter) + '.pkl')

        # ANNEAL!
        if not stop:
            if hasattr(self, 'learning_rate_decay') and self.learning_rate_decay:
                self.learning_rate_decay.decay()
            if hasattr(self, 'momentum_decay') and self.momentum_decay:
                self.momentum_decay.decay()
            for decay_param in self.model.get_decay_params():
                decay_param.decay()

        # reset the switches
        if len(self.noise_switches) > 0:
            [switch.set_value(val) for switch, val in zip(self.noise_switches, switch_vals)]

        # return whether or not to stop this epoch
        return stop
Exemple #37
0
def main():
    var = theano.shared(T.zeros(shape=(88, 100),
                                dtype=theano.config.floatX).eval(),
                        name='W')
    updates = [(var, add_uniform(input=var, noise_level=.02))]

    stats = get_stats(var)
    l1 = stats.pop('l1')
    l2 = stats.pop('l2')
    min = stats.pop('min')
    max = stats.pop('max')
    var = stats.pop('var')
    std = stats.pop('std')
    mean = stats.pop('mean')

    mean_monitor = Monitor('mean', mean, train=True, valid=True)
    var_monitor = Monitor('var', var)

    w_channel = MonitorsChannel('W', monitors=mean_monitor)

    stat_channel = MonitorsChannel('stats', monitors=[var_monitor])

    monitors = [w_channel, stat_channel]

    train_collapsed = collapse_channels(monitors, train=True)
    train_collapsed = OrderedDict([(name, expression)
                                   for name, expression, _ in train_collapsed])
    valid_collapsed = collapse_channels(monitors, valid=True)
    valid_collapsed = OrderedDict([(name, expression)
                                   for name, expression, _ in valid_collapsed])

    plot = Plot(bokeh_doc_name='test_plots',
                monitor_channels=monitors,
                open_browser=True)

    log.debug('compiling...')
    f = theano.function(inputs=[],
                        outputs=list(train_collapsed.values()),
                        updates=updates)
    f2 = theano.function(inputs=[],
                         outputs=list(valid_collapsed.values()),
                         updates=updates)
    log.debug('done')

    t1 = time.time()

    for epoch in range(100):
        t = time.time()
        log.debug(epoch)
        vals = f()
        m = OrderedDict(zip(train_collapsed.keys(), vals))
        plot.update_plots(epoch, m)
        log.debug('----- ' + make_time_units_string(time.time() - t))

    for epoch in range(100):
        t = time.time()
        log.debug(epoch)
        vals = f2()
        m = OrderedDict(zip(valid_collapsed.keys(), vals))
        plot.update_plots(epoch, m)
        log.debug('----- ' + make_time_units_string(time.time() - t))

    log.debug("TOTAL TIME " + make_time_units_string(time.time() - t1))
    def _build_computation_graph(self):
        ###################### BUILD NETWORK ##########################
        # whether or not to mirror the input images before feeding them into the network
        if self.flag_datalayer:
            layer_1_input = mirror_images(input=self.x,
                                          image_shape=(self.batch_size, 3, 256, 256),  # bc01 format
                                          cropsize=227,
                                          rand=self.rand,
                                          flag_rand=self.rand_crop)
        else:
            layer_1_input = self.x  # 4D tensor (going to be in bc01 format)

        # Start with 5 convolutional pooling layers
        log.debug("convpool layer 1...")
        convpool_layer1 = ConvPoolLayer(inputs_hook=((self.batch_size, 3, 227, 227), layer_1_input),
                                        filter_shape=(96, 3, 11, 11),
                                        convstride=4,
                                        padsize=0,
                                        group=1,
                                        poolsize=3,
                                        poolstride=2,
                                        bias_init=0.0,
                                        local_response_normalization=True)
        # Add this layer's parameters!
        self.params += convpool_layer1.get_params()

        log.debug("convpool layer 2...")
        convpool_layer2 = ConvPoolLayer(inputs_hook=((self.batch_size, 96, 27, 27, ), convpool_layer1.get_outputs()),
                                        filter_shape=(256, 96, 5, 5),
                                        convstride=1,
                                        padsize=2,
                                        group=2,
                                        poolsize=3,
                                        poolstride=2,
                                        bias_init=0.1,
                                        local_response_normalization=True)
        # Add this layer's parameters!
        self.params += convpool_layer2.get_params()

        log.debug("convpool layer 3...")
        convpool_layer3 = ConvPoolLayer(inputs_hook=((self.batch_size, 256, 13, 13), convpool_layer2.get_outputs()),
                                        filter_shape=(384, 256, 3, 3),
                                        convstride=1,
                                        padsize=1,
                                        group=1,
                                        poolsize=1,
                                        poolstride=0,
                                        bias_init=0.0,
                                        local_response_normalization=False)
        # Add this layer's parameters!
        self.params += convpool_layer3.get_params()

        log.debug("convpool layer 4...")
        convpool_layer4 = ConvPoolLayer(inputs_hook=((self.batch_size, 384, 13, 13), convpool_layer3.get_outputs()),
                                        filter_shape=(384, 384, 3, 3),
                                        convstride=1,
                                        padsize=1,
                                        group=2,
                                        poolsize=1,
                                        poolstride=0,
                                        bias_init=0.1,
                                        local_response_normalization=False)
        # Add this layer's parameters!
        self.params += convpool_layer4.get_params()

        log.debug("convpool layer 5...")
        convpool_layer5 = ConvPoolLayer(inputs_hook=((self.batch_size, 384, 13, 13), convpool_layer4.get_outputs()),
                                        filter_shape=(256, 384, 3, 3),
                                        convstride=1,
                                        padsize=1,
                                        group=2,
                                        poolsize=3,
                                        poolstride=2,
                                        bias_init=0.0,
                                        local_response_normalization=False)
        # Add this layer's parameters!
        self.params += convpool_layer5.get_params()

        # Now onto the fully-connected layers!
        fc_config = {
            'activation': 'rectifier',  # type of activation function to use for output
            'weights_init': 'gaussian',  # either 'gaussian' or 'uniform' - how to initialize weights
            'weights_mean': 0.0,  # mean for gaussian weights init
            'weights_std': 0.005,  # standard deviation for gaussian weights init
            'bias_init': 0.0  # how to initialize the bias parameter
        }
        log.debug("fully connected layer 1 (model layer 6)...")
        # we want to have dropout applied to the training version, but not the test version.
        fc_layer6_input = T.flatten(convpool_layer5.get_outputs(), 2)
        fc_layer6 = BasicLayer(inputs_hook=(9216, fc_layer6_input),
                               output_size=4096,
                               noise='dropout',
                               noise_level=0.5,
                               **fc_config)
        # Add this layer's parameters!
        self.params += fc_layer6.get_params()
        # Add the dropout noise switch
        self.noise_switches += fc_layer6.get_noise_switch()

        log.debug("fully connected layer 2 (model layer 7)...")
        fc_layer7 = BasicLayer(inputs_hook=(4096, fc_layer6.get_outputs()),
                               output_size=4096,
                               noise='dropout',
                               noise_level=0.5,
                               **fc_config)

        # Add this layer's parameters!
        self.params += fc_layer7.get_params()
        # Add the dropout noise switch
        self.noise_switches += fc_layer7.get_noise_switch()

        # last layer is a softmax prediction output layer
        softmax_config = {
            'weights_init': 'gaussian',
            'weights_mean': 0.0,
            'weights_std': 0.005,
            'bias_init': 0.0
        }
        log.debug("softmax classification layer (model layer 8)...")
        softmax_layer8 = SoftmaxLayer(inputs_hook=(4096, fc_layer7.get_outputs()),
                                      output_size=1000,
                                      **softmax_config)

        # Add this layer's parameters!
        self.params += softmax_layer8.get_params()

        # finally the softmax output from the whole thing!
        self.output = softmax_layer8.get_outputs()
        self.targets = softmax_layer8.get_targets()

        #####################
        # Cost and monitors #
        #####################
        self.train_cost = softmax_layer8.negative_log_likelihood()
        cost = softmax_layer8.negative_log_likelihood()
        errors = softmax_layer8.errors()
        train_errors = softmax_layer8.errors()

        self.monitors = OrderedDict([('cost', cost), ('errors', errors), ('dropout_errors', train_errors)])

        #########################
        # Compile the functions #
        #########################
        log.debug("Compiling functions!")
        t = time.time()
        log.debug("f_run...")
        # use the actual argmax from the classification
        self.f_run = function(inputs=[self.x], outputs=softmax_layer8.get_argmax_prediction())
        log.debug("compilation took %s", make_time_units_string(time.time() - t))
Exemple #39
0
    def build_computation_graph(self):
        #################
        # Build the GSN #
        #################
        log.debug("Building GSN graphs...")

        # GSN for training - with noise specified in initialization
        # if there is no hiddens_hook, build the GSN normally using the input X
        if not self.hiddens_flag:
            p_X_chain, _ = self.build_gsn(add_noise=self.add_noise)

        # if there is a hiddens_hook, we want to change the order layers are updated and make this purely
        # generative from the hiddens
        else:
            p_X_chain, _, = self.build_gsn(hiddens=self.hiddens,
                                           add_noise=self.add_noise,
                                           reverse=True)

        # GSN for prediction - same as above but no noise
        # deal with hiddens_hook exactly as above.
        if not self.hiddens_flag:
            p_X_chain_recon, recon_hiddens = self.build_gsn(add_noise=False)
        else:
            p_X_chain_recon, recon_hiddens = self.build_gsn(
                hiddens=self.hiddens, add_noise=False, reverse=True)

        ####################
        # Costs and output #
        ####################
        log.debug('Cost w.r.t p(X|...) at every step in the graph for the GSN')
        # use the noisy ones for training cost
        costs = [
            self.cost_function(output=rX, target=self.X, **self.cost_args)
            for rX in p_X_chain
        ]
        self.show_cost = costs[-1]  # for a monitor to show progress
        cost = numpy.sum(
            costs
        )  # THIS IS THE TRAINING COST - RECONSTRUCTION OF OUTPUT FROM NOISY GRAPH

        # use the non-noisy graph for prediction
        gsn_costs_recon = [
            self.cost_function(output=rX, target=self.X, **self.cost_args)
            for rX in p_X_chain_recon
        ]
        # another monitor, same as self.show_cost but on the non-noisy graph.
        self.monitor = gsn_costs_recon[-1]
        # this should be considered the main output of the computation, the sample after the
        # last walkback from the non-noisy graph.
        output = p_X_chain_recon[-1]
        # these should be considered the model's hidden representation - the hidden representation after
        # the last walkback from the non-noisy graph.
        hiddens = recon_hiddens

        train_mse = T.mean(T.sqr(p_X_chain[-1] - self.X), axis=0)
        train_mse = T.mean(train_mse)

        mse = T.mean(T.sqr(p_X_chain_recon[-1] - self.X), axis=0)
        mse = T.mean(mse)

        monitors = OrderedDict([('noisy_recon_cost', self.show_cost),
                                ('recon_cost', self.monitor), ('mse', mse),
                                ('train_mse', train_mse)])

        ############
        # Sampling #
        ############
        # the input to the sampling function
        X_sample = T.matrix("X_sampling")
        self.network_state_input = [X_sample] + [
            T.matrix("H_sampling_" + str(i + 1)) for i in range(self.layers)
        ]

        # "Output" state of the network (noisy)
        # initialized with input, then we apply updates
        self.network_state_output = [X_sample] + self.network_state_input[1:]
        visible_pX_chain = []

        # ONE update
        log.debug("Performing one walkback in network state sampling.")
        self.update_layers(self.network_state_output,
                           visible_pX_chain,
                           add_noise=True,
                           reverse=False)

        #####################################################
        #     Create the run and monitor functions      #
        #####################################################
        log.debug("Compiling functions...")
        t = time.time()

        # doesn't make sense to have this if there is a hiddens_hook
        if not self.hiddens_flag:
            # THIS IS THE MAIN PREDICT FUNCTION - takes in a real matrix and produces the output from the non-noisy
            # computation graph
            log.debug("f_run...")
            self.f_run = function(inputs=[self.X],
                                  outputs=output,
                                  name='gsn_f_run')

        # this is a helper function - it corrupts inputs when testing the non-noisy graph (aka before feeding the
        # input to f_run)
        log.debug("f_noise...")
        self.f_noise = function(inputs=[self.X],
                                outputs=self.input_noise(self.X),
                                name='gsn_f_noise')

        # the sampling function, for creating lots of samples from the computational graph. (mostly for log-likelihood
        # or visualization)
        log.debug("f_sample...")
        if self.layers == 1:
            self.f_sample = function(inputs=[X_sample],
                                     outputs=visible_pX_chain[-1],
                                     name='gsn_f_sample_single_layer')
        else:
            # WHY IS THERE A WARNING????
            # because the first odd layers are not used -> directly computed FROM THE EVEN layers
            # unused input = warn
            self.f_sample = function(inputs=self.network_state_input,
                                     outputs=self.network_state_output +
                                     visible_pX_chain,
                                     name='gsn_f_sample')

        log.debug("GSN compiling done. Took %s",
                  make_time_units_string(time.time() - t))

        return cost, monitors, output, hiddens