Esempio n. 1
0
    def __init__(self, step_rule=None, gradients=None, known_grads=None,
                 consider_constant=None, on_unused_sources='raise',
                 theano_func_kwargs=None, **kwargs):
        if gradients:
            kwargs.setdefault("parameters", gradients.keys())
        super(GradientDescent, self).__init__(**kwargs)

        self.gradients = gradients
        if not self.gradients:
            logger.info("Taking the cost gradient")
            self.gradients = dict(
                equizip(self.parameters, tensor.grad(
                    self.cost, self.parameters,
                    known_grads=known_grads,
                    consider_constant=consider_constant)))
            logger.info("The cost gradient computation graph is built")
        else:
            if known_grads:
                raise ValueError("known_grads has no effect when gradients "
                                 "are passed in")
            if consider_constant is not None:
                raise ValueError("consider_constant has no effect when "
                                 "gradients are passed in")
        self.step_rule = step_rule if step_rule else Scale()

        self.total_gradient_norm = l2_norm(
            self.gradients.values()).copy(name="total_gradient_norm")
        self.steps, self.step_rule_updates = (
            self.step_rule.compute_steps(self.gradients))
        self.total_step_norm = l2_norm(
            self.steps.values()).copy(name="total_step_norm")
        self.on_unused_sources = on_unused_sources
        self.theano_func_kwargs = (theano_func_kwargs if theano_func_kwargs
                                   is not None else dict())
Esempio n. 2
0
    def __init__(self, step_rule=None, gradients=None, known_grads=None,
                 consider_constant=None, on_unused_sources='raise',
                 theano_func_kwargs=None, **kwargs):
        if gradients:
            kwargs.setdefault("parameters", gradients.keys())
        super(GradientDescent, self).__init__(**kwargs)

        self.gradients = gradients
        if not self.gradients:
            logger.info("Taking the cost gradient")
            self.gradients = dict(
                equizip(self.parameters, tensor.grad(
                    self.cost, self.parameters,
                    known_grads=known_grads,
                    consider_constant=consider_constant)))
            logger.info("The cost gradient computation graph is built")
        else:
            if known_grads:
                raise ValueError("known_grads has no effect when gradients "
                                 "are passed in")
            if consider_constant is not None:
                raise ValueError("consider_constant has no effect when "
                                 "gradients are passed in")
        self.step_rule = step_rule if step_rule else Scale()

        self.total_gradient_norm = l2_norm(
            self.gradients.values()).copy(name="total_gradient_norm")
        self.steps, self.step_rule_updates = (
            self.step_rule.compute_steps(self.gradients))
        self.total_step_norm = l2_norm(
            self.steps.values()).copy(name="total_step_norm")
        self.on_unused_sources = on_unused_sources
        self.theano_func_kwargs = (theano_func_kwargs if theano_func_kwargs
                                   is not None else dict())
Esempio n. 3
0
    def __init__(self, step_rule=None, gradients=None, known_grads=None,
                 **kwargs):
        if gradients:
            kwargs.setdefault("params", gradients.keys())
        super(GradientDescent, self).__init__(**kwargs)

        self.gradients = gradients
        if not self.gradients:
            logger.info("Taking the cost gradient")
            self.gradients = dict(
                equizip(self.params, tensor.grad(self.cost, self.params,
                                                 known_grads=known_grads)))
            logger.info("The cost gradient computation graph is built")
        else:
            if known_grads:
                raise ValueError("known_grads has no effect when gradients "
                                 "are passed in")
        self.step_rule = step_rule if step_rule else Scale()

        self.total_gradient_norm = named_copy(l2_norm(self.gradients.values()),
                                              "total_gradient_norm")
        self.steps, self.step_rule_updates = (
            self.step_rule.compute_steps(self.gradients))
        self.total_step_norm = named_copy(l2_norm(self.steps.values()),
                                          "total_step_norm")
Esempio n. 4
0
    def __init__(self,
                 step_rule=None,
                 gradients=None,
                 known_grads=None,
                 **kwargs):
        if gradients:
            kwargs.setdefault("params", gradients.keys())
        super(GradientDescent, self).__init__(**kwargs)

        self.gradients = gradients
        if not self.gradients:
            logger.info("Taking the cost gradient")
            self.gradients = dict(
                equizip(
                    self.params,
                    tensor.grad(self.cost,
                                self.params,
                                known_grads=known_grads)))
            logger.info("The cost gradient computation graph is built")
        else:
            if known_grads:
                raise ValueError("known_grads has no effect when gradients "
                                 "are passed in")
        self.step_rule = step_rule if step_rule else Scale()

        self.total_gradient_norm = named_copy(l2_norm(self.gradients.values()),
                                              "total_gradient_norm")
        self.steps, self.step_rule_updates = (self.step_rule.compute_steps(
            self.gradients))
        self.total_step_norm = named_copy(l2_norm(self.steps.values()),
                                          "total_step_norm")
Esempio n. 5
0
def test_l2_norm():
    assert_allclose(l2_norm([2]).eval(), 2.0)
    assert_allclose(l2_norm([3, 4]).eval(), 5.0)
    assert_allclose(l2_norm([3, [1, 2]]).eval(), 14.0**0.5)
    assert_allclose(l2_norm([3, [1, 2], [[1, 2], [3, 4]]]).eval(), 44.0**0.5)
    assert_allclose(
        l2_norm([3, [1, 2], [[1, 2], [3, 4]]], squared=True).eval(), 44.0)
Esempio n. 6
0
def test_l2_norm():
    assert_allclose(l2_norm([2]).eval(), 2.0)
    assert_allclose(l2_norm([3, 4]).eval(), 5.0)
    assert_allclose(l2_norm([3, [1, 2]]).eval(), 14.0 ** 0.5)
    assert_allclose(
        l2_norm([3, [1, 2], [[1, 2], [3, 4]]]).eval(), 44.0 ** 0.5)
    assert_allclose(
        l2_norm([3, [1, 2], [[1, 2], [3, 4]]], squared=True).eval(), 44.0)
Esempio n. 7
0
    def __init__(self,
                 cost,
                 params,
                 subtensor_params={},
                 step_rule=None,
                 *args,
                 **kwargs):
        full_params = params
        self.subtensor_params = subtensor_params

        # For each LookupTable, we replace it by its subtensors appearing in the graph
        params = [
            param for param in full_params if param not in subtensor_params
        ]
        for _, (_, _, outputs, _) in subtensor_params.iteritems():
            params.extend(outputs)

        super(GradientDescent, self).__init__(cost=cost,
                                              params=params,
                                              **kwargs)
        # self.params contains the list of outputs of the lookup tables

        logger.info("Taking the cost gradient")
        self.gradients = dict(
            equizip(self.params, tensor.grad(self.cost, self.params)))

        # We combine the gradients extracted from the same parameter
        for param, (subparam, canonized_indices, outputs,
                    indices) in subtensor_params.iteritems():
            # This is necessary if we want to compute the l2 norm correctly (e.g. for StepClipping)
            tmp = shared_floatx(param.get_value() * 0.)
            for (output, indice) in zip(outputs, indices):
                tmp = tensor.inc_subtensor(tmp[indice], self.gradients[output])
                del self.gradients[output]
            self.gradients[subparam] = tmp[canonized_indices]

        # We remove the subtensors from the list of parameters
        self.params = full_params

        logger.info("The cost gradient computation graph is built")

        self.step_rule = step_rule if step_rule else Scale()

        self.total_gradient_norm = named_copy(l2_norm(self.gradients.values()),
                                              "total_gradient_norm")
        self.steps, self.step_rule_updates = (self.step_rule.compute_steps(
            self.gradients))
        self.total_step_norm = named_copy(l2_norm(self.steps.values()),
                                          "total_step_norm")
Esempio n. 8
0
    def compute_steps(self, previous_steps):
        # if not hasattr(self, 'threshold'):
        #    return previous_steps

        adapt_steps_up = self.adapt_steps + 1.0

        # This will quickly converge the estimate for the mean
        cut_rho_mean = tensor.minimum(self.decay,
                                      self.adapt_steps / adapt_steps_up)
        if self.quick_variance_convergence:
            cut_rho_mean2 = cut_rho_mean
        else:
            cut_rho_mean2 = self.decay

        gnorm = l2_norm(previous_steps.values())
        gnorm_log = tensor.log(l2_norm(previous_steps.values()))

        # here we quiclky converge the mean
        gnorm_log_ave_up = (cut_rho_mean * self.gnorm_log_ave +
                            (1. - cut_rho_mean) * gnorm_log)

        # this can wait as it starts from 0 anyways!
        gnorm_log2_ave_up = (cut_rho_mean2 * self.gnorm_log2_ave +
                             (1. - cut_rho_mean2) * (gnorm_log ** 2))

        clip_threshold_up = tensor.exp(
            gnorm_log_ave_up +
            tensor.sqrt(tensor.maximum(0.0,
                                       gnorm_log2_ave_up -
                                       gnorm_log_ave_up ** 2)
                        ) * self.stdevs)

        if self.clip_to_mean:
            clip_level_up = tensor.exp(gnorm_log_ave_up)
        else:
            clip_level_up = clip_threshold_up

        multiplier = tensor.switch(gnorm < clip_threshold_up,
                                   1, clip_level_up / gnorm)
        steps = OrderedDict(
            (parameter, step * multiplier)
            for parameter, step in previous_steps.items())

        return steps, [(self.adapt_steps, adapt_steps_up),
                       (self.gnorm_log_ave, gnorm_log_ave_up),
                       (self.gnorm_log2_ave, gnorm_log2_ave_up),
                       (self.clip_threshold, clip_threshold_up),
                       (self.clip_level, clip_level_up)]
Esempio n. 9
0
 def compute_steps(self, previous_steps):
     if not hasattr(self, "threshold"):
         return previous_steps
     norm = l2_norm(previous_steps.values())
     multiplier = tensor.switch(norm < self.threshold, 1, self.threshold / norm)
     steps = OrderedDict((parameter, step * multiplier) for parameter, step in previous_steps.items())
     return steps, []
Esempio n. 10
0
    def __init__(self, step_rule=None, gradients=None, **kwargs):
        super(GradientDescent, self).__init__(**kwargs)
        self.gradients = gradients
        if not self.gradients:
            logger.info("Taking the cost gradient")
            self.gradients = dict(
                zip(self.params, tensor.grad(self.cost, self.params)))
            logger.info("The cost gradient computation graph is built")
        self.step_rule = step_rule if step_rule else SteepestDescent()

        self.total_gradient_norm = named_copy(l2_norm(self.gradients.values()),
                                              "total_gradient_norm")
        self.steps, self.step_rule_updates = (self.step_rule.compute_steps(
            self.gradients))
        self.total_step_norm = named_copy(l2_norm(self.steps.values()),
                                          "total_step_norm")
Esempio n. 11
0
    def compute_step(self, param, previous_step):
        grad_norm = l2_norm([previous_step])
        not_finite = tensor.or_(tensor.isnan(grad_norm),
                                tensor.isinf(grad_norm))
        step = tensor.switch(not_finite, self.scaler * param, previous_step)

        return step, []
Esempio n. 12
0
 def compute_steps(self, gradients):
     if not hasattr(self, 'threshold'):
         return gradients
     norm = l2_norm(gradients.values())
     multiplier = tensor.switch(norm < self.threshold, 1,
                                self.threshold / norm)
     steps = OrderedDict((param, gradient * multiplier)
                         for param, gradient in gradients.items())
     return steps, []
Esempio n. 13
0
    def compute_steps(self, steps):
        # memorize steps for one time step
        self.last_steps = OrderedDict()
        updates = []
        for parameter, step in steps.items():
            last_step = shared_floatx(parameter.get_value() * 0.,
                                      "last_step_%s" % parameter.name)
            add_role(last_step, ALGORITHM_BUFFER)
            updates.append((last_step, step))
            self.last_steps[parameter] = last_step

        # compare last and current step directions
        self.cosine = (sum(
            (step * self.last_steps[parameter]).sum()
            for parameter, step in steps.items()) / l2_norm(steps.values()) /
                       l2_norm(self.last_steps.values()))

        return steps, updates
Esempio n. 14
0
 def compute_steps(self, previous_steps):
     if not hasattr(self, 'threshold'):
         return previous_steps
     norm = l2_norm(previous_steps.values())
     multiplier = tensor.switch(norm < self.threshold, 1,
                                self.threshold / norm)
     steps = OrderedDict((param, step * multiplier)
                         for param, step in previous_steps.items())
     return steps, []
Esempio n. 15
0
    def compute_steps(self, previous_steps):
        # if not hasattr(self, 'threshold'):
        #    return previous_steps

        adapt_steps_up = self.adapt_steps + 1.0

        # This will quickly converge the estimate for the mean
        cut_rho_mean = tensor.minimum(self.decay, self.adapt_steps / adapt_steps_up)
        if self.quick_variance_convergence:
            cut_rho_mean2 = cut_rho_mean
        else:
            cut_rho_mean2 = self.decay

        gnorm = l2_norm(previous_steps.values())
        gnorm_log = tensor.log(l2_norm(previous_steps.values()))

        # here we quiclky converge the mean
        gnorm_log_ave_up = cut_rho_mean * self.gnorm_log_ave + (1.0 - cut_rho_mean) * gnorm_log

        # this can wait as it starts from 0 anyways!
        gnorm_log2_ave_up = cut_rho_mean2 * self.gnorm_log2_ave + (1.0 - cut_rho_mean2) * (gnorm_log ** 2)

        clip_threshold_up = tensor.exp(
            gnorm_log_ave_up + tensor.sqrt(tensor.maximum(0.0, gnorm_log2_ave_up - gnorm_log_ave_up ** 2)) * self.stdevs
        )

        if self.clip_to_mean:
            clip_level_up = tensor.exp(gnorm_log_ave_up)
        else:
            clip_level_up = clip_threshold_up

        multiplier = tensor.switch(gnorm < clip_threshold_up, 1, clip_level_up / gnorm)
        steps = OrderedDict((parameter, step * multiplier) for parameter, step in previous_steps.items())

        return (
            steps,
            [
                (self.adapt_steps, adapt_steps_up),
                (self.gnorm_log_ave, gnorm_log_ave_up),
                (self.gnorm_log2_ave, gnorm_log2_ave_up),
                (self.clip_threshold, clip_threshold_up),
                (self.clip_level, clip_level_up),
            ],
        )
Esempio n. 16
0
    def compute_steps(self, steps):
        # memorize steps for one time step
        self.last_steps = OrderedDict()
        updates = []
        for parameter, step in steps.items():
            last_step = shared_floatx(
                parameter.get_value() * 0.,
                "last_step_%s" % parameter.name)
            add_role(last_step, ALGORITHM_BUFFER)
            updates.append((last_step, step))
            self.last_steps[parameter] = last_step

        # compare last and current step directions
        self.cosine = (sum((step * self.last_steps[parameter]).sum()
                           for parameter, step in steps.items())
                       / l2_norm(steps.values())
                       / l2_norm(self.last_steps.values()))

        return steps, updates
Esempio n. 17
0
 def compute_steps(self, previous_steps):
     if self.threshold is None:
         steps = previous_steps
     else:
         norm = l2_norm(previous_steps.values())
         multiplier = tensor.switch(norm < self.threshold, 1,
                                    self.threshold / norm)
         steps = OrderedDict((parameter, step * multiplier)
                             for parameter, step in previous_steps.items())
     return steps, []
Esempio n. 18
0
 def compute_steps(self, previous_steps):
     if self.threshold is None:
         steps = previous_steps
     else:
         norm = l2_norm(previous_steps.values())
         multiplier = tensor.switch(norm < self.threshold,
                                    1, self.threshold / norm)
         steps = OrderedDict(
             (parameter, step * multiplier)
             for parameter, step in previous_steps.items())
     return steps, []
Esempio n. 19
0
    def _apply_reg(self, cost, params=None, *args, **kwargs):
        try:
            if self.config.l2_norm > 0:
                cost = cost + self.config.l2_norm * theano_expressions.l2_norm(
                    tensors=[self.hashtag_embed.W, self.word_embed.W])**2

            else:
                pass
        except Exception:
            pass
        return cost
Esempio n. 20
0
 def compute_step(self, parameter, previous_step):
     if any(ax >= previous_step.ndim for ax in self.axis):
         raise ValueError("Invalid axis {} for {}, ndim={}".format(self.axis, parameter, previous_step.ndim))
     if len(self.axis) == 0:
         norms = l2_norm([parameter - previous_step])
     else:
         squares = tensor.sqr(parameter - previous_step)
         norms = tensor.sqrt(reduce(lambda t, a: t.sum(axis=a, keepdims=True), sorted(self.axis), squares))
     # We want a step s* that is the same as scaling
     # (parameter - previous_step) by threshold / norm
     # when threshold < norm.
     shrinking_step = parameter - (self.threshold / norms) * (parameter - previous_step)
     return tensor.switch(norms > self.threshold, shrinking_step, previous_step), ()
Esempio n. 21
0
    def __init__(self, cost, params, subtensor_params={}, step_rule=None, *args, **kwargs):
        full_params = params
        self.subtensor_params = subtensor_params

        # For each LookupTable, we replace it by its subtensors appearing in the graph
        params = [param for param in full_params if param not in subtensor_params]
        for _, (_, _, outputs, _) in subtensor_params.iteritems():
            params.extend(outputs)

        super(GradientDescent, self).__init__(cost=cost, params=params, **kwargs)
        # self.params contains the list of outputs of the lookup tables

        logger.info("Taking the cost gradient")
        self.gradients = dict(
            equizip(self.params, tensor.grad(self.cost, self.params)))

        # We combine the gradients extracted from the same parameter
        for param, (subparam, canonized_indices, outputs, indices) in subtensor_params.iteritems():
            # This is necessary if we want to compute the l2 norm correctly (e.g. for StepClipping)
            tmp = shared_floatx(param.get_value() * 0.)
            for (output, indice) in zip(outputs, indices):
                tmp = tensor.inc_subtensor(tmp[indice], self.gradients[output])
                del self.gradients[output]
            self.gradients[subparam] = tmp[canonized_indices]

        # We remove the subtensors from the list of parameters
        self.params = full_params

        logger.info("The cost gradient computation graph is built")

        self.step_rule = step_rule if step_rule else Scale()

        self.total_gradient_norm = named_copy(l2_norm(self.gradients.values()),
                                              "total_gradient_norm")
        self.steps, self.step_rule_updates = (
            self.step_rule.compute_steps(self.gradients))
        self.total_step_norm = named_copy(l2_norm(self.steps.values()),
                                          "total_step_norm")
Esempio n. 22
0
    def __init__(self,
                 gen_obj,
                 dis_obj,
                 step_rule,
                 model,
                 gen_consider_constant,
                 dis_iter=1,
                 gradient_clip=[-0.01, 0.01],
                 **kwargs):
        super(AdverserialTraning, self).__init__(**kwargs)

        self.model = model

        self.gen_cost = gen_obj
        self.dis_cost = dis_obj

        self.dis_iter = dis_iter
        self._n_call = 0
        self.gradient_clip = gradient_clip

        self.gen_gradients = self._compute_gradients(
            gen_obj,
            self.model.gen_params,
            consider_constant=gen_consider_constant,
            name='Generator')

        self.dis_gradients = self._compute_gradients(dis_obj,
                                                     self.model.dis_params,
                                                     name='Discriminator')
        gradient_values = self.dis_gradients + self.gen_gradients

        self.total_gradient_norm = (l2_norm(gradient_values).copy(
            name="total_gradient_norm"))

        self.step_rule = step_rule
        logger.debug("Computing parameter steps...")

        self._dis_updates = self._prepare_updates(self.model.dis_params,
                                                  self.dis_gradients,
                                                  self.step_rule,
                                                  name='Discriminator')

        self._gen_updates = self._prepare_updates(self.model.gen_params,
                                                  self.gen_gradients,
                                                  self.step_rule,
                                                  name='Generator')
Esempio n. 23
0
 def compute_step(self, param, previous_step):
     if any(ax >= previous_step.ndim for ax in self.axis):
         raise ValueError("Invalid axis {} for {}, ndim={}".format(
             self.axis, param, previous_step.ndim))
     if len(self.axis) == 0:
         norms = l2_norm([param - previous_step])
     else:
         squares = tensor.sqr(param - previous_step)
         norms = tensor.sqrt(
             reduce(lambda t, a: t.sum(axis=a, keepdims=True),
                    sorted(self.axis), squares))
     # We want a step s* that is the same as scaling (param - previous_step)
     # by threshold / norm when threshold < norm.
     shrinking_step = (param - (self.threshold / norms) *
                       (param - previous_step))
     return tensor.switch(norms > self.threshold, shrinking_step,
                          previous_step), ()
Esempio n. 24
0
 def _apply_reg(self, cost, params=None, *args, **kwargs):
     '''
     Apply regularization (default L2 norm) on parameters (default user, hashtag and word embedding)
     :param params: A list of parameters to which regularization applied
     :return:
     '''
     try:
         if self.config.l2_norm > 0:
             cost = cost + self.config.l2_norm * theano_expressions.l2_norm(
                 tensors=[
                     self.user_embed.W, self.hashtag_embed.W,
                     self.word_embed.W
                 ])**2
         else:
             pass
     except Exception:
         pass
     return cost
Esempio n. 25
0
def l2_regularization(cg, rate=0.01):
    """compute L2 regularization decay.

    Parameters
    ----------
    cg : ComputationGraph
        computation graph for a network
    rate : float
        L2 regularization rate

    Returns
    -------
    L2_cost : expression
        L2 cost for a network
    """
    W = VariableFilter(roles=[WEIGHT])(cg.variables)
    L2_cost = rate * l2_norm(W)

    return L2_cost
Esempio n. 26
0
    def compute_steps(self, previous_steps):
        def median(window):
            return tensor.sort(window)[self.window.shape[0] / 2]

        self.median = median(self.window)

        # allow within 1 median absolute deviation
        #self.deviation = median(abs(self.window - self.median))
        #self.max_ratio = 1 + self.deviation / self.median
        self.max_ratio = 1.

        self.norm = l2_norm(previous_steps.values())
        self.ratio = self.norm / self.median
        acceptable = self.ratio <= self.max_ratio
        multiplier = (
            tensor.switch(acceptable,
                          # smaller steps are used as is
                          1,
                          # larger steps are pushed down
                          self.norm ** (1 / self.ratio) / self.norm))
        self.newnorm = multiplier * self.norm

        newwindow = tensor.concatenate([
            tensor.shape_padleft(self.norm),
            self.window[:(self.window_width - 1)]
        ], axis=0)

        # let the norm affect the median only if it was acceptable
        # or if the window hasn't been fully populated yet
        #newwindow = ifelse(
        #    acceptable + (self.window.shape[0] < self.window_width),
        #    tensor.concatenate([
        #        tensor.shape_padleft(self.norm),
        #        self.window[:(self.window_width - 1)]],
        #        axis=0),
        #    self.window)

        steps = OrderedDict(
            (parameter, multiplier * step)
            for parameter, step in previous_steps.items())
        updates = [(self.window, newwindow)]
        return steps, updates
Esempio n. 27
0
    def __init__(self, cost=None, parameters=None, step_rule=None,
                 gradients=None, known_grads=None, consider_constant=None,
                 **kwargs):
        # Set initial values for cost, parameters, gradients.
        self.cost = cost
        self.parameters = parameters
        # Coerce lists of tuples to OrderedDict. Do not coerce Mappings,
        # as we don't want to convert dict -> OrderedDict and give it
        # an arbitrary, non-deterministic order.
        if gradients is not None and not isinstance(gradients, Mapping):
            gradients = OrderedDict(gradients)
        self.gradients = gradients

        # If we don't have gradients, we'll need to infer them from the
        # cost and the parameters, both of which must not be None.
        if not self.gradients:
            self.gradients = self._compute_gradients(known_grads,
                                                     consider_constant)
        else:
            if cost is not None:
                logger.warning(('{}: gradients already specified directly; '
                                'cost is unused.'
                                .format(self.__class__.__name__)))
            if self.parameters is None and isinstance(gradients, OrderedDict):
                # If the dictionary is ordered, it's safe to use the keys
                # as they have a deterministic order.
                self.parameters = list(self.gradients.keys())
            elif self.parameters is not None:
                # If parameters and gradients.keys() don't match we can
                # try to recover if gradients is ordered.
                if set(self.parameters) != set(self.gradients.keys()):
                    logger.warn("Specified parameters list does not match "
                                "keys in provided gradient dictionary; "
                                "using parameters inferred from gradients")
                    if not isinstance(self.gradients, OrderedDict):
                        raise ValueError(determinism_error)
                    self.parameters = list(self.gradients.keys())
            else:
                # self.parameters is not None, and gradients isn't
                # an OrderedDict. We can't do anything safe.
                raise ValueError(determinism_error)
            if known_grads:
                raise ValueError("known_grads has no effect when gradients "
                                 "are passed in")
            if consider_constant is not None:
                raise ValueError("consider_constant has no effect when "
                                 "gradients are passed in")

        # The order in which the different gradient terms appears
        # here matters, as floating point addition is non-commutative (and
        # Theano's graph optimizations are not order-independent).
        # This is why we do not use .values().
        gradient_values = [self.gradients[p] for p in self.parameters]
        self.total_gradient_norm = (l2_norm(gradient_values)
                                    .copy(name="total_gradient_norm"))

        self.step_rule = step_rule if step_rule else Scale()
        logger.debug("Computing parameter steps...")
        self.steps, self.step_rule_updates = (
            self.step_rule.compute_steps(self.gradients))

        # Same as gradient_values above: the order may influence a
        # bunch of things, so enforce a consistent one (don't use
        # .values()).
        step_values = [self.steps[p] for p in self.parameters]
        self.total_step_norm = (l2_norm(step_values)
                                .copy(name="total_step_norm"))

        # Once again, iterating on gradients may not be deterministically
        # ordered if it is not an OrderedDict. We add the updates here in
        # the order specified in self.parameters. Keep it this way to
        # maintain reproducibility.
        kwargs.setdefault('updates', []).extend(
            itertools.chain(((parameter, parameter - self.steps[parameter])
                             for parameter in self.parameters),
                            self.step_rule_updates)
        )
        super(GradientDescent, self).__init__(**kwargs)
Esempio n. 28
0
def main(job_id, params):

    config = ConfigParser.ConfigParser()
    config.readfp(open('./params'))

    max_epoch = int(config.get('hyperparams', 'max_iter', 100))
    base_lr = float(config.get('hyperparams', 'base_lr', 0.01))
    train_batch = int(config.get('hyperparams', 'train_batch', 256))
    valid_batch = int(config.get('hyperparams', 'valid_batch', 512))
    test_batch = int(config.get('hyperparams', 'valid_batch', 512))
    hidden_units = int(config.get('hyperparams', 'hidden_units', 16))


    W_sd = float(config.get('hyperparams', 'W_sd', 0.01))
    W_mu = float(config.get('hyperparams', 'W_mu', 0.0))
    b_sd = float(config.get('hyperparams', 'b_sd', 0.01))
    b_mu = float(config.get('hyperparams', 'b_mu', 0.0))

    dropout_ratio = float(config.get('hyperparams', 'dropout_ratio', 0.2))
    weight_decay = float(config.get('hyperparams', 'weight_decay', 0.001))
    max_norm = float(config.get('hyperparams', 'max_norm', 100.0))
    solver = config.get('hyperparams', 'solver_type', 'rmsprop')
    data_file = config.get('hyperparams', 'data_file')
    fine_tune = config.getboolean('hyperparams', 'fine_tune')

    # Spearmint optimization parameters:
    if params:
        base_lr = float(params['base_lr'][0])
        dropout_ratio = float(params['dropout_ratio'][0])
        hidden_units = params['hidden_units'][0]
        weight_decay = params['weight_decay'][0]

    if 'adagrad' in solver:
        solver_type = CompositeRule([AdaGrad(learning_rate=base_lr), VariableClipping(threshold=max_norm)])
    else:
        solver_type = CompositeRule([RMSProp(learning_rate=base_lr), VariableClipping(threshold=max_norm)])

    rn_file = '/projects/francisco/repositories/NI-ML/models/deepnets/blocks/ff/models/rnet/2015-06-25-18:13'
    ln_file = '/projects/francisco/repositories/NI-ML/models/deepnets/blocks/ff/models/lnet/2015-06-29-11:45'

    right_dim = 10519
    left_dim = 11427

    train = H5PYDataset(data_file, which_set='train')
    valid = H5PYDataset(data_file, which_set='valid')
    test = H5PYDataset(data_file, which_set='test')

    l_x = tensor.matrix('l_features')
    r_x = tensor.matrix('r_features')
    y = tensor.lmatrix('targets')

    lnet = load(ln_file).model.get_top_bricks()[0]
    rnet = load(rn_file).model.get_top_bricks()[0]

    # Pre-trained layers:

    # Inputs -> hidden_1 -> hidden 2
    for side, net in zip(['l', 'r'], [lnet, rnet]):
        for child in net.children:
            child.name = side + '_' + child.name

    ll1 = lnet.children[0]
    lr1 = lnet.children[1]
    ll2 = lnet.children[2]
    lr2 = lnet.children[3]

    rl1 = rnet.children[0]
    rr1 = rnet.children[1]
    rl2 = rnet.children[2]
    rr2 = rnet.children[3]

    l_h = lr2.apply(ll2.apply(lr1.apply(ll1.apply(l_x))))
    r_h = rr2.apply(rl2.apply(rr1.apply(rl1.apply(r_x))))

    input_dim = ll2.output_dim + rl2.output_dim

    # hidden_2 -> hidden_3 -> hidden_4 -> Logistic output
    output_mlp = MLP(activations=[
        Rectifier(name='h3'),
        Rectifier(name='h4'),
        Softmax(name='output'),
    ],
                     dims=[
                         input_dim,
                         hidden_units,
                         hidden_units,
                         2,
                     ],
                     weights_init=IsotropicGaussian(std=W_sd, mean=W_mu),
                     biases_init=IsotropicGaussian(std=W_sd, mean=W_mu))

    output_mlp.initialize()

    # # Concatenate the inputs from the two hidden subnets into a single variable
    # # for input into the next layer.
    merge = tensor.concatenate([l_h, r_h], axis=1)
    #
    y_hat = output_mlp.apply(merge)

    # Define a cost function to optimize, and a classification error rate.
    # Also apply the outputs from the net and corresponding targets:
    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = 'error'

    # This is the model: before applying dropout
    model = Model(cost)

    # Need to define the computation graph for the cost func:
    cost_graph = ComputationGraph([cost])

    # This returns a list of weight vectors for each layer
    W = VariableFilter(roles=[WEIGHT])(cost_graph.variables)

    # Add some regularization to this model:
    cost += weight_decay * l2_norm(W)
    cost.name = 'entropy'

    # computational graph with l2 reg
    cost_graph = ComputationGraph([cost])

    # Apply dropout to inputs:
    inputs = VariableFilter([INPUT])(cost_graph.variables)
    dropout_inputs = [input for input in inputs if input.name.startswith('linear_')]
    dropout_graph = apply_dropout(cost_graph, [dropout_inputs[0]], 0.2)
    dropout_graph = apply_dropout(dropout_graph, dropout_inputs[1:], dropout_ratio)
    dropout_cost = dropout_graph.outputs[0]
    dropout_cost.name = 'dropout_entropy'

    # If no fine-tuning of l-r models is wanted, find the params for only
    # the joint layers:
    if fine_tune:
        params_to_update = dropout_graph.parameters
    else:
        params_to_update = VariableFilter([PARAMETER], bricks=output_mlp.children)(cost_graph)

    # Learning Algorithm:
    algo = GradientDescent(
        step_rule=solver_type,
        params=params_to_update,
        cost=dropout_cost)

    # algo.step_rule.learning_rate.name = 'learning_rate'

    # Data stream used for training model:
    training_stream = Flatten(
        DataStream.default_stream(
            dataset=train,
            iteration_scheme=ShuffledScheme(
                train.num_examples,
                batch_size=train_batch)))

    training_monitor = TrainingDataMonitoring([dropout_cost,
                                               aggregation.mean(error),
                                               aggregation.mean(algo.total_gradient_norm)],
                                              after_batch=True)


    # Use the 'valid' set for validation during training:
    validation_stream = Flatten(
        DataStream.default_stream(
            dataset=valid,
            iteration_scheme=ShuffledScheme(
                valid.num_examples,
                batch_size=valid_batch)))

    validation_monitor = DataStreamMonitoring(
        variables=[cost, error],
        data_stream=validation_stream,
        prefix='validation',
        after_epoch=True)

    test_stream = Flatten(
        DataStream.default_stream(
            dataset=test,
            iteration_scheme=ShuffledScheme(
                test.num_examples,
                batch_size=test_batch)))

    test_monitor = DataStreamMonitoring(
        variables=[error],
        data_stream=test_stream,
        prefix='test',
        after_training=True)

    plotting = Plot('AdniNet_LeftRight',
                    channels=[
                        ['dropout_entropy'],
                        ['error', 'validation_error'],
                    ],
    )

    # Checkpoint class used to save model and log:
    stamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d-%H:%M')
    checkpoint = Checkpoint('./models/{}'.format(stamp),
                            save_separately=['model', 'log'],
                            every_n_epochs=1)

    # The main loop will train the network and output reports, etc
    main_loop = MainLoop(
        data_stream=training_stream,
        model=model,
        algorithm=algo,
        extensions=[
            validation_monitor,
            training_monitor,
            plotting,
            FinishAfter(after_n_epochs=max_epoch),
            FinishIfNoImprovementAfter(notification_name='validation_error', epochs=1),
            Printing(),
            ProgressBar(),
            checkpoint,
            test_monitor,
        ])
    main_loop.run()
    ve = float(main_loop.log.last_epoch_row['validation_error'])
    te = float(main_loop.log.last_epoch_row['error'])
    spearmint_loss = ve + abs(te - ve)
    print 'Spearmint Loss: {}'.format(spearmint_loss)
    return spearmint_loss
Esempio n. 29
0
def main(job_id, params):
    config = ConfigParser.ConfigParser()
    config.readfp(open('./params'))
    max_epoch = int(config.get('hyperparams', 'max_iter', 100))
    base_lr = float(config.get('hyperparams', 'base_lr', 0.01))
    train_batch = int(config.get('hyperparams', 'train_batch', 256))
    valid_batch = int(config.get('hyperparams', 'valid_batch', 512))
    test_batch = int(config.get('hyperparams', 'valid_batch', 512))

    W_sd = float(config.get('hyperparams', 'W_sd', 0.01))
    W_mu = float(config.get('hyperparams', 'W_mu', 0.0))
    b_sd = float(config.get('hyperparams', 'b_sd', 0.01))
    b_mu = float(config.get('hyperparams', 'b_mu', 0.0))

    hidden_units = int(config.get('hyperparams', 'hidden_units', 32))
    input_dropout_ratio = float(
        config.get('hyperparams', 'input_dropout_ratio', 0.2))
    dropout_ratio = float(config.get('hyperparams', 'dropout_ratio', 0.2))
    weight_decay = float(config.get('hyperparams', 'weight_decay', 0.001))
    max_norm = float(config.get('hyperparams', 'max_norm', 100.0))
    solver = config.get('hyperparams', 'solver_type', 'rmsprop')
    data_file = config.get('hyperparams', 'data_file')
    side = config.get('hyperparams', 'side', 'b')

    # Spearmint optimization parameters:
    if params:
        base_lr = float(params['base_lr'][0])
        dropout_ratio = float(params['dropout_ratio'][0])
        hidden_units = params['hidden_units'][0]
        weight_decay = params['weight_decay'][0]

    if 'adagrad' in solver:
        solver_type = CompositeRule([
            AdaGrad(learning_rate=base_lr),
            VariableClipping(threshold=max_norm)
        ])
    else:
        solver_type = CompositeRule([
            RMSProp(learning_rate=base_lr),
            VariableClipping(threshold=max_norm)
        ])

    input_dim = {'l': 11427, 'r': 10519, 'b': 10519 + 11427}
    data_file = config.get('hyperparams', 'data_file')

    if 'b' in side:
        train = H5PYDataset(data_file, which_set='train')
        valid = H5PYDataset(data_file, which_set='valid')
        test = H5PYDataset(data_file, which_set='test')
        x_l = tensor.matrix('l_features')
        x_r = tensor.matrix('r_features')
        x = tensor.concatenate([x_l, x_r], axis=1)

    else:
        train = H5PYDataset(data_file,
                            which_set='train',
                            sources=['{}_features'.format(side), 'targets'])
        valid = H5PYDataset(data_file,
                            which_set='valid',
                            sources=['{}_features'.format(side), 'targets'])
        test = H5PYDataset(data_file,
                           which_set='test',
                           sources=['{}_features'.format(side), 'targets'])
        x = tensor.matrix('{}_features'.format(side))

    y = tensor.lmatrix('targets')

    # Define a feed-forward net with an input, two hidden layers, and a softmax output:
    model = MLP(activations=[
        Rectifier(name='h1'),
        Rectifier(name='h2'),
        Softmax(name='output'),
    ],
                dims=[input_dim[side], hidden_units, hidden_units, 2],
                weights_init=IsotropicGaussian(std=W_sd, mean=W_mu),
                biases_init=IsotropicGaussian(b_sd, b_mu))

    # Don't forget to initialize params:
    model.initialize()

    # y_hat is the output of the neural net with x as its inputs
    y_hat = model.apply(x)

    # Define a cost function to optimize, and a classification error rate.
    # Also apply the outputs from the net and corresponding targets:
    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = 'error'

    # This is the model: before applying dropout
    model = Model(cost)

    # Need to define the computation graph for the cost func:
    cost_graph = ComputationGraph([cost])

    # This returns a list of weight vectors for each layer
    W = VariableFilter(roles=[WEIGHT])(cost_graph.variables)

    # Add some regularization to this model:
    cost += weight_decay * l2_norm(W)
    cost.name = 'entropy'

    # computational graph with l2 reg
    cost_graph = ComputationGraph([cost])

    # Apply dropout to inputs:
    inputs = VariableFilter([INPUT])(cost_graph.variables)
    dropout_inputs = [
        input for input in inputs if input.name.startswith('linear_')
    ]
    dropout_graph = apply_dropout(cost_graph, [dropout_inputs[0]],
                                  input_dropout_ratio)
    dropout_graph = apply_dropout(dropout_graph, dropout_inputs[1:],
                                  dropout_ratio)
    dropout_cost = dropout_graph.outputs[0]
    dropout_cost.name = 'dropout_entropy'

    # Learning Algorithm (notice: we use the dropout cost for learning):
    algo = GradientDescent(step_rule=solver_type,
                           params=dropout_graph.parameters,
                           cost=dropout_cost)

    # algo.step_rule.learning_rate.name = 'learning_rate'

    # Data stream used for training model:
    training_stream = Flatten(
        DataStream.default_stream(dataset=train,
                                  iteration_scheme=ShuffledScheme(
                                      train.num_examples,
                                      batch_size=train_batch)))

    training_monitor = TrainingDataMonitoring([
        dropout_cost,
        aggregation.mean(error),
        aggregation.mean(algo.total_gradient_norm)
    ],
                                              after_batch=True)

    # Use the 'valid' set for validation during training:
    validation_stream = Flatten(
        DataStream.default_stream(dataset=valid,
                                  iteration_scheme=ShuffledScheme(
                                      valid.num_examples,
                                      batch_size=valid_batch)))

    validation_monitor = DataStreamMonitoring(variables=[cost, error],
                                              data_stream=validation_stream,
                                              prefix='validation',
                                              after_epoch=True)

    test_stream = Flatten(
        DataStream.default_stream(
            dataset=test,
            iteration_scheme=ShuffledScheme(test.num_examples,
                                            batch_size=test_batch)))

    test_monitor = DataStreamMonitoring(variables=[error],
                                        data_stream=test_stream,
                                        prefix='test',
                                        after_training=True)

    plotting = Plot('AdniNet_{}'.format(side),
                    channels=[
                        ['dropout_entropy', 'validation_entropy'],
                        ['error', 'validation_error'],
                    ],
                    after_batch=False)

    # Checkpoint class used to save model and log:
    stamp = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y-%m-%d-%H:%M')
    checkpoint = Checkpoint('./models/{}net/{}'.format(side, stamp),
                            save_separately=['model', 'log'],
                            every_n_epochs=1)

    # Home-brewed class for early stopping when we detect we have started to overfit
    early_stopper = FinishIfOverfitting(error_name='error',
                                        validation_name='validation_error',
                                        threshold=0.1,
                                        epochs=5,
                                        burn_in=100)

    # The main loop will train the network and output reports, etc
    main_loop = MainLoop(data_stream=training_stream,
                         model=model,
                         algorithm=algo,
                         extensions=[
                             validation_monitor,
                             training_monitor,
                             plotting,
                             FinishAfter(after_n_epochs=max_epoch),
                             early_stopper,
                             Printing(),
                             ProgressBar(),
                             checkpoint,
                             test_monitor,
                         ])
    main_loop.run()

    ve = float(main_loop.log.last_epoch_row['validation_error'])
    te = float(main_loop.log.last_epoch_row['error'])
    spearmint_loss = ve + abs(te - ve)
    print 'Spearmint Loss: {}'.format(spearmint_loss)
    return spearmint_loss
Esempio n. 30
0
def main(job_id, params):

    config = ConfigParser.ConfigParser()
    config.readfp(open('./params'))

    max_epoch = int(config.get('hyperparams', 'max_iter', 100))
    base_lr = float(config.get('hyperparams', 'base_lr', 0.01))
    train_batch = int(config.get('hyperparams', 'train_batch', 256))
    valid_batch = int(config.get('hyperparams', 'valid_batch', 512))
    test_batch = int(config.get('hyperparams', 'valid_batch', 512))
    hidden_units = int(config.get('hyperparams', 'hidden_units', 16))

    W_sd = float(config.get('hyperparams', 'W_sd', 0.01))
    W_mu = float(config.get('hyperparams', 'W_mu', 0.0))
    b_sd = float(config.get('hyperparams', 'b_sd', 0.01))
    b_mu = float(config.get('hyperparams', 'b_mu', 0.0))

    dropout_ratio = float(config.get('hyperparams', 'dropout_ratio', 0.2))
    weight_decay = float(config.get('hyperparams', 'weight_decay', 0.001))
    max_norm = float(config.get('hyperparams', 'max_norm', 100.0))
    solver = config.get('hyperparams', 'solver_type', 'rmsprop')
    data_file = config.get('hyperparams', 'data_file')
    fine_tune = config.getboolean('hyperparams', 'fine_tune')

    # Spearmint optimization parameters:
    if params:
        base_lr = float(params['base_lr'][0])
        dropout_ratio = float(params['dropout_ratio'][0])
        hidden_units = params['hidden_units'][0]
        weight_decay = params['weight_decay'][0]

    if 'adagrad' in solver:
        solver_type = CompositeRule([
            AdaGrad(learning_rate=base_lr),
            VariableClipping(threshold=max_norm)
        ])
    else:
        solver_type = CompositeRule([
            RMSProp(learning_rate=base_lr),
            VariableClipping(threshold=max_norm)
        ])

    rn_file = '/projects/francisco/repositories/NI-ML/models/deepnets/blocks/ff/models/rnet/2015-06-25-18:13'
    ln_file = '/projects/francisco/repositories/NI-ML/models/deepnets/blocks/ff/models/lnet/2015-06-29-11:45'

    right_dim = 10519
    left_dim = 11427

    train = H5PYDataset(data_file, which_set='train')
    valid = H5PYDataset(data_file, which_set='valid')
    test = H5PYDataset(data_file, which_set='test')

    l_x = tensor.matrix('l_features')
    r_x = tensor.matrix('r_features')
    y = tensor.lmatrix('targets')

    lnet = load(ln_file).model.get_top_bricks()[0]
    rnet = load(rn_file).model.get_top_bricks()[0]

    # Pre-trained layers:

    # Inputs -> hidden_1 -> hidden 2
    for side, net in zip(['l', 'r'], [lnet, rnet]):
        for child in net.children:
            child.name = side + '_' + child.name

    ll1 = lnet.children[0]
    lr1 = lnet.children[1]
    ll2 = lnet.children[2]
    lr2 = lnet.children[3]

    rl1 = rnet.children[0]
    rr1 = rnet.children[1]
    rl2 = rnet.children[2]
    rr2 = rnet.children[3]

    l_h = lr2.apply(ll2.apply(lr1.apply(ll1.apply(l_x))))
    r_h = rr2.apply(rl2.apply(rr1.apply(rl1.apply(r_x))))

    input_dim = ll2.output_dim + rl2.output_dim

    # hidden_2 -> hidden_3 -> hidden_4 -> Logistic output
    output_mlp = MLP(activations=[
        Rectifier(name='h3'),
        Rectifier(name='h4'),
        Softmax(name='output'),
    ],
                     dims=[
                         input_dim,
                         hidden_units,
                         hidden_units,
                         2,
                     ],
                     weights_init=IsotropicGaussian(std=W_sd, mean=W_mu),
                     biases_init=IsotropicGaussian(std=W_sd, mean=W_mu))

    output_mlp.initialize()

    # # Concatenate the inputs from the two hidden subnets into a single variable
    # # for input into the next layer.
    merge = tensor.concatenate([l_h, r_h], axis=1)
    #
    y_hat = output_mlp.apply(merge)

    # Define a cost function to optimize, and a classification error rate.
    # Also apply the outputs from the net and corresponding targets:
    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = 'error'

    # This is the model: before applying dropout
    model = Model(cost)

    # Need to define the computation graph for the cost func:
    cost_graph = ComputationGraph([cost])

    # This returns a list of weight vectors for each layer
    W = VariableFilter(roles=[WEIGHT])(cost_graph.variables)

    # Add some regularization to this model:
    cost += weight_decay * l2_norm(W)
    cost.name = 'entropy'

    # computational graph with l2 reg
    cost_graph = ComputationGraph([cost])

    # Apply dropout to inputs:
    inputs = VariableFilter([INPUT])(cost_graph.variables)
    dropout_inputs = [
        input for input in inputs if input.name.startswith('linear_')
    ]
    dropout_graph = apply_dropout(cost_graph, [dropout_inputs[0]], 0.2)
    dropout_graph = apply_dropout(dropout_graph, dropout_inputs[1:],
                                  dropout_ratio)
    dropout_cost = dropout_graph.outputs[0]
    dropout_cost.name = 'dropout_entropy'

    # If no fine-tuning of l-r models is wanted, find the params for only
    # the joint layers:
    if fine_tune:
        params_to_update = dropout_graph.parameters
    else:
        params_to_update = VariableFilter(
            [PARAMETER], bricks=output_mlp.children)(cost_graph)

    # Learning Algorithm:
    algo = GradientDescent(step_rule=solver_type,
                           params=params_to_update,
                           cost=dropout_cost)

    # algo.step_rule.learning_rate.name = 'learning_rate'

    # Data stream used for training model:
    training_stream = Flatten(
        DataStream.default_stream(dataset=train,
                                  iteration_scheme=ShuffledScheme(
                                      train.num_examples,
                                      batch_size=train_batch)))

    training_monitor = TrainingDataMonitoring([
        dropout_cost,
        aggregation.mean(error),
        aggregation.mean(algo.total_gradient_norm)
    ],
                                              after_batch=True)

    # Use the 'valid' set for validation during training:
    validation_stream = Flatten(
        DataStream.default_stream(dataset=valid,
                                  iteration_scheme=ShuffledScheme(
                                      valid.num_examples,
                                      batch_size=valid_batch)))

    validation_monitor = DataStreamMonitoring(variables=[cost, error],
                                              data_stream=validation_stream,
                                              prefix='validation',
                                              after_epoch=True)

    test_stream = Flatten(
        DataStream.default_stream(
            dataset=test,
            iteration_scheme=ShuffledScheme(test.num_examples,
                                            batch_size=test_batch)))

    test_monitor = DataStreamMonitoring(variables=[error],
                                        data_stream=test_stream,
                                        prefix='test',
                                        after_training=True)

    plotting = Plot(
        'AdniNet_LeftRight',
        channels=[
            ['dropout_entropy'],
            ['error', 'validation_error'],
        ],
    )

    # Checkpoint class used to save model and log:
    stamp = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y-%m-%d-%H:%M')
    checkpoint = Checkpoint('./models/{}'.format(stamp),
                            save_separately=['model', 'log'],
                            every_n_epochs=1)

    # The main loop will train the network and output reports, etc
    main_loop = MainLoop(data_stream=training_stream,
                         model=model,
                         algorithm=algo,
                         extensions=[
                             validation_monitor,
                             training_monitor,
                             plotting,
                             FinishAfter(after_n_epochs=max_epoch),
                             FinishIfNoImprovementAfter(
                                 notification_name='validation_error',
                                 epochs=1),
                             Printing(),
                             ProgressBar(),
                             checkpoint,
                             test_monitor,
                         ])
    main_loop.run()
    ve = float(main_loop.log.last_epoch_row['validation_error'])
    te = float(main_loop.log.last_epoch_row['error'])
    spearmint_loss = ve + abs(te - ve)
    print 'Spearmint Loss: {}'.format(spearmint_loss)
    return spearmint_loss
Esempio n. 31
0
    def training(self, fea2obj, batch_size, learning_rate=0.005, steprule='adagrad', wait_epochs=5, kl_weight_init=None, klw_ep=50, klw_inc_rate=0, num_epochs=None):
        networkfile = self._config['net']

        n_epochs = num_epochs or int(self._config['nepochs'])
        reg_weight=float(self._config['loss_weight'])
        reg_type=self._config['loss_reg']
        numtrain = int(self._config['num_train']) if 'num_train' in self._config else None
        train_stream, num_samples_train = get_comb_stream(fea2obj, 'train', batch_size, shuffle=True, num_examples=numtrain)
        dev_stream, num_samples_dev = get_comb_stream(fea2obj, 'dev', batch_size=None, shuffle=False)
        logger.info('sources: %s -- number of train/dev samples: %d/%d', train_stream.sources, num_samples_train, num_samples_dev)

        t2idx = fea2obj['targets'].t2idx
        klw_init = kl_weight_init or float(self._config['kld_weight']) if 'kld_weight' in self._config else 1
        logger.info('kl_weight_init: %d', klw_init)
        kl_weight = shared_floatx(klw_init, 'kl_weight')
        entropy_weight = shared_floatx(1., 'entropy_weight')

        cost, p_at_1, _, KLD, logpy_xz, pat1_recog, misclassify_rate= build_model_new(fea2obj, len(t2idx), self._config, kl_weight, entropy_weight)

        cg = ComputationGraph(cost)

        weights = VariableFilter(roles=[WEIGHT])(cg.parameters)
        logger.info('Model weights are: %s', weights)
        if 'L2' in reg_type:
            cost += reg_weight * l2_norm(weights)
            logger.info('applying %s with weight: %f ', reg_type, reg_weight)

        dropout = -0.1
        if dropout > 0:
            cg = apply_dropout(cg, weights, dropout)
            cost = cg.outputs[0]

        cost.name = 'cost'
        logger.info('Our Algorithm is : %s, and learning_rate: %f', steprule, learning_rate)
        if 'adagrad' in steprule:
            cnf_step_rule = AdaGrad(learning_rate)
        elif 'adadelta' in steprule:
            cnf_step_rule = AdaDelta(decay_rate=0.95)
        elif 'decay' in steprule:
            cnf_step_rule = RMSProp(learning_rate=learning_rate, decay_rate=0.90)
            cnf_step_rule = CompositeRule([cnf_step_rule, StepClipping(1)])
        elif 'momentum' in steprule:
            cnf_step_rule = Momentum(learning_rate=learning_rate, momentum=0.9)
        elif 'adam' in steprule:
            cnf_step_rule = Adam(learning_rate=learning_rate)
        else:
            logger.info('The steprule param is wrong! which is: %s', steprule)

        algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=cnf_step_rule, on_unused_sources='warn')
        #algorithm.add_updates(updates)
        gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
        step_norm = aggregation.mean(algorithm.total_step_norm)
        monitored_vars = [cost, gradient_norm, step_norm, p_at_1, KLD, logpy_xz, kl_weight, pat1_recog]
        train_monitor = TrainingDataMonitoring(variables=monitored_vars, after_batch=True,
                                               before_first_epoch=True, prefix='tra')

        dev_monitor = DataStreamMonitoring(variables=[cost, p_at_1, KLD, logpy_xz, pat1_recog, misclassify_rate], after_epoch=True,
                                           before_first_epoch=True, data_stream=dev_stream, prefix="dev")

        extensions = [dev_monitor, train_monitor, Timing(),
                      TrackTheBest('dev_cost'),
                      FinishIfNoImprovementAfter('dev_cost_best_so_far', epochs=wait_epochs),
                      Printing(after_batch=False), #, ProgressBar()
                      FinishAfter(after_n_epochs=n_epochs),
                      saveload.Load(networkfile+'.toload.pkl'),
                      ] + track_best('dev_cost', networkfile+ '.best.pkl')

        #extensions.append(SharedVariableModifier(kl_weight,
        #                                          lambda n, klw: numpy.cast[theano.config.floatX] (klw_inc_rate + klw), after_epoch=False, every_n_epochs=klw_ep, after_batch=False))
#         extensions.append(SharedVariableModifier(entropy_weight,
#                                                   lambda n, crw: numpy.cast[theano.config.floatX](crw - klw_inc_rate), after_epoch=False, every_n_epochs=klw_ep, after_batch=False))

        logger.info('number of parameters in the model: %d', tensor.sum([p.size for p in cg.parameters]).eval())
        logger.info('Lookup table sizes: %s', [p.size.eval() for p in cg.parameters if 'lt' in p.name])

        main_loop = MainLoop(data_stream=train_stream, algorithm=algorithm,
                             model=Model(cost), extensions=extensions)
        main_loop.run()
Esempio n. 32
0
def train(config, save_path, bokeh_name,
          params, bokeh_server, test_tag, use_load_ext,
          load_log, fast_start, validation_epochs, validation_batches,
          per_epochs, per_batches):
    root_path, extension = os.path.splitext(save_path)

    data = Data(**config['data'])

    # Build the main brick and initialize all parameters.
    recognizer = SpeechRecognizer(
        data.recordings_source, data.labels_source,
        data.eos_label,
        data.num_features, data.num_labels,
        name="recognizer",
        data_prepend_eos=data.prepend_eos,
        character_map=data.character_map,
        **config["net"])
    for brick_path, attribute_dict in sorted(
            config['initialization'].items(),
            key=lambda (k, v): -k.count('/')):
        for attribute, value in attribute_dict.items():
            brick, = Selector(recognizer).select(brick_path).bricks
            setattr(brick, attribute, value)
            brick.push_initialization_config()
    recognizer.initialize()

    # Separate attention_params to be handled differently
    # when regularization is applied
    attention = recognizer.generator.transition.attention
    attention_params = Selector(attention).get_parameters().values()

    logger.info(
        "Initialization schemes for all bricks.\n"
        "Works well only in my branch with __repr__ added to all them,\n"
        "there is an issue #463 in Blocks to do that properly.")

    def show_init_scheme(cur):
        result = dict()
        for attr in dir(cur):
            if attr.endswith('_init'):
                result[attr] = getattr(cur, attr)
        for child in cur.children:
            result[child.name] = show_init_scheme(child)
        return result
    logger.info(pprint.pformat(show_init_scheme(recognizer)))

    if params:
        logger.info("Load parameters from " + params)
        recognizer.load_params(params)

    if test_tag:
        tensor.TensorVariable.__str__ = tensor.TensorVariable.__repr__
        __stream = data.get_stream("train")
        __data = next(__stream.get_epoch_iterator(as_dict=True))
        recognizer.recordings.tag.test_value = __data[data.recordings_source]
        recognizer.recordings_mask.tag.test_value = __data[data.recordings_source + '_mask']
        recognizer.labels.tag.test_value = __data[data.labels_source]
        recognizer.labels_mask.tag.test_value = __data[data.labels_source + '_mask']
        theano.config.compute_test_value = 'warn'

    batch_cost = recognizer.get_cost_graph().sum()
    batch_size = named_copy(recognizer.recordings.shape[1], "batch_size")
    # Assumes constant batch size. `aggregation.mean` is not used because
    # of Blocks #514.
    cost = batch_cost / batch_size
    cost.name = "sequence_log_likelihood"
    logger.info("Cost graph is built")

    # Fetch variables useful for debugging.
    # It is important not to use any aggregation schemes here,
    # as it's currently impossible to spread the effect of
    # regularization on their variables, see Blocks #514.
    cost_cg = ComputationGraph(cost)
    r = recognizer
    energies, = VariableFilter(
        applications=[r.generator.readout.readout], name="output_0")(
                cost_cg)
    bottom_output, = VariableFilter(
        applications=[r.bottom.apply], name="output")(
                cost_cg)
    attended, = VariableFilter(
        applications=[r.generator.transition.apply], name="attended")(
                cost_cg)
    attended_mask, = VariableFilter(
        applications=[r.generator.transition.apply], name="attended_mask")(
                cost_cg)
    weights, = VariableFilter(
        applications=[r.generator.evaluate], name="weights")(
                cost_cg)
    max_recording_length = named_copy(r.recordings.shape[0],
                                      "max_recording_length")
    # To exclude subsampling related bugs
    max_attended_mask_length = named_copy(attended_mask.shape[0],
                                          "max_attended_mask_length")
    max_attended_length = named_copy(attended.shape[0],
                                     "max_attended_length")
    max_num_phonemes = named_copy(r.labels.shape[0],
                                  "max_num_phonemes")
    min_energy = named_copy(energies.min(), "min_energy")
    max_energy = named_copy(energies.max(), "max_energy")
    mean_attended = named_copy(abs(attended).mean(),
                               "mean_attended")
    mean_bottom_output = named_copy(abs(bottom_output).mean(),
                                    "mean_bottom_output")
    weights_penalty = named_copy(monotonicity_penalty(weights, r.labels_mask),
                                 "weights_penalty")
    weights_entropy = named_copy(entropy(weights, r.labels_mask),
                                 "weights_entropy")
    mask_density = named_copy(r.labels_mask.mean(),
                              "mask_density")
    cg = ComputationGraph([
        cost, weights_penalty, weights_entropy,
        min_energy, max_energy,
        mean_attended, mean_bottom_output,
        batch_size, max_num_phonemes,
        mask_density])

    # Regularization. It is applied explicitly to all variables
    # of interest, it could not be applied to the cost only as it
    # would not have effect on auxiliary variables, see Blocks #514.
    reg_config = config['regularization']
    regularized_cg = cg
    if reg_config.get('dropout'):
        logger.info('apply dropout')
        regularized_cg = apply_dropout(cg, [bottom_output], 0.5)
    if reg_config.get('noise'):
        logger.info('apply noise')
        noise_subjects = [p for p in cg.parameters if p not in attention_params]
        regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise'])
    regularized_cost = regularized_cg.outputs[0]
    regularized_weights_penalty = regularized_cg.outputs[1]

    # Model is weird class, we spend lots of time arguing with Bart
    # what it should be. However it can already nice things, e.g.
    # one extract all the parameters from the computation graphs
    # and give them hierahical names. This help to notice when a
    # because of some bug a parameter is not in the computation
    # graph.
    model = SpeechModel(regularized_cost)
    params = model.get_parameter_dict()
    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, params[key].get_value().shape) for key
                        in sorted(params.keys())],
                    width=120))

    # Define the training algorithm.
    train_conf = config['training']
    clipping = StepClipping(train_conf['gradient_threshold'])
    clipping.threshold.name = "gradient_norm_threshold"
    rule_names = train_conf.get('rules', ['momentum'])
    core_rules = []
    if 'momentum' in rule_names:
        logger.info("Using scaling and momentum for training")
        core_rules.append(Momentum(train_conf['scale'], train_conf['momentum']))
    if 'adadelta' in rule_names:
        logger.info("Using AdaDelta for training")
        core_rules.append(AdaDelta(train_conf['decay_rate'], train_conf['epsilon']))
    max_norm_rules = []
    if reg_config.get('max_norm', False):
        logger.info("Apply MaxNorm")
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters)
        if reg_config.get('max_norm_exclude_lookup', False):
            maxnorm_subjects = [v for v in maxnorm_subjects
                                if not isinstance(get_brick(v), LookupTable)]
        logger.info("Parameters covered by MaxNorm:\n"
                    + pprint.pformat([name for name, p in params.items()
                                        if p in maxnorm_subjects]))
        logger.info("Parameters NOT covered by MaxNorm:\n"
                    + pprint.pformat([name for name, p in params.items()
                                        if not p in maxnorm_subjects]))
        max_norm_rules = [
            Restrict(VariableClipping(reg_config['max_norm'], axis=0),
                        maxnorm_subjects)]
    algorithm = GradientDescent(
        cost=regularized_cost +
            reg_config.get("penalty_coof", .0) * regularized_weights_penalty / batch_size +
            reg_config.get("decay", .0) *
            l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2,
        parameters=params.values(),
        step_rule=CompositeRule(
            [clipping] + core_rules + max_norm_rules +
            # Parameters are not changed at all
            # when nans are encountered.
            [RemoveNotFinite(0.0)]))

    # More variables for debugging: some of them can be added only
    # after the `algorithm` object is created.
    observables = regularized_cg.outputs
    observables += [
        algorithm.total_step_norm, algorithm.total_gradient_norm,
        clipping.threshold]
    for name, param in params.items():
        num_elements = numpy.product(param.get_value().shape)
        norm = param.norm(2) / num_elements ** 0.5
        grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5
        step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5
        stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm)
        stats.name = name + '_stats'
        observables.append(stats)

    def attach_aggregation_schemes(variables):
        # Aggregation specification has to be factored out as a separate
        # function as it has to be applied at the very last stage
        # separately to training and validation observables.
        result = []
        for var in variables:
            if var.name == 'weights_penalty':
                result.append(named_copy(aggregation.mean(var, batch_size),
                                            'weights_penalty_per_recording'))
            elif var.name == 'weights_entropy':
                result.append(named_copy(aggregation.mean(
                    var, recognizer.labels_mask.sum()), 'weights_entropy_per_label'))
            else:
                result.append(var)
        return result

    # Build main loop.
    logger.info("Initialize extensions")
    extensions = []
    if use_load_ext and params:
        extensions.append(Load(params, load_iteration_state=True, load_log=True))
    if load_log and params:
        extensions.append(LoadLog(params))
    extensions += [
        Timing(after_batch=True),
        CGStatistics(),
        #CodeVersion(['lvsr']),
        ]
    extensions.append(TrainingDataMonitoring(
        [observables[0], algorithm.total_gradient_norm,
            algorithm.total_step_norm, clipping.threshold,
            max_recording_length,
            max_attended_length, max_attended_mask_length], after_batch=True))
    average_monitoring = TrainingDataMonitoring(
        attach_aggregation_schemes(observables),
        prefix="average", every_n_batches=10)
    extensions.append(average_monitoring)
    validation = DataStreamMonitoring(
        attach_aggregation_schemes([cost, weights_entropy, weights_penalty]),
        data.get_stream("valid"), prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=validation_epochs,
            every_n_batches=validation_batches,
            after_training=False)
    extensions.append(validation)
    recognizer.init_beam_search(10)
    per = PhonemeErrorRate(recognizer, data.get_dataset("valid"))
    per_monitoring = DataStreamMonitoring(
        [per], data.get_stream("valid", batches=False, shuffle=False),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=per_epochs,
            every_n_batches=per_batches,
            after_training=False)
    extensions.append(per_monitoring)
    track_the_best_per = TrackTheBest(
        per_monitoring.record_name(per)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    track_the_best_likelihood = TrackTheBest(
        validation.record_name(cost)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    extensions += [track_the_best_likelihood, track_the_best_per]
    extensions.append(AdaptiveClipping(
        algorithm.total_gradient_norm.name,
        clipping, train_conf['gradient_threshold'],
        decay_rate=0.998, burnin_period=500))
    extensions += [
        SwitchOffLengthFilter(data.length_filter,
            after_n_batches=train_conf.get('stop_filtering')),
        FinishAfter(after_n_batches=train_conf['num_batches'],
                    after_n_epochs=train_conf['num_epochs'])
        .add_condition(["after_batch"], _gradient_norm_is_none),
        # Live plotting: requires launching `bokeh-server`
        # and allows to see what happens online.
        Plot(bokeh_name
             if bokeh_name
             else os.path.basename(save_path),
             [# Plot 1: training and validation costs
             [average_monitoring.record_name(regularized_cost),
             validation.record_name(cost)],
             # Plot 2: gradient norm,
             [average_monitoring.record_name(algorithm.total_gradient_norm),
             average_monitoring.record_name(clipping.threshold)],
             # Plot 3: phoneme error rate
             [per_monitoring.record_name(per)],
             # Plot 4: training and validation mean weight entropy
             [average_monitoring._record_name('weights_entropy_per_label'),
             validation._record_name('weights_entropy_per_label')],
             # Plot 5: training and validation monotonicity penalty
             [average_monitoring._record_name('weights_penalty_per_recording'),
             validation._record_name('weights_penalty_per_recording')]],
             every_n_batches=10,
             server_url=bokeh_server),
        Checkpoint(save_path,
                   before_first_epoch=not fast_start, after_epoch=True,
                   every_n_batches=train_conf.get('save_every_n_batches'),
                   save_separately=["model", "log"],
                   use_cpickle=True)
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_per.notification_name),
            (root_path + "_best" + extension,))
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_likelihood.notification_name),
            (root_path + "_best_ll" + extension,)),
        ProgressBar(),
        Printing(every_n_batches=1,
                    attribute_filter=PrintingFilterList()
                    )]

    # Save the config into the status
    log = TrainingLog()
    log.status['_config'] = repr(config)
    main_loop = MainLoop(
        model=model, log=log, algorithm=algorithm,
        data_stream=data.get_stream("train"),
        extensions=extensions)
    main_loop.run()
Esempio n. 33
0
    def __init__(self, langevin_itr, eta=0.1, alpha=0.75, gamma0=1e-4,
                 hard_limit_on_scopping=True, scoping=1.001, epsilon=1e-4,
                 cost=None, parameters=None, step_rule=None,
                 gradients=None, known_grads=None, consider_constant=None,
                 **kwargs):
        self.eta = np.float32(eta)
        self.alpha = np.float32(alpha)
        self.gamma = theano.shared(np.float32(gamma0), name='ESGD_gamma')
        self.scoping = np.float32(scoping)
        self.hard_limit_on_scopping = hard_limit_on_scopping
        self.epsilon = np.float32(epsilon)
        self.langevin_itr = langevin_itr
        self.langevin_step = 1

        # Set initial values for cost, parameters, gradients.
        self.cost = cost
        self.parameters = parameters
        # Coerce lists of tuples to OrderedDict. Do not coerce Mappings,
        # as we don't want to convert dict -> OrderedDict and give it
        # an arbitrary, non-deterministic order.
        if gradients is not None and not isinstance(gradients, Mapping):
            gradients = OrderedDict(gradients)
        self.gradients = gradients

        # If we don't have gradients, we'll need to infer them from the
        # cost and the parameters, both of which must not be None.
        if not self.gradients:
            self.gradients = self._compute_gradients(known_grads,
                                                     consider_constant)
        else:
            if cost is not None:
                logger.warning(('{}: gradients already specified directly; '
                                'cost is unused.'
                                .format(self.__class__.__name__)))
            if self.parameters is None and isinstance(gradients, OrderedDict):
                # If the dictionary is ordered, it's safe to use the keys
                # as they have a deterministic order.
                self.parameters = list(self.gradients.keys())
            elif self.parameters is not None:
                # If parameters and gradients.keys() don't match we can
                # try to recover if gradients is ordered.
                if set(self.parameters) != set(self.gradients.keys()):
                    logger.warn("Specified parameters list does not match "
                                "keys in provided gradient dictionary; "
                                "using parameters inferred from gradients")
                    if not isinstance(self.gradients, OrderedDict):
                        raise ValueError(determinism_error)
                    self.parameters = list(self.gradients.keys())
            else:
                # self.parameters is not None, and gradients isn't
                # an OrderedDict. We can't do anything safe.
                raise ValueError(determinism_error)
            if known_grads:
                raise ValueError("known_grads has no effect when gradients "
                                 "are passed in")
            if consider_constant is not None:
                raise ValueError("consider_constant has no effect when "
                                 "gradients are passed in")

        # ------------ESGD interception! -----------------
        # recreating a two list of parameters of theano shared
        # they are x_prime and mu in the paper
        true_parameters = []
        mu_parameters = []
        for param in self.parameters:
            new_param = theano.shared(param.get_value(),
                                      name=param.name)
            # same thing but we need a unique object
            mu_param = theano.shared(param.get_value(),
                                     name=param.name)
            true_parameters += [new_param]
            mu_parameters += [mu_param]

        self.true_parameters = true_parameters
        self.mu_parameters = mu_parameters

        new_gradients = OrderedDict()
        #import ipdb; ipdb.set_trace()
        for true_param, param in zip(true_parameters, self.parameters):
            gradient = self.gradients[param]
            new_gradient = gradient - self.gamma * (true_param - param)
            new_gradients.update({param: new_gradient})
        # gradients now contain the ESGD step (line 4 algo 1 of the paper)
        del self.gradients
        self.gradients = new_gradients


        # The order in which the different gradient terms appears
        # here matters, as floating point addition is non-commutative (and
        # Theano's graph optimizations are not order-independent).
        # This is why we do not use .values().
        gradient_values = [self.gradients[p] for p in self.parameters]
        self.total_gradient_norm = (l2_norm(gradient_values)
                                    .copy(name="total_gradient_norm"))

        self.step_rule = step_rule if step_rule else Scale()


        logger.debug("Computing parameter steps...")
        self.steps, self.step_rule_updates = (
            self.step_rule.compute_steps(self.gradients))

        # Same as gradient_values above: the order may influence a
        # bunch of things, so enforce a consistent one (don't use
        # .values()).
        step_values = [self.steps[p] for p in self.parameters]
        self.total_step_norm = (l2_norm(step_values)
                                .copy(name="total_step_norm"))

        # Once again, iterating on gradients may not be deterministically
        # ordered if it is not an OrderedDict. We add the updates here in
        # the order specified in self.parameters. Keep it this way to
        # maintain reproducibility.

        # ---- Another ESGD interception here! -----------------
        randrg = theano.tensor.shared_randomstreams.RandomStreams(seed=1234)
        eps = self.epsilon * randrg.normal(dtype=theano.config.floatX)
        eta_prime = getattr(self.step_rule, 'learning_rate')
        slgd_eta_update = theano.tensor.sqrt(eta_prime) * eps

        kwargs.setdefault('updates', []).extend(
            itertools.chain(((parameter, parameter - self.steps[parameter] + slgd_eta_update)
                             for parameter in self.parameters),
                            self.step_rule_updates)
        )

        mu_updates = [(mu, np.float32(1. - self.alpha) * mu + self.alpha * x_prime) for mu, x_prime \
                      in zip(self.mu_parameters, self.parameters)]
        self.mu_updates = mu_updates

        super(EntropySGD, self).__init__(**kwargs)
Esempio n. 34
0
def initialize_all(config, save_path, bokeh_name, params, bokeh_server, bokeh,
                   test_tag, use_load_ext, load_log, fast_start):
    root_path, extension = os.path.splitext(save_path)

    data = Data(**config['data'])
    train_conf = config['training']
    recognizer = create_model(config, data, test_tag)

    # Separate attention_params to be handled differently
    # when regularization is applied
    attention = recognizer.generator.transition.attention
    attention_params = Selector(attention).get_parameters().values()

    logger.info(
        "Initialization schemes for all bricks.\n"
        "Works well only in my branch with __repr__ added to all them,\n"
        "there is an issue #463 in Blocks to do that properly.")

    def show_init_scheme(cur):
        result = dict()
        for attr in dir(cur):
            if attr.endswith('_init'):
                result[attr] = getattr(cur, attr)
        for child in cur.children:
            result[child.name] = show_init_scheme(child)
        return result

    logger.info(pprint.pformat(show_init_scheme(recognizer)))

    prediction, prediction_mask = add_exploration(recognizer, data, train_conf)

    #
    # Observables:
    #
    primary_observables = []  # monitored each batch
    secondary_observables = []  # monitored every 10 batches
    validation_observables = []  # monitored on the validation set

    cg = recognizer.get_cost_graph(batch=True,
                                   prediction=prediction,
                                   prediction_mask=prediction_mask)
    labels, = VariableFilter(applications=[recognizer.cost], name='labels')(cg)
    labels_mask, = VariableFilter(applications=[recognizer.cost],
                                  name='labels_mask')(cg)

    gain_matrix = VariableFilter(
        theano_name=RewardRegressionEmitter.GAIN_MATRIX)(cg)
    if len(gain_matrix):
        gain_matrix, = gain_matrix
        primary_observables.append(rename(gain_matrix.min(), 'min_gain'))
        primary_observables.append(rename(gain_matrix.max(), 'max_gain'))

    batch_cost = cg.outputs[0].sum()
    batch_size = rename(recognizer.labels.shape[1], "batch_size")
    # Assumes constant batch size. `aggregation.mean` is not used because
    # of Blocks #514.
    cost = batch_cost / batch_size
    cost.name = "sequence_total_cost"
    logger.info("Cost graph is built")

    # Fetch variables useful for debugging.
    # It is important not to use any aggregation schemes here,
    # as it's currently impossible to spread the effect of
    # regularization on their variables, see Blocks #514.
    cost_cg = ComputationGraph(cost)
    r = recognizer
    energies, = VariableFilter(applications=[r.generator.readout.readout],
                               name="output_0")(cost_cg)
    bottom_output = VariableFilter(
        # We need name_regex instead of name because LookupTable calls itsoutput output_0
        applications=[r.bottom.apply],
        name_regex="output")(cost_cg)[-1]
    attended, = VariableFilter(applications=[r.generator.transition.apply],
                               name="attended")(cost_cg)
    attended_mask, = VariableFilter(applications=[
        r.generator.transition.apply
    ],
                                    name="attended_mask")(cost_cg)
    weights, = VariableFilter(applications=[r.generator.evaluate],
                              name="weights")(cost_cg)

    from blocks.roles import AUXILIARY
    l2_cost, = VariableFilter(roles=[AUXILIARY],
                              theano_name='l2_cost_aux')(cost_cg)
    cost_forward, = VariableFilter(roles=[AUXILIARY],
                                   theano_name='costs_forward_aux')(cost_cg)

    max_recording_length = rename(bottom_output.shape[0],
                                  "max_recording_length")
    # To exclude subsampling related bugs
    max_attended_mask_length = rename(attended_mask.shape[0],
                                      "max_attended_mask_length")
    max_attended_length = rename(attended.shape[0], "max_attended_length")
    max_num_phonemes = rename(labels.shape[0], "max_num_phonemes")
    min_energy = rename(energies.min(), "min_energy")
    max_energy = rename(energies.max(), "max_energy")
    mean_attended = rename(abs(attended).mean(), "mean_attended")
    mean_bottom_output = rename(
        abs(bottom_output).mean(), "mean_bottom_output")
    weights_penalty = rename(monotonicity_penalty(weights, labels_mask),
                             "weights_penalty")
    weights_entropy = rename(entropy(weights, labels_mask), "weights_entropy")
    mask_density = rename(labels_mask.mean(), "mask_density")
    cg = ComputationGraph([
        cost, weights_penalty, weights_entropy, min_energy, max_energy,
        mean_attended, mean_bottom_output, batch_size, max_num_phonemes,
        mask_density
    ])
    # Regularization. It is applied explicitly to all variables
    # of interest, it could not be applied to the cost only as it
    # would not have effect on auxiliary variables, see Blocks #514.
    reg_config = config.get('regularization', dict())
    regularized_cg = cg
    if reg_config.get('dropout'):
        logger.info('apply dropout')
        regularized_cg = apply_dropout(cg, [bottom_output], 0.5)
    if reg_config.get('noise'):
        logger.info('apply noise')
        noise_subjects = [
            p for p in cg.parameters if p not in attention_params
        ]
        regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise'])

    train_cost = regularized_cg.outputs[0]
    if reg_config.get("penalty_coof", .0) > 0:
        # big warning!!!
        # here we assume that:
        # regularized_weights_penalty = regularized_cg.outputs[1]
        train_cost = (train_cost + reg_config.get("penalty_coof", .0) *
                      regularized_cg.outputs[1] / batch_size)
    if reg_config.get("decay", .0) > 0:
        train_cost = (
            train_cost + reg_config.get("decay", .0) *
            l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters))**2)

    train_cost = rename(train_cost, 'train_cost')

    gradients = None
    if reg_config.get('adaptive_noise'):
        logger.info('apply adaptive noise')
        if ((reg_config.get("penalty_coof", .0) > 0)
                or (reg_config.get("decay", .0) > 0)):
            logger.error('using  adaptive noise with alignment weight panalty '
                         'or weight decay is probably stupid')
        train_cost, regularized_cg, gradients, noise_brick = apply_adaptive_noise(
            cg,
            cg.outputs[0],
            variables=cg.parameters,
            num_examples=data.get_dataset('train').num_examples,
            parameters=Model(
                regularized_cg.outputs[0]).get_parameter_dict().values(),
            **reg_config.get('adaptive_noise'))
        train_cost.name = 'train_cost'
        adapt_noise_cg = ComputationGraph(train_cost)
        model_prior_mean = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_mean')(adapt_noise_cg)[0],
            'model_prior_mean')
        model_cost = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_cost')(adapt_noise_cg)[0], 'model_cost')
        model_prior_variance = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_variance')(adapt_noise_cg)[0],
            'model_prior_variance')
        regularized_cg = ComputationGraph(
            [train_cost, model_cost] + regularized_cg.outputs +
            [model_prior_mean, model_prior_variance])
        primary_observables += [
            regularized_cg.outputs[1],  # model cost
            regularized_cg.outputs[2],  # task cost
            regularized_cg.outputs[-2],  # model prior mean
            regularized_cg.outputs[-1]
        ]  # model prior variance

    model = Model(train_cost)
    if params:
        logger.info("Load parameters from " + params)
        # please note: we cannot use recognizer.load_params
        # as it builds a new computation graph that dies not have
        # shapred variables added by adaptive weight noise
        with open(params, 'r') as src:
            param_values = load_parameters(src)
        model.set_parameter_values(param_values)

    parameters = model.get_parameter_dict()
    logger.info("Parameters:\n" +
                pprint.pformat([(key, parameters[key].get_value().shape)
                                for key in sorted(parameters.keys())],
                               width=120))

    # Define the training algorithm.
    clipping = StepClipping(train_conf['gradient_threshold'])
    clipping.threshold.name = "gradient_norm_threshold"
    rule_names = train_conf.get('rules', ['momentum'])
    core_rules = []
    if 'momentum' in rule_names:
        logger.info("Using scaling and momentum for training")
        core_rules.append(Momentum(train_conf['scale'],
                                   train_conf['momentum']))
    if 'adadelta' in rule_names:
        logger.info("Using AdaDelta for training")
        core_rules.append(
            AdaDelta(train_conf['decay_rate'], train_conf['epsilon']))
    max_norm_rules = []
    if reg_config.get('max_norm', False) > 0:
        logger.info("Apply MaxNorm")
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters)
        if reg_config.get('max_norm_exclude_lookup', False):
            maxnorm_subjects = [
                v for v in maxnorm_subjects
                if not isinstance(get_brick(v), LookupTable)
            ]
        logger.info("Parameters covered by MaxNorm:\n" + pprint.pformat(
            [name for name, p in parameters.items() if p in maxnorm_subjects]))
        logger.info("Parameters NOT covered by MaxNorm:\n" + pprint.pformat([
            name for name, p in parameters.items() if not p in maxnorm_subjects
        ]))
        max_norm_rules = [
            Restrict(VariableClipping(reg_config['max_norm'], axis=0),
                     maxnorm_subjects)
        ]
    burn_in = []
    if train_conf.get('burn_in_steps', 0):
        burn_in.append(BurnIn(num_steps=train_conf['burn_in_steps']))
    algorithm = GradientDescent(
        cost=train_cost,
        parameters=parameters.values(),
        gradients=gradients,
        step_rule=CompositeRule(
            [clipping] + core_rules + max_norm_rules +
            # Parameters are not changed at all
            # when nans are encountered.
            [RemoveNotFinite(0.0)] + burn_in),
        on_unused_sources='warn')

    logger.debug("Scan Ops in the gradients")
    gradient_cg = ComputationGraph(algorithm.gradients.values())
    for op in ComputationGraph(gradient_cg).scans:
        logger.debug(op)

    # More variables for debugging: some of them can be added only
    # after the `algorithm` object is created.
    secondary_observables += list(regularized_cg.outputs)
    if not 'train_cost' in [v.name for v in secondary_observables]:
        secondary_observables += [train_cost]
    secondary_observables += [
        algorithm.total_step_norm, algorithm.total_gradient_norm,
        clipping.threshold
    ]
    for name, param in parameters.items():
        num_elements = numpy.product(param.get_value().shape)
        norm = param.norm(2) / num_elements**0.5
        grad_norm = algorithm.gradients[param].norm(2) / num_elements**0.5
        step_norm = algorithm.steps[param].norm(2) / num_elements**0.5
        stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm)
        stats.name = name + '_stats'
        secondary_observables.append(stats)

    primary_observables += [
        train_cost, algorithm.total_gradient_norm, algorithm.total_step_norm,
        clipping.threshold, max_recording_length, max_attended_length,
        max_attended_mask_length
    ]

    validation_observables += [
        rename(aggregation.mean(batch_cost, batch_size), cost.name),
        rename(aggregation.sum_(batch_size), 'num_utterances'),
        weights_entropy, weights_penalty
    ]

    def attach_aggregation_schemes(variables):
        # Aggregation specification has to be factored out as a separate
        # function as it has to be applied at the very last stage
        # separately to training and validation observables.
        result = []
        for var in variables:
            if var.name == 'weights_penalty':
                result.append(
                    rename(aggregation.mean(var, batch_size),
                           'weights_penalty_per_recording'))
            elif var.name == 'weights_entropy':
                result.append(
                    rename(aggregation.mean(var, labels_mask.sum()),
                           'weights_entropy_per_label'))
            else:
                result.append(var)
        return result

    mon_conf = config['monitoring']

    # Build main loop.
    logger.info("Initialize extensions")
    extensions = []
    if use_load_ext and params:
        extensions.append(
            Load(params, load_iteration_state=True, load_log=True))
    if load_log and params:
        extensions.append(LoadLog(params))
    extensions += [
        Timing(after_batch=True),
        CGStatistics(),
        #CodeVersion(['lvsr']),
    ]
    extensions.append(
        TrainingDataMonitoring(primary_observables + [l2_cost, cost_forward],
                               after_batch=True))
    average_monitoring = TrainingDataMonitoring(
        attach_aggregation_schemes(secondary_observables),
        prefix="average",
        every_n_batches=10)
    extensions.append(average_monitoring)
    validation = DataStreamMonitoring(
        attach_aggregation_schemes(validation_observables +
                                   [l2_cost, cost_forward]),
        data.get_stream("valid", shuffle=False),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['validate_every_epochs'],
            every_n_batches=mon_conf['validate_every_batches'],
            after_training=False)
    extensions.append(validation)
    per = PhonemeErrorRate(recognizer, data, **config['monitoring']['search'])
    per_monitoring = DataStreamMonitoring(
        [per],
        data.get_stream("valid", batches=False, shuffle=False),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['search_every_epochs'],
            every_n_batches=mon_conf['search_every_batches'],
            after_training=False)
    extensions.append(per_monitoring)
    track_the_best_per = TrackTheBest(
        per_monitoring.record_name(per)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    track_the_best_cost = TrackTheBest(
        validation.record_name(cost)).set_conditions(before_first_epoch=True,
                                                     after_epoch=True)
    extensions += [track_the_best_cost, track_the_best_per]
    extensions.append(
        AdaptiveClipping(algorithm.total_gradient_norm.name,
                         clipping,
                         train_conf['gradient_threshold'],
                         decay_rate=0.998,
                         burnin_period=500))
    extensions += [
        SwitchOffLengthFilter(
            data.length_filter,
            after_n_batches=train_conf.get('stop_filtering')),
        FinishAfter(after_n_batches=train_conf.get('num_batches'),
                    after_n_epochs=train_conf.get('num_epochs')).add_condition(
                        ["after_batch"], _gradient_norm_is_none),
    ]
    channels = [
        # Plot 1: training and validation costs
        [
            average_monitoring.record_name(train_cost),
            validation.record_name(cost)
        ],
        # Plot 2: gradient norm,
        [
            average_monitoring.record_name(algorithm.total_gradient_norm),
            average_monitoring.record_name(clipping.threshold)
        ],
        # Plot 3: phoneme error rate
        [per_monitoring.record_name(per)],
        # Plot 4: training and validation mean weight entropy
        [
            average_monitoring._record_name('weights_entropy_per_label'),
            validation._record_name('weights_entropy_per_label')
        ],
        # Plot 5: training and validation monotonicity penalty
        [
            average_monitoring._record_name('weights_penalty_per_recording'),
            validation._record_name('weights_penalty_per_recording')
        ]
    ]
    if bokeh:
        extensions += [
            Plot(bokeh_name if bokeh_name else os.path.basename(save_path),
                 channels,
                 every_n_batches=10,
                 server_url=bokeh_server),
        ]
    extensions += [
        Checkpoint(save_path,
                   before_first_epoch=not fast_start,
                   after_epoch=True,
                   every_n_batches=train_conf.get('save_every_n_batches'),
                   save_separately=["model", "log"],
                   use_cpickle=True).add_condition(
                       ['after_epoch'],
                       OnLogRecord(track_the_best_per.notification_name),
                       (root_path + "_best" + extension, )).add_condition(
                           ['after_epoch'],
                           OnLogRecord(track_the_best_cost.notification_name),
                           (root_path + "_best_ll" + extension, )),
        ProgressBar()
    ]
    extensions.append(EmbedIPython(use_main_loop_run_caller_env=True))
    if config['net']['criterion']['name'].startswith('mse'):
        extensions.append(
            LogInputsGains(labels, cg, recognizer.generator.readout.emitter,
                           data))

    if train_conf.get('patience'):
        patience_conf = train_conf['patience']
        if not patience_conf.get('notification_names'):
            # setdefault will not work for empty list
            patience_conf['notification_names'] = [
                track_the_best_per.notification_name,
                track_the_best_cost.notification_name
            ]
        extensions.append(Patience(**patience_conf))

    extensions.append(
        Printing(every_n_batches=1, attribute_filter=PrintingFilterList()))

    return model, algorithm, data, extensions
Esempio n. 35
0
def initialize_graph(recognizer, data, config, params):
    # Separate attention_params to be handled differently
    # when regularization is applied
    attentions = recognizer.all_children().generator.transition.attention.get()
    attention_params = [Selector(attention).get_parameters().values()
                        for attention in attentions]

    logger.info(
        "Initialization schemes for all bricks.\n"
        "Works well only in my branch with __repr__ added to all them,\n"
        "there is an issue #463 in Blocks to do that properly.")

    def show_init_scheme(cur):
        result = dict()
        for attr in dir(cur):
            if attr.endswith('_init'):
                result[attr] = getattr(cur, attr)
        for child in cur.children:
            result[child.name] = show_init_scheme(child)
        return result
    logger.info(pprint.pformat(show_init_scheme(recognizer)))

    observables = []  # monitored each batch
    cg = recognizer.get_cost_graph(batch=True)
    labels = []
    labels_mask = []
    for chld in recognizer.children:
        lbls = VariableFilter(applications=[chld.cost], name='labels'+chld.names_postfix)(cg)
        lbls_mask = VariableFilter(applications=[chld.cost], name='labels_mask'+chld.names_postfix)(cg)
        if len(lbls) == 1:
            labels += lbls
            labels_mask += lbls_mask

    batch_cost = cg.outputs[0].sum()
    batch_size = rename(labels[0].shape[1], "batch_size")
    # Assumes constant batch size. `aggregation.mean` is not used because
    # of Blocks #514.
    cost = batch_cost / batch_size

    cost.name = "sequence_total_cost"
    logger.info("Cost graph is built")

    # Fetch variables useful for debugging.
    # It is important not to use any aggregation schemes here,
    # as it's currently impossible to spread the effect of
    # regularization on their variables, see Blocks #514.
    cost_cg = ComputationGraph(cost)
    
    bottom_output = VariableFilter(
        # We need name_regex instead of name because LookupTable calls itsoutput output_0
        applications=recognizer.all_children().bottom.apply.get(), name_regex="output")(
            cost_cg)
    
    attended = VariableFilter(
        applications=recognizer.all_children().generator.transition.apply.get(), name="attended")(
            cost_cg)
    attended_mask = VariableFilter(
        applications=recognizer.all_children().generator.transition.apply.get(), name="attended_mask")(
            cost_cg)
    weights = VariableFilter(
        applications=recognizer.all_children().generator.evaluate.get(), name="weights")(
            cost_cg)
    
    def get_renamed_list(rlist, elem_func, elem_name):
        return [rename(elem_func(elem), elem_name+chld.names_postfix)
                    for elem,chld in zip(rlist, recognizer.children)]
        
    max_sentence_lengths = get_renamed_list(bottom_output,
                                            lambda e: e.shape[0],
                                            "max_sentence_length")
    max_attended_mask_lengths = get_renamed_list(attended_mask,
                                            lambda e: e.shape[0],
                                            "max_attended_mask_length")
    max_attended_lengths = get_renamed_list(attended,
                                            lambda e: e.shape[0],
                                            "max_attended_length")
    max_num_characters = get_renamed_list(labels,
                                            lambda e: e.shape[0],
                                            "max_num_characters")
    
    mean_attended = get_renamed_list(attended,
                                            lambda e: abs(e).mean(),
                                            "mean_attended")
    mean_bottom_output = get_renamed_list(bottom_output,
                                            lambda e: abs(e).mean(),
                                            "mean_bottom_output")
    
    mask_density = get_renamed_list(labels_mask,
                                            lambda e: e.mean(),
                                            "mask_density")
    weights_entropy = [rename(entropy(w, lm),
                             "weights_entropy"+chld.names_postfix)
                       for w, lm, chld in zip(weights, labels_mask, recognizer.children)]

    observables += max_attended_lengths + max_attended_mask_lengths + max_sentence_lengths
    #
    # Monitoring of cost terms is tricky because of Blocks #514 - since the
    # costs are annotations that are not part of the original output graph,
    # they are unaffected by replacements such as dropout!!
    #
    cost_terms = []
    for chld in recognizer.children:
        chld_cost_terms = VariableFilter(applications=[chld.generator.evaluate],
                                name_regex='.*_nll')(cost_cg)
        chld_cost_terms = [rename(var, var.name[:-4] + chld.names_postfix + '_nll')
                           for var in chld_cost_terms]
        cost_terms += chld_cost_terms
        
    cg = ComputationGraph([cost, batch_size] +
        weights_entropy + mean_attended +
        mean_bottom_output + max_num_characters +
        mask_density + cost_terms)

    # Regularization. It is applied explicitly to all variables
    # of interest, it could not be applied to the cost only as it
    # would not have effect on auxiliary variables, see Blocks #514.
    reg_config = config['regularization']
    regularized_cg = cg

    if reg_config.get('dropout'):
        drop_conf = reg_config['dropout']
        bot_drop = drop_conf.get('bottom', 0.0)
        if bot_drop:
            logger.info('apply bottom dropout')
            regularized_cg = apply_dropout(regularized_cg,
                                           bottom_output, bot_drop)
        enc_drop = drop_conf.get('encoder', 0.0)
        if enc_drop:
            logger.info('apply encoder dropout')
            enc_bricks = reduce(lambda acc,x: acc+list(x), recognizer.all_children().encoder.children.get(), [])
            enc_states = VariableFilter(bricks=enc_bricks,
                                        name_regex='states')(regularized_cg)
            regularized_cg = apply_dropout(regularized_cg,
                                           enc_states,
                                           enc_drop)
        post_merge_drop = drop_conf.get('post_merge', 0.0)
        if post_merge_drop:
            logger.info('apply post_merge dropout')
            pm_bricks = []
            for chld in recognizer.children:
                cpm_bricks = list(chld.generator.readout.post_merge.children)
                cpm_bricks += cpm_bricks[-1].children
                cpm_bricks = [b for b in cpm_bricks if
                             isinstance(b, type(chld.post_merge_activation))]
                pm_bricks += cpm_bricks
            regularized_cg = apply_dropout(
                regularized_cg,
                VariableFilter(bricks=pm_bricks, name='output')(regularized_cg),
                post_merge_drop)

    if reg_config.get('noise'):
        logger.info('apply noise')
        noise_subjects = [p for p in cg.parameters if p not in attention_params]
        regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise'])

    train_cost = regularized_cg.outputs[0]

    if reg_config.get("penalty_coof", .0) > 0:
        # big warning!!!
        # here we assume that:
        # regularized_weights_penalty = regularized_cg.outputs[1]
        train_cost = (train_cost +
                      reg_config.get("penalty_coof", .0) *
                      regularized_cg.outputs[1] / batch_size)
        
    if reg_config.get("decay", .0) > 0:
        train_cost = (train_cost + reg_config.get("decay", .0) *
                      l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2)

    train_cost = train_cost.copy(name='train_cost')

    gradients = None
    if reg_config.get('adaptive_noise'):
        logger.info('apply adaptive noise')
        if ((reg_config.get("penalty_coof", .0) > 0) or
                (reg_config.get("decay", .0) > 0)):
            logger.error('using  adaptive noise with alignment weight panalty '
                         'or weight decay is probably stupid')
        train_cost, regularized_cg, gradients, noise_brick = apply_adaptive_noise(
            cg, cg.outputs[0],
            variables=cg.parameters,
            num_examples=data.get_dataset('train').num_examples,
            parameters=SpeechModel(regularized_cg.outputs[0]
                                   ).get_parameter_dict().values(),
            **reg_config.get('adaptive_noise')
        )
        train_cost.name = 'train_cost'
        adapt_noise_cg = ComputationGraph(train_cost)
        model_prior_mean = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_mean')(adapt_noise_cg)[0],
            'model_prior_mean')
        model_cost = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_cost')(adapt_noise_cg)[0],
            'model_cost')
        model_prior_variance = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_variance')(adapt_noise_cg)[0],
            'model_prior_variance')
        regularized_cg = ComputationGraph(
            [train_cost, model_cost] +
            regularized_cg.outputs +
            [model_prior_mean, model_prior_variance])
        observables += [
            regularized_cg.outputs[1],  # model cost
            regularized_cg.outputs[2],  # task cost
            regularized_cg.outputs[-2],  # model prior mean
            regularized_cg.outputs[-1]]  # model prior variance

    if len(cost_terms):
        # Please note - the aggragation (mean) is done in
        # "attach_aggregation_schemes"
        ct_names = [v.name for v in cost_terms]
        for v in regularized_cg.outputs:
            if v.name in ct_names:
                observables.append(rename(v.sum()/batch_size,
                                                  v.name))
    for chld in recognizer.children:
        if chld.train_tags:
            tags_cost = VariableFilter(applications=[chld.addTagCost],
                                       name='output')(regularized_cg)[0]
            observables += [rename(tags_cost.sum()/batch_size, 'tags_nll'+chld.names_postfix)]

    # Model is weird class, we spend lots of time arguing with Bart
    # what it should be. However it can already nice things, e.g.
    # one extract all the parameters from the computation graphs
    # and give them hierahical names. This help to notice when a
    # because of some bug a parameter is not in the computation
    # graph.
    model = SpeechModel(train_cost)
    if params:
        logger.info("Load parameters from " + params)
        # please note: we cannot use recognizer.load_params
        # as it builds a new computation graph that dies not have
        # shapred variables added by adaptive weight noise
        param_values = load_parameter_values(params)
        model.set_parameter_values(param_values)

    parameters = model.get_parameter_dict()

    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, parameters[key].get_value().shape) for key
                     in sorted(parameters.keys())],
                    width=120))
    max_norm_rules = []
    if reg_config.get('max_norm', False) > 0:
        logger.info("Apply MaxNorm")
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters)
        if reg_config.get('max_norm_exclude_lookup', False):
            maxnorm_subjects = [v for v in maxnorm_subjects
                                if not isinstance(get_brick(v), LookupTable)]
        logger.info("Parameters covered by MaxNorm:\n"
                    + pprint.pformat([name for name, p in parameters.items()
                                      if p in maxnorm_subjects]))
        logger.info("Parameters NOT covered by MaxNorm:\n"
                    + pprint.pformat([name for name, p in parameters.items()
                                      if not p in maxnorm_subjects]))
        max_norm_rules = [
            Restrict(VariableClipping(reg_config['max_norm'], axis=0),
                     maxnorm_subjects)]

        
    return { 'observables': observables, 'max_norm_rules': max_norm_rules,
             'cg': cg, 'regularized_cg' : regularized_cg, 'train_cost' : train_cost,
             'cost' : cost, 'batch_size' : batch_size, 'batch_cost' : batch_cost,
             'parameters' : parameters, 'gradients': gradients, 
             'model' : model, 'data' : data, 'recognizer' : recognizer,
             'weights_entropy' : weights_entropy,
             'labels_mask' : labels_mask, 'labels' : labels }
Esempio n. 36
0
def main(mode, save_to, num_epochs, load_params=None,
         feature_maps=None, mlp_hiddens=None,
         conv_sizes=None, pool_sizes=None, stride=None, repeat_times=None,
         batch_size=None, num_batches=None, algo=None,
         test_set=None, valid_examples=None,
         dropout=None, max_norm=None, weight_decay=None,
         batch_norm=None):
    if feature_maps is None:
        feature_maps = [20, 50, 50]
    if mlp_hiddens is None:
        mlp_hiddens = [500]
    if conv_sizes is None:
        conv_sizes = [5, 5, 5]
    if pool_sizes is None:
        pool_sizes = [2, 2, 2]
    if repeat_times is None:
        repeat_times = [1, 1, 1]
    if batch_size is None:
        batch_size = 500
    if valid_examples is None:
        valid_examples = 2500
    if stride is None:
        stride = 1
    if test_set is None:
        test_set = 'test'
    if algo is None:
        algo = 'rmsprop'
    if batch_norm is None:
        batch_norm = False

    image_size = (128, 128)
    output_size = 2

    if (len(feature_maps) != len(conv_sizes) or
        len(feature_maps) != len(pool_sizes) or
        len(feature_maps) != len(repeat_times)):
        raise ValueError("OMG, inconsistent arguments")

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    convnet = LeNet(conv_activations, 3, image_size,
                    stride=stride,
                    filter_sizes=zip(conv_sizes, conv_sizes),
                    feature_maps=feature_maps,
                    pooling_sizes=zip(pool_sizes, pool_sizes),
                    repeat_times=repeat_times,
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='full',
                    batch_norm=batch_norm,
                    weights_init=Glorot(),
                    biases_init=Constant(0))
    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.initialize()
    logging.info("Input dim: {} {} {}".format(
        *convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        if isinstance(layer, Activation):
            logging.info("Layer {} ({})".format(
                i, layer.__class__.__name__))
        else:
            logging.info("Layer {} ({}) dim: {} {} {}".format(
                i, layer.__class__.__name__, *layer.get_dim('output')))


    single_x = tensor.tensor3('image_features')
    x = tensor.tensor4('image_features')
    single_y = tensor.lvector('targets')
    y = tensor.lmatrix('targets')

    # Training
    with batch_normalization(convnet):
        probs = convnet.apply(x)
    cost = (CategoricalCrossEntropy().apply(y.flatten(), probs)
            .copy(name='cost'))
    error_rate = (MisclassificationRate().apply(y.flatten(), probs)
                  .copy(name='error_rate'))

    cg = ComputationGraph([cost, error_rate])
    extra_updates = []

    if batch_norm: # batch norm:
        logger.debug("Apply batch norm")
        pop_updates = get_batch_normalization_updates(cg)
        # p stands for population mean
        # m stands for minibatch
        alpha = 0.005
        extra_updates = [(p, m * alpha + p * (1 - alpha))
                         for p, m in pop_updates]
        population_statistics = [p for p, m in extra_updates]
    if dropout:
        relu_outputs = VariableFilter(bricks=[Rectifier], roles=[OUTPUT])(cg)
        cg = apply_dropout(cg, relu_outputs, dropout)
    cost, error_rate = cg.outputs
    if weight_decay:
        logger.debug("Apply weight decay {}".format(weight_decay))
        cost += weight_decay * l2_norm(cg.parameters)
        cost.name = 'cost'

    # Validation
    valid_probs = convnet.apply_5windows(single_x)
    valid_cost = (CategoricalCrossEntropy().apply(single_y, valid_probs)
            .copy(name='cost'))
    valid_error_rate = (MisclassificationRate().apply(
        single_y, valid_probs).copy(name='error_rate'))

    model = Model([cost, error_rate])
    if load_params:
        logger.info("Loaded params from {}".format(load_params))
        with open(load_params, 'r') as src:
            model.set_parameter_values(load_parameters(src))

    # Training stream with random cropping
    train = DogsVsCats(("train",), subset=slice(None, 25000 - valid_examples, None))
    train_str =  DataStream(
        train, iteration_scheme=ShuffledScheme(train.num_examples, batch_size))
    train_str = add_transformers(train_str, random_crop=True)

    # Validation stream without cropping
    valid = DogsVsCats(("train",), subset=slice(25000 - valid_examples, None, None))
    valid_str = DataStream(
        valid, iteration_scheme=SequentialExampleScheme(valid.num_examples))
    valid_str = add_transformers(valid_str)

    if mode == 'train':
        directory, _ = os.path.split(sys.argv[0])
        env = dict(os.environ)
        env['THEANO_FLAGS'] = 'floatX=float32'
        port = numpy.random.randint(1025, 10000)
        server = subprocess.Popen(
            [directory + '/server.py',
             str(25000 - valid_examples), str(batch_size), str(port)],
            env=env, stderr=subprocess.STDOUT)
        train_str = ServerDataStream(
            ('image_features', 'targets'), produces_examples=False,
            port=port)

        save_to_base, save_to_extension = os.path.splitext(save_to)

        # Train with simple SGD
        if algo == 'rmsprop':
            step_rule = RMSProp(decay_rate=0.999, learning_rate=0.0003)
        elif algo == 'adam':
            step_rule = Adam()
        else:
            assert False
        if max_norm:
            conv_params = VariableFilter(bricks=[Convolutional], roles=[WEIGHT])(cg)
            linear_params = VariableFilter(bricks=[Linear], roles=[WEIGHT])(cg)
            step_rule = CompositeRule(
                [step_rule,
                 Restrict(VariableClipping(max_norm, axis=0), linear_params),
                 Restrict(VariableClipping(max_norm, axis=(1, 2, 3)), conv_params)])

        algorithm = GradientDescent(
            cost=cost, parameters=model.parameters,
            step_rule=step_rule)
        algorithm.add_updates(extra_updates)
        # `Timing` extension reports time for reading data, aggregating a batch
        # and monitoring;
        # `ProgressBar` displays a nice progress bar during training.
        extensions = [Timing(every_n_batches=100),
                    FinishAfter(after_n_epochs=num_epochs,
                                after_n_batches=num_batches),
                    DataStreamMonitoring(
                        [valid_cost, valid_error_rate],
                        valid_str,
                        prefix="valid"),
                    TrainingDataMonitoring(
                        [cost, error_rate,
                        aggregation.mean(algorithm.total_gradient_norm)],
                        prefix="train",
                        after_epoch=True),
                    TrackTheBest("valid_error_rate"),
                    Checkpoint(save_to, save_separately=['log'],
                               parameters=cg.parameters +
                               (population_statistics if batch_norm else []),
                               before_training=True, after_epoch=True)
                        .add_condition(
                            ['after_epoch'],
                            OnLogRecord("valid_error_rate_best_so_far"),
                            (save_to_base + '_best' + save_to_extension,)),
                    Printing(every_n_batches=100)]

        model = Model(cost)

        main_loop = MainLoop(
            algorithm,
            train_str,
            model=model,
            extensions=extensions)
        try:
            main_loop.run()
        finally:
            server.terminate()
    elif mode == 'test':
        classify = theano.function([single_x], valid_probs.argmax())
        test = DogsVsCats((test_set,))
        test_str = DataStream(
            test, iteration_scheme=SequentialExampleScheme(test.num_examples))
        test_str = add_transformers(test_str)
        correct = 0
        with open(save_to, 'w') as dst:
            print("id", "label", sep=',', file=dst)
            for index, example in enumerate(test_str.get_epoch_iterator()):
                image = example[0]
                prediction = classify(image)
                print(index + 1, classify(image), sep=',', file=dst)
                if len(example) > 1 and prediction == example[1]:
                    correct += 1
        print(correct / float(test.num_examples))
    else:
        assert False
Esempio n. 37
0
def construct_monitors(algorithm,
                       task,
                       n_patches,
                       x,
                       x_shape,
                       graph,
                       name,
                       ram,
                       model,
                       cost,
                       n_spatial_dims,
                       plot_url,
                       patchmonitor_interval=100,
                       **kwargs):
    location, scale, savings = util.get_recurrent_auxiliaries(
        "location scale savings".split(), graph, n_patches)

    channels = util.Channels()
    channels.extend(task.monitor_channels(graph))

    channels.append(util.named(savings.mean(), "savings.mean"))

    for variable_name in "location scale".split():
        variable = locals()[variable_name]
        channels.append(variable.mean(axis=0), "%s.mean" % variable_name)
        channels.append(variable.var(axis=0), "%s.variance" % variable_name)

    channels.append(algorithm.total_gradient_norm, "total_gradient_norm")

    step_norms = util.Channels()
    step_norms.extend(
        util.named(l2_norm([algorithm.steps[param]]), "%s.step_norm" % name)
        for name, param in model.get_parameter_dict().items())
    step_channels = step_norms.get_channels()

    #for activation in VariableFilter(roles=[OUTPUT])(graph.variables):
    #    quantity = activation.mean()
    #    quantity.name = "%s.mean" % util.get_path(activation)
    #    channels.append(quantity)

    data_independent_channels = util.Channels()
    for parameter in graph.parameters:
        if parameter.name in "gamma beta".split():
            quantity = parameter.mean()
            quantity.name = "%s.mean" % util.get_path(parameter)
            data_independent_channels.append(quantity)

    extensions = []

    extensions.append(
        TrainingDataMonitoring(step_channels, prefix="train",
                               after_epoch=True))

    extensions.append(
        DataStreamMonitoring(data_independent_channels.get_channels(),
                             data_stream=None,
                             after_epoch=True))
    extensions.extend(
        DataStreamMonitoring((channels.get_channels() + [cost]),
                             data_stream=task.get_stream(which, monitor=True),
                             prefix=which,
                             after_epoch=True)
        for which in "train valid test".split())

    patchmonitor = None
    if n_spatial_dims == 2:
        patchmonitor_klass = PatchMonitoring
    elif n_spatial_dims == 3:
        patchmonitor_klass = VideoPatchMonitoring

    if patchmonitor_klass:
        patch = T.stack(*[
            ram.crop(x, x_shape, location[:, i, :], scale[:, i, :])
            for i in xrange(n_patches)
        ])
        patch = patch.dimshuffle(1, 0, *range(2, patch.ndim))
        patch_extractor = theano.function([x, x_shape],
                                          [location, scale, patch])

        for which in "train valid".split():
            patchmonitor = patchmonitor_klass(
                save_to="%s_patches_%s" % (name, which),
                data_stream=task.get_stream(which,
                                            shuffle=False,
                                            num_examples=5),
                every_n_batches=patchmonitor_interval,
                extractor=patch_extractor,
                map_to_input_space=attention.static_map_to_input_space)
            patchmonitor.save_patches("patchmonitor_test.png")
            extensions.append(patchmonitor)

    if plot_url:
        plot_channels = []
        plot_channels.extend(task.plot_channels())
        plot_channels.append(["train_cost"])
        #plot_channels.append(["train_%s" % step_channel.name for step_channel in step_channels])

        from blocks.extras.extensions.plot import Plot
        extensions.append(
            Plot(name,
                 channels=plot_channels,
                 after_epoch=True,
                 server_url=plot_url))

    return extensions
Esempio n. 38
0
def main(job_id, params, config_file='params.ec'):
    config = ConfigParser.ConfigParser()
    config.readfp(open('./configs/{}'.format(config_file)))

    pr = pprint.PrettyPrinter(indent=4)
    pr.pprint(config)

    net_name  =  config.get('hyperparams', 'net_name', 'adni')
    struct_name = net_name.split('_')[0]

    max_epoch = int(config.get('hyperparams', 'max_iter', 100))
    base_lr = float(config.get('hyperparams', 'base_lr', 0.01))
    train_batch = int(config.get('hyperparams', 'train_batch', 256))
    valid_batch = int(config.get('hyperparams', 'valid_batch', 512))
    test_batch = int(config.get('hyperparams', 'valid_batch', 512))

    W_sd = float(config.get('hyperparams', 'W_sd', 0.01))
    W_mu = float(config.get('hyperparams', 'W_mu', 0.0))
    b_sd = float(config.get('hyperparams', 'b_sd', 0.01))
    b_mu = float(config.get('hyperparams', 'b_mu', 0.0))

    hidden_units = int(config.get('hyperparams', 'hidden_units', 32))
    input_dropout_ratio = float(config.get('hyperparams', 'input_dropout_ratio', 0.2))
    dropout_ratio = float(config.get('hyperparams', 'dropout_ratio', 0.2))
    weight_decay = float(config.get('hyperparams', 'weight_decay', 0.001))
    max_norm = float(config.get('hyperparams', 'max_norm', 100.0))
    solver = config.get('hyperparams', 'solver_type', 'rmsprop')
    data_file = config.get('hyperparams', 'data_file')
    side = config.get('hyperparams', 'side', 'b')

    input_dim = input_dims[struct_name]

    # Spearmint optimization parameters:
    if params:
        base_lr = float(params['base_lr'][0])
        dropout_ratio = float(params['dropout_ratio'][0])
        hidden_units = params['hidden_units'][0]
        weight_decay = params['weight_decay'][0]

    if 'adagrad' in solver:
        solver_type = CompositeRule([AdaGrad(learning_rate=base_lr), VariableClipping(threshold=max_norm)])
    else:
        solver_type = CompositeRule([RMSProp(learning_rate=base_lr), VariableClipping(threshold=max_norm)])


    data_file = config.get('hyperparams', 'data_file')

    if 'b' in side:
        train = H5PYDataset(data_file, which_set='train')
        valid = H5PYDataset(data_file, which_set='valid')
        test = H5PYDataset(data_file, which_set='test')
        x_l = tensor.matrix('l_features')
        x_r = tensor.matrix('r_features')
        x = tensor.concatenate([x_l, x_r], axis=1)

    else:
        train = H5PYDataset(data_file, which_set='train', sources=['{}_features'.format(side), 'targets'])
        valid = H5PYDataset(data_file, which_set='valid', sources=['{}_features'.format(side), 'targets'])
        test = H5PYDataset(data_file, which_set='test', sources=['{}_features'.format(side), 'targets'])
        x = tensor.matrix('{}_features'.format(side))

    y = tensor.lmatrix('targets')


    # Define a feed-forward net with an input, two hidden layers, and a softmax output:
    model = MLP(activations=[
        Rectifier(name='h1'),
        Rectifier(name='h2'),
        Softmax(name='output'),
    ],
                dims=[
                    input_dim[side],
                    hidden_units,
                    hidden_units,
                    2],
                weights_init=IsotropicGaussian(std=W_sd, mean=W_mu),
                biases_init=IsotropicGaussian(b_sd, b_mu))

    # Don't forget to initialize params:
    model.initialize()

    # y_hat is the output of the neural net with x as its inputs
    y_hat = model.apply(x)

    # Define a cost function to optimize, and a classification error rate.
    # Also apply the outputs from the net and corresponding targets:
    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = 'error'

    # This is the model: before applying dropout
    model = Model(cost)

    # Need to define the computation graph for the cost func:
    cost_graph = ComputationGraph([cost])

    # This returns a list of weight vectors for each layer
    W = VariableFilter(roles=[WEIGHT])(cost_graph.variables)

    # Add some regularization to this model:
    cost += weight_decay * l2_norm(W)
    cost.name = 'entropy'

    # computational graph with l2 reg
    cost_graph = ComputationGraph([cost])

    # Apply dropout to inputs:
    inputs = VariableFilter([INPUT])(cost_graph.variables)
    dropout_inputs = [input for input in inputs if input.name.startswith('linear_')]
    dropout_graph = apply_dropout(cost_graph, [dropout_inputs[0]], input_dropout_ratio)
    dropout_graph = apply_dropout(dropout_graph, dropout_inputs[1:], dropout_ratio)
    dropout_cost = dropout_graph.outputs[0]
    dropout_cost.name = 'dropout_entropy'

    # Learning Algorithm (notice: we use the dropout cost for learning):
    algo = GradientDescent(
        step_rule=solver_type,
        params=dropout_graph.parameters,
        cost=dropout_cost)

    # algo.step_rule.learning_rate.name = 'learning_rate'

    # Data stream used for training model:
    training_stream = Flatten(
        DataStream.default_stream(
            dataset=train,
            iteration_scheme=ShuffledScheme(
                train.num_examples,
                batch_size=train_batch)))

    training_monitor = TrainingDataMonitoring([dropout_cost,
                                               aggregation.mean(error),
                                               aggregation.mean(algo.total_gradient_norm)],
                                              after_batch=True)

    # Use the 'valid' set for validation during training:
    validation_stream = Flatten(
        DataStream.default_stream(
            dataset=valid,
            iteration_scheme=ShuffledScheme(
                valid.num_examples,
                batch_size=valid_batch)))

    validation_monitor = DataStreamMonitoring(
        variables=[cost, error],
        data_stream=validation_stream,
        prefix='validation',
        after_epoch=True)

    test_stream = Flatten(
        DataStream.default_stream(
            dataset=test,
            iteration_scheme=ShuffledScheme(
                test.num_examples,
                batch_size=test_batch)))

    test_monitor = DataStreamMonitoring(
        variables=[error],
        data_stream=test_stream,
        prefix='test',
        after_training=True)


    plotting = Plot('{}_{}'.format(net_name, side),
                    channels=[
                        ['dropout_entropy'],
                        ['error', 'validation_error'],
                    ],
                    after_batch=False)

    # Checkpoint class used to save model and log:
    stamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d-%H:%M')
    checkpoint = Checkpoint('./models/{}/{}/{}'.format(struct_name, side, stamp),
                            save_separately=['model', 'log'],
                            every_n_epochs=1)

    # Home-brewed class for early stopping when we detect we have started to overfit:
    # And by that I mean if the means of the val error and training error over the
    # previous 'epochs' is greater than the 'threshold', we are overfitting.
    early_stopper = FinishIfOverfitting(error_name='error',
                                        validation_name='validation_error',
                                        threshold=0.05,
                                        epochs=5,
                                        burn_in=100)

    # The main loop will train the network and output reports, etc
    main_loop = MainLoop(
        data_stream=training_stream,
        model=model,
        algorithm=algo,
        extensions=[
            validation_monitor,
            training_monitor,
            plotting,
            FinishAfter(after_n_epochs=max_epoch),
            early_stopper,
            Printing(),
            ProgressBar(),
            checkpoint,
            test_monitor,
        ])
    main_loop.run()

    ve = float(main_loop.log.last_epoch_row['validation_error'])
    te = float(main_loop.log.last_epoch_row['error'])
    spearmint_loss = ve + abs(te - ve)
    print 'Spearmint Loss: {}'.format(spearmint_loss)
    return spearmint_loss
Esempio n. 39
0
def initialize_all(config, save_path, bokeh_name,
                   params, bokeh_server, bokeh, test_tag, use_load_ext,
                   load_log, fast_start):
    root_path, extension = os.path.splitext(save_path)

    data = Data(**config['data'])
    train_conf = config['training']
    recognizer = create_model(config, data, test_tag)

    # Separate attention_params to be handled differently
    # when regularization is applied
    attention = recognizer.generator.transition.attention
    attention_params = Selector(attention).get_parameters().values()

    logger.info(
        "Initialization schemes for all bricks.\n"
        "Works well only in my branch with __repr__ added to all them,\n"
        "there is an issue #463 in Blocks to do that properly.")

    def show_init_scheme(cur):
        result = dict()
        for attr in dir(cur):
            if attr.endswith('_init'):
                result[attr] = getattr(cur, attr)
        for child in cur.children:
            result[child.name] = show_init_scheme(child)
        return result
    logger.info(pprint.pformat(show_init_scheme(recognizer)))

    prediction, prediction_mask = add_exploration(recognizer, data, train_conf)

    #
    # Observables:
    #
    primary_observables = []  # monitored each batch
    secondary_observables = []  # monitored every 10 batches
    validation_observables = []  # monitored on the validation set

    cg = recognizer.get_cost_graph(
        batch=True, prediction=prediction, prediction_mask=prediction_mask)
    labels, = VariableFilter(
        applications=[recognizer.cost], name='labels')(cg)
    labels_mask, = VariableFilter(
        applications=[recognizer.cost], name='labels_mask')(cg)

    gain_matrix = VariableFilter(
        theano_name=RewardRegressionEmitter.GAIN_MATRIX)(cg)
    if len(gain_matrix):
        gain_matrix, = gain_matrix
        primary_observables.append(
            rename(gain_matrix.min(), 'min_gain'))
        primary_observables.append(
            rename(gain_matrix.max(), 'max_gain'))

    batch_cost = cg.outputs[0].sum()
    batch_size = rename(recognizer.labels.shape[1], "batch_size")
    # Assumes constant batch size. `aggregation.mean` is not used because
    # of Blocks #514.
    cost = batch_cost / batch_size
    cost.name = "sequence_total_cost"
    logger.info("Cost graph is built")

    # Fetch variables useful for debugging.
    # It is important not to use any aggregation schemes here,
    # as it's currently impossible to spread the effect of
    # regularization on their variables, see Blocks #514.
    cost_cg = ComputationGraph(cost)
    r = recognizer
    energies, = VariableFilter(
        applications=[r.generator.readout.readout], name="output_0")(
            cost_cg)
    bottom_output = VariableFilter(
        # We need name_regex instead of name because LookupTable calls itsoutput output_0
        applications=[r.bottom.apply], name_regex="output")(
            cost_cg)[-1]
    attended, = VariableFilter(
        applications=[r.generator.transition.apply], name="attended")(
            cost_cg)
    attended_mask, = VariableFilter(
        applications=[r.generator.transition.apply], name="attended_mask")(
            cost_cg)
    weights, = VariableFilter(
        applications=[r.generator.evaluate], name="weights")(
            cost_cg)
    max_recording_length = rename(bottom_output.shape[0],
                                  "max_recording_length")
    # To exclude subsampling related bugs
    max_attended_mask_length = rename(attended_mask.shape[0],
                                      "max_attended_mask_length")
    max_attended_length = rename(attended.shape[0],
                                 "max_attended_length")
    max_num_phonemes = rename(labels.shape[0],
                              "max_num_phonemes")
    min_energy = rename(energies.min(), "min_energy")
    max_energy = rename(energies.max(), "max_energy")
    mean_attended = rename(abs(attended).mean(),
                           "mean_attended")
    mean_bottom_output = rename(abs(bottom_output).mean(),
                                "mean_bottom_output")
    weights_penalty = rename(monotonicity_penalty(weights, labels_mask),
                             "weights_penalty")
    weights_entropy = rename(entropy(weights, labels_mask),
                             "weights_entropy")
    mask_density = rename(labels_mask.mean(),
                          "mask_density")
    cg = ComputationGraph([
        cost, weights_penalty, weights_entropy,
        min_energy, max_energy,
        mean_attended, mean_bottom_output,
        batch_size, max_num_phonemes,
        mask_density])
    # Regularization. It is applied explicitly to all variables
    # of interest, it could not be applied to the cost only as it
    # would not have effect on auxiliary variables, see Blocks #514.
    reg_config = config.get('regularization', dict())
    regularized_cg = cg
    if reg_config.get('dropout'):
        logger.info('apply dropout')
        regularized_cg = apply_dropout(cg, [bottom_output], 0.5)
    if reg_config.get('noise'):
        logger.info('apply noise')
        noise_subjects = [p for p in cg.parameters if p not in attention_params]
        regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise'])

    train_cost = regularized_cg.outputs[0]
    if reg_config.get("penalty_coof", .0) > 0:
        # big warning!!!
        # here we assume that:
        # regularized_weights_penalty = regularized_cg.outputs[1]
        train_cost = (train_cost +
                      reg_config.get("penalty_coof", .0) *
                      regularized_cg.outputs[1] / batch_size)
    if reg_config.get("decay", .0) > 0:
        train_cost = (train_cost + reg_config.get("decay", .0) *
                      l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2)

    train_cost = rename(train_cost, 'train_cost')

    gradients = None
    if reg_config.get('adaptive_noise'):
        logger.info('apply adaptive noise')
        if ((reg_config.get("penalty_coof", .0) > 0) or
                (reg_config.get("decay", .0) > 0)):
            logger.error('using  adaptive noise with alignment weight panalty '
                         'or weight decay is probably stupid')
        train_cost, regularized_cg, gradients, noise_brick = apply_adaptive_noise(
            cg, cg.outputs[0],
            variables=cg.parameters,
            num_examples=data.get_dataset('train').num_examples,
            parameters=Model(regularized_cg.outputs[0]).get_parameter_dict().values(),
            **reg_config.get('adaptive_noise')
        )
        train_cost.name = 'train_cost'
        adapt_noise_cg = ComputationGraph(train_cost)
        model_prior_mean = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_mean')(adapt_noise_cg)[0],
            'model_prior_mean')
        model_cost = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_cost')(adapt_noise_cg)[0],
            'model_cost')
        model_prior_variance = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_variance')(adapt_noise_cg)[0],
            'model_prior_variance')
        regularized_cg = ComputationGraph(
            [train_cost, model_cost] +
            regularized_cg.outputs +
            [model_prior_mean, model_prior_variance])
        primary_observables += [
            regularized_cg.outputs[1],  # model cost
            regularized_cg.outputs[2],  # task cost
            regularized_cg.outputs[-2],  # model prior mean
            regularized_cg.outputs[-1]]  # model prior variance

    model = Model(train_cost)
    if params:
        logger.info("Load parameters from " + params)
        # please note: we cannot use recognizer.load_params
        # as it builds a new computation graph that dies not have
        # shapred variables added by adaptive weight noise
        with open(params, 'r') as src:
            param_values = load_parameters(src)
        model.set_parameter_values(param_values)

    parameters = model.get_parameter_dict()
    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, parameters[key].get_value().shape) for key
                     in sorted(parameters.keys())],
                    width=120))

    # Define the training algorithm.
    clipping = StepClipping(train_conf['gradient_threshold'])
    clipping.threshold.name = "gradient_norm_threshold"
    rule_names = train_conf.get('rules', ['momentum'])
    core_rules = []
    if 'momentum' in rule_names:
        logger.info("Using scaling and momentum for training")
        core_rules.append(Momentum(train_conf['scale'], train_conf['momentum']))
    if 'adadelta' in rule_names:
        logger.info("Using AdaDelta for training")
        core_rules.append(AdaDelta(train_conf['decay_rate'], train_conf['epsilon']))
    max_norm_rules = []
    if reg_config.get('max_norm', False) > 0:
        logger.info("Apply MaxNorm")
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters)
        if reg_config.get('max_norm_exclude_lookup', False):
            maxnorm_subjects = [v for v in maxnorm_subjects
                                if not isinstance(get_brick(v), LookupTable)]
        logger.info("Parameters covered by MaxNorm:\n"
                    + pprint.pformat([name for name, p in parameters.items()
                                      if p in maxnorm_subjects]))
        logger.info("Parameters NOT covered by MaxNorm:\n"
                    + pprint.pformat([name for name, p in parameters.items()
                                      if not p in maxnorm_subjects]))
        max_norm_rules = [
            Restrict(VariableClipping(reg_config['max_norm'], axis=0),
                     maxnorm_subjects)]
    burn_in = []
    if train_conf.get('burn_in_steps', 0):
        burn_in.append(
            BurnIn(num_steps=train_conf['burn_in_steps']))
    algorithm = GradientDescent(
        cost=train_cost,
        parameters=parameters.values(),
        gradients=gradients,
        step_rule=CompositeRule(
            [clipping] + core_rules + max_norm_rules +
            # Parameters are not changed at all
            # when nans are encountered.
            [RemoveNotFinite(0.0)] + burn_in),
        on_unused_sources='warn')

    logger.debug("Scan Ops in the gradients")
    gradient_cg = ComputationGraph(algorithm.gradients.values())
    for op in ComputationGraph(gradient_cg).scans:
        logger.debug(op)

    # More variables for debugging: some of them can be added only
    # after the `algorithm` object is created.
    secondary_observables += list(regularized_cg.outputs)
    if not 'train_cost' in [v.name for v in secondary_observables]:
        secondary_observables += [train_cost]
    secondary_observables += [
        algorithm.total_step_norm, algorithm.total_gradient_norm,
        clipping.threshold]
    for name, param in parameters.items():
        num_elements = numpy.product(param.get_value().shape)
        norm = param.norm(2) / num_elements ** 0.5
        grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5
        step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5
        stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm)
        stats.name = name + '_stats'
        secondary_observables.append(stats)

    primary_observables += [
        train_cost,
        algorithm.total_gradient_norm,
        algorithm.total_step_norm, clipping.threshold,
        max_recording_length,
        max_attended_length, max_attended_mask_length]

    validation_observables += [
        rename(aggregation.mean(batch_cost, batch_size), cost.name),
        rename(aggregation.sum_(batch_size), 'num_utterances'),
        weights_entropy, weights_penalty]

    def attach_aggregation_schemes(variables):
        # Aggregation specification has to be factored out as a separate
        # function as it has to be applied at the very last stage
        # separately to training and validation observables.
        result = []
        for var in variables:
            if var.name == 'weights_penalty':
                result.append(rename(aggregation.mean(var, batch_size),
                                     'weights_penalty_per_recording'))
            elif var.name == 'weights_entropy':
                result.append(rename(aggregation.mean(var, labels_mask.sum()),
                                     'weights_entropy_per_label'))
            else:
                result.append(var)
        return result

    mon_conf = config['monitoring']

    # Build main loop.
    logger.info("Initialize extensions")
    extensions = []
    if use_load_ext and params:
        extensions.append(Load(params, load_iteration_state=True, load_log=True))
    if load_log and params:
        extensions.append(LoadLog(params))
    extensions += [
        Timing(after_batch=True),
        CGStatistics(),
        #CodeVersion(['lvsr']),
    ]
    extensions.append(TrainingDataMonitoring(
        primary_observables, after_batch=True))
    average_monitoring = TrainingDataMonitoring(
        attach_aggregation_schemes(secondary_observables),
        prefix="average", every_n_batches=10)
    extensions.append(average_monitoring)
    validation = DataStreamMonitoring(
        attach_aggregation_schemes(validation_observables),
        data.get_stream("valid", shuffle=False), prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['validate_every_epochs'],
            every_n_batches=mon_conf['validate_every_batches'],
            after_training=False)
    extensions.append(validation)
    per = PhonemeErrorRate(recognizer, data,
                           **config['monitoring']['search'])
    per_monitoring = DataStreamMonitoring(
        [per], data.get_stream("valid", batches=False, shuffle=False),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['search_every_epochs'],
            every_n_batches=mon_conf['search_every_batches'],
            after_training=False)
    extensions.append(per_monitoring)
    track_the_best_per = TrackTheBest(
        per_monitoring.record_name(per)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    track_the_best_cost = TrackTheBest(
        validation.record_name(cost)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    extensions += [track_the_best_cost, track_the_best_per]
    extensions.append(AdaptiveClipping(
        algorithm.total_gradient_norm.name,
        clipping, train_conf['gradient_threshold'],
        decay_rate=0.998, burnin_period=500))
    extensions += [
        SwitchOffLengthFilter(
            data.length_filter,
            after_n_batches=train_conf.get('stop_filtering')),
        FinishAfter(after_n_batches=train_conf.get('num_batches'),
                    after_n_epochs=train_conf.get('num_epochs'))
            .add_condition(["after_batch"], _gradient_norm_is_none),
    ]
    channels = [
        # Plot 1: training and validation costs
        [average_monitoring.record_name(train_cost),
         validation.record_name(cost)],
        # Plot 2: gradient norm,
        [average_monitoring.record_name(algorithm.total_gradient_norm),
         average_monitoring.record_name(clipping.threshold)],
        # Plot 3: phoneme error rate
        [per_monitoring.record_name(per)],
        # Plot 4: training and validation mean weight entropy
        [average_monitoring._record_name('weights_entropy_per_label'),
         validation._record_name('weights_entropy_per_label')],
        # Plot 5: training and validation monotonicity penalty
        [average_monitoring._record_name('weights_penalty_per_recording'),
         validation._record_name('weights_penalty_per_recording')]]
    if bokeh:
        extensions += [
            Plot(bokeh_name if bokeh_name
                 else os.path.basename(save_path),
                 channels,
                 every_n_batches=10,
                 server_url=bokeh_server),]
    extensions += [
        Checkpoint(save_path,
                   before_first_epoch=not fast_start, after_epoch=True,
                   every_n_batches=train_conf.get('save_every_n_batches'),
                   save_separately=["model", "log"],
                   use_cpickle=True)
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_per.notification_name),
            (root_path + "_best" + extension,))
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_cost.notification_name),
            (root_path + "_best_ll" + extension,)),
        ProgressBar()]
    extensions.append(EmbedIPython(use_main_loop_run_caller_env=True))
    if config['net']['criterion']['name'].startswith('mse'):
        extensions.append(
            LogInputsGains(
                labels, cg, recognizer.generator.readout.emitter, data))

    if train_conf.get('patience'):
        patience_conf = train_conf['patience']
        if not patience_conf.get('notification_names'):
            # setdefault will not work for empty list
            patience_conf['notification_names'] = [
                track_the_best_per.notification_name,
                track_the_best_cost.notification_name]
        extensions.append(Patience(**patience_conf))

    extensions.append(Printing(every_n_batches=1,
                               attribute_filter=PrintingFilterList()))

    return model, algorithm, data, extensions
Esempio n. 40
0
def construct_monitors(algorithm, task, n_patches, x, x_shape,
                       graph, name, ram, model, cost,
                       n_spatial_dims, plot_url, patchmonitor_interval=100, **kwargs):
    location, scale, savings = util.get_recurrent_auxiliaries(
        "location scale savings".split(), graph, n_patches)

    channels = util.Channels()
    channels.extend(task.monitor_channels(graph))

    channels.append(util.named(savings.mean(), "savings.mean"))

    for variable_name in "location scale".split():
        variable = locals()[variable_name]
        channels.append(variable.mean(axis=0),
                        "%s.mean" % variable_name)
        channels.append(variable.var(axis=0),
                        "%s.variance" % variable_name)

    channels.append(algorithm.total_gradient_norm,
                    "total_gradient_norm")

    step_norms = util.Channels()
    step_norms.extend(util.named(l2_norm([algorithm.steps[param]]),
                                 "%s.step_norm" % name)
                      for name, param in model.get_parameter_dict().items())
    step_channels = step_norms.get_channels()

    #for activation in VariableFilter(roles=[OUTPUT])(graph.variables):
    #    quantity = activation.mean()
    #    quantity.name = "%s.mean" % util.get_path(activation)
    #    channels.append(quantity)

    data_independent_channels = util.Channels()
    for parameter in graph.parameters:
        if parameter.name in "gamma beta".split():
            quantity = parameter.mean()
            quantity.name = "%s.mean" % util.get_path(parameter)
            data_independent_channels.append(quantity)

    extensions = []

    extensions.append(TrainingDataMonitoring(
        step_channels, prefix="train", after_epoch=True))

    extensions.append(DataStreamMonitoring(data_independent_channels.get_channels(),
                                           data_stream=None, after_epoch=True))
    extensions.extend(DataStreamMonitoring((channels.get_channels() + [cost]),
                                           data_stream=task.get_stream(which, monitor=True),
                                           prefix=which, after_epoch=True)
                      for which in "train valid test".split())

    patchmonitor = None
    if n_spatial_dims == 2:
        patchmonitor_klass = PatchMonitoring
    elif n_spatial_dims == 3:
        patchmonitor_klass = VideoPatchMonitoring

    if patchmonitor_klass:
        patch = T.stack(*[
            ram.crop(x, x_shape, location[:, i, :], scale[:, i, :])
            for i in xrange(n_patches)])
        patch = patch.dimshuffle(1, 0, *range(2, patch.ndim))
        patch_extractor = theano.function([x, x_shape],
                                          [location, scale, patch])

        for which in "train valid".split():
            patchmonitor = patchmonitor_klass(
                save_to="%s_patches_%s" % (name, which),
                data_stream=task.get_stream(which, shuffle=False, num_examples=5),
                every_n_batches=patchmonitor_interval,
                extractor=patch_extractor,
                map_to_input_space=attention.static_map_to_input_space)
            patchmonitor.save_patches("patchmonitor_test.png")
            extensions.append(patchmonitor)

    if plot_url:
        plot_channels = []
        plot_channels.extend(task.plot_channels())
        plot_channels.append(["train_cost"])
        #plot_channels.append(["train_%s" % step_channel.name for step_channel in step_channels])

        from blocks.extras.extensions.plot import Plot
        extensions.append(Plot(name, channels=plot_channels,
                            after_epoch=True, server_url=plot_url))

    return extensions
Esempio n. 41
0
    def __init__(self,
                 cost=None,
                 parameters=None,
                 step_rule=None,
                 gradients=None,
                 known_grads=None,
                 consider_constant=None,
                 **kwargs):
        # Set initial values for cost, parameters, gradients.
        self.cost = cost
        self.parameters = parameters
        self.gradients = gradients

        # If we don't have gradients, we'll need to infer them from the
        # cost and the parameters, both of which must not be None.
        if not self.gradients:
            self.gradients = self._compute_gradients(known_grads,
                                                     consider_constant)
        else:
            if cost is not None:
                logger.warning(
                    ('{}: gradients already specified directly; '
                     'cost is unused.'.format(self.__class__.__name__)))
            if self.parameters is None and isinstance(gradients, OrderedDict):
                # If the dictionary is ordered, it's safe to use the keys
                # as they have a deterministic order.
                self.parameters = list(self.gradients.keys())
            elif self.parameters is not None:
                # If parameters and gradients.keys() don't match we can
                # try to recover if gradients is ordered.
                if set(self.parameters) != set(self.gradients.keys()):
                    logger.warn("Specified parameters list does not match "
                                "keys in provided gradient dictionary; "
                                "using parameters inferred from gradients")
                    if not isinstance(self.gradients, OrderedDict):
                        raise ValueError(determinism_error)
                    self.parameters = list(self.gradients.keys())
            else:
                # self.parameters is not None, and gradients isn't
                # an OrderedDict. We can't do anything safe.
                raise ValueError(determinism_error)
            if known_grads:
                raise ValueError("known_grads has no effect when gradients "
                                 "are passed in")
            if consider_constant is not None:
                raise ValueError("consider_constant has no effect when "
                                 "gradients are passed in")

        # The order in which the different gradient terms appears
        # here matters, as floating point addition is non-commutative (and
        # Theano's graph optimizations are not order-independent).
        # This is why we do not use .values().
        gradient_values = [self.gradients[p] for p in self.parameters]
        self.total_gradient_norm = (l2_norm(gradient_values).copy(
            name="total_gradient_norm"))

        self.step_rule = step_rule if step_rule else Scale()
        logger.debug("Computing parameter steps...")
        self.steps, self.step_rule_updates = (self.step_rule.compute_steps(
            self.gradients))

        # Same as gradient_values above: the order may influence a
        # bunch of things, so enforce a consistent one (don't use
        # .values()).
        step_values = [self.steps[p] for p in self.parameters]
        self.total_step_norm = (l2_norm(step_values).copy(
            name="total_step_norm"))

        # Once again, iterating on gradients may not be deterministically
        # ordered if it is not an OrderedDict. We add the updates here in
        # the order specified in self.parameters. Keep it this way to
        # maintain reproducibility.
        kwargs.setdefault('updates', []).extend(
            itertools.chain(((parameter, parameter - self.steps[parameter])
                             for parameter in self.parameters),
                            self.step_rule_updates))
        super(GradientDescent, self).__init__(**kwargs)
Esempio n. 42
0
    def training(self,
                 fea2obj,
                 batch_size,
                 learning_rate=0.005,
                 steprule='adagrad',
                 wait_epochs=5,
                 kl_weight_init=None,
                 klw_ep=50,
                 klw_inc_rate=0,
                 num_epochs=None):
        networkfile = self._config['net']

        n_epochs = num_epochs or int(self._config['nepochs'])
        reg_weight = float(self._config['loss_weight'])
        reg_type = self._config['loss_reg']
        numtrain = int(
            self._config['num_train']) if 'num_train' in self._config else None
        train_stream, num_samples_train = get_comb_stream(
            fea2obj, 'train', batch_size, shuffle=True, num_examples=numtrain)
        dev_stream, num_samples_dev = get_comb_stream(fea2obj,
                                                      'dev',
                                                      batch_size=None,
                                                      shuffle=False)
        logger.info('sources: %s -- number of train/dev samples: %d/%d',
                    train_stream.sources, num_samples_train, num_samples_dev)

        t2idx = fea2obj['targets'].t2idx
        klw_init = kl_weight_init or float(
            self._config['kld_weight']) if 'kld_weight' in self._config else 1
        logger.info('kl_weight_init: %d', klw_init)
        kl_weight = shared_floatx(klw_init, 'kl_weight')
        entropy_weight = shared_floatx(1., 'entropy_weight')

        cost, p_at_1, _, KLD, logpy_xz, pat1_recog, misclassify_rate = build_model_new(
            fea2obj, len(t2idx), self._config, kl_weight, entropy_weight)

        cg = ComputationGraph(cost)

        weights = VariableFilter(roles=[WEIGHT])(cg.parameters)
        logger.info('Model weights are: %s', weights)
        if 'L2' in reg_type:
            cost += reg_weight * l2_norm(weights)
            logger.info('applying %s with weight: %f ', reg_type, reg_weight)

        dropout = -0.1
        if dropout > 0:
            cg = apply_dropout(cg, weights, dropout)
            cost = cg.outputs[0]

        cost.name = 'cost'
        logger.info('Our Algorithm is : %s, and learning_rate: %f', steprule,
                    learning_rate)
        if 'adagrad' in steprule:
            cnf_step_rule = AdaGrad(learning_rate)
        elif 'adadelta' in steprule:
            cnf_step_rule = AdaDelta(decay_rate=0.95)
        elif 'decay' in steprule:
            cnf_step_rule = RMSProp(learning_rate=learning_rate,
                                    decay_rate=0.90)
            cnf_step_rule = CompositeRule([cnf_step_rule, StepClipping(1)])
        elif 'momentum' in steprule:
            cnf_step_rule = Momentum(learning_rate=learning_rate, momentum=0.9)
        elif 'adam' in steprule:
            cnf_step_rule = Adam(learning_rate=learning_rate)
        else:
            logger.info('The steprule param is wrong! which is: %s', steprule)

        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=cnf_step_rule,
                                    on_unused_sources='warn')
        #algorithm.add_updates(updates)
        gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
        step_norm = aggregation.mean(algorithm.total_step_norm)
        monitored_vars = [
            cost, gradient_norm, step_norm, p_at_1, KLD, logpy_xz, kl_weight,
            pat1_recog
        ]
        train_monitor = TrainingDataMonitoring(variables=monitored_vars,
                                               after_batch=True,
                                               before_first_epoch=True,
                                               prefix='tra')

        dev_monitor = DataStreamMonitoring(variables=[
            cost, p_at_1, KLD, logpy_xz, pat1_recog, misclassify_rate
        ],
                                           after_epoch=True,
                                           before_first_epoch=True,
                                           data_stream=dev_stream,
                                           prefix="dev")

        extensions = [
            dev_monitor,
            train_monitor,
            Timing(),
            TrackTheBest('dev_cost'),
            FinishIfNoImprovementAfter('dev_cost_best_so_far',
                                       epochs=wait_epochs),
            Printing(after_batch=False),  #, ProgressBar()
            FinishAfter(after_n_epochs=n_epochs),
            saveload.Load(networkfile + '.toload.pkl'),
        ] + track_best('dev_cost', networkfile + '.best.pkl')

        #extensions.append(SharedVariableModifier(kl_weight,
        #                                          lambda n, klw: numpy.cast[theano.config.floatX] (klw_inc_rate + klw), after_epoch=False, every_n_epochs=klw_ep, after_batch=False))
        #         extensions.append(SharedVariableModifier(entropy_weight,
        #                                                   lambda n, crw: numpy.cast[theano.config.floatX](crw - klw_inc_rate), after_epoch=False, every_n_epochs=klw_ep, after_batch=False))

        logger.info('number of parameters in the model: %d',
                    tensor.sum([p.size for p in cg.parameters]).eval())
        logger.info('Lookup table sizes: %s',
                    [p.size.eval() for p in cg.parameters if 'lt' in p.name])

        main_loop = MainLoop(data_stream=train_stream,
                             algorithm=algorithm,
                             model=Model(cost),
                             extensions=extensions)
        main_loop.run()
Esempio n. 43
0
 def _apply_reg(self, params=None, *args, **kwargs):
     '''
     Apply regularization (default L2 norm) on parameters (default user, hashtag and word embedding) to computing
     graph of self.cg_generator
     :param params: A list of parameters to which regularization applied
     '''
     if self.norm_type is not None:
         params = [self.input_lookup.W]
         if self.norm_type == 'l2_norm':
             self._train_cg_generator = self._train_cg_generator + self.norm_scale * theano_expressions.l2_norm(
                 tensors=params)**2
         elif self.norm_type == 'l1_norm':
             norm = 0.
             for param in params:
                 norm += tensor.abs_(param).sum()
             self._train_cg_generator = self._train_cg_generator + self.norm_scale * norm
         else:
             raise ValueError('{0} norm type is not supported!'.format(
                 self.norm_type))
     else:
         pass
Esempio n. 44
0
# Define a cost function to optimize, and a classification error rate:
# Also apply the outputs from the net, and corresponding targets:
cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
error = MisclassificationRate().apply(y.flatten(), y_hat)
error.name = 'error'

# Need to define the computation graph:
graph = ComputationGraph(cost)

# This returns a list of weight vectors for each layer
W = VariableFilter(roles=[WEIGHT])(graph.variables)

# Add some regularization to this model:
lam = 0.001
cost += lam * l2_norm(W)
cost.name = 'entropy'

# This is the model without dropout, but with l2 reg.
model = Model(cost)

# Apply dropout to inputs:
graph = ComputationGraph(y_hat)
inputs = VariableFilter([INPUT])(graph.variables)
dropout_graph = apply_dropout(graph, inputs, 0.2)
dropout_cost = dropout_graph.outputs[0]
dropout_cost.name = 'dropout_entropy'

# Learning Algorithm:
algo = GradientDescent(
    # step_rule=Scale(learning_rate=0.1),
Esempio n. 45
0
# Define a cost function to optimize, and a classification error rate:
# Also apply the outputs from the net, and corresponding targets:
cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
error = MisclassificationRate().apply(y.flatten(), y_hat)
error.name = 'error'

# Need to define the computation graph:
graph = ComputationGraph(cost)

# This returns a list of weight vectors for each layer
W = VariableFilter(roles=[WEIGHT])(graph.variables)

# Add some regularization to this model:
lam = 0.001
cost += lam * l2_norm(W)
cost.name = 'entropy'

# This is the model without dropout, but with l2 reg.
model = Model(cost)

# Apply dropout to inputs:
graph = ComputationGraph(y_hat)
inputs = VariableFilter([INPUT])(graph.variables)
dropout_graph = apply_dropout(graph, inputs, 0.2)
dropout_cost = dropout_graph.outputs[0]
dropout_cost.name = 'dropout_entropy'

# Learning Algorithm:
algo = GradientDescent(
    # step_rule=Scale(learning_rate=0.1),
Esempio n. 46
0
# Define a cost function to optimize, and a classification error rate.
# Also apply the outputs from the net and corresponding targets:
cost = BinaryCrossEntropy().apply(x, x_hat)

# This is the model: before applying dropout
autoencoder = Model(cost)

# Need to define the computation graph for the cost func:
cost_graph = ComputationGraph([cost])

# This returns a list of weight vectors for each layer
W = VariableFilter(roles=[WEIGHT])(cost_graph.variables)

# Add some regularization to this model:
cost += weight_decay * l2_norm(W)
cost.name = 'entropy'

# computational graph with l2 reg
cost_graph = ComputationGraph([cost])

# Apply dropout to inputs:
inputs = VariableFilter([INPUT])(cost_graph.variables)
dropout_inputs = [input for input in inputs if input.name.startswith('linear_')]
dropout_graph = apply_dropout(cost_graph, dropout_inputs, dropout_ratio)
dropout_cost = dropout_graph.outputs[0]
dropout_cost.name = 'dropout_entropy'

# Learning Algorithm:
algo = GradientDescent(
    step_rule=solver_type,