Example no. 1
    def __init__(self,
                 objective,
                 params,
                 inputs=None,
                 param_constrainers=None,
                 max_iter=-1,
                 lr_scalers=None,
                 verbose=0,
                 tol=None,
                 init_alpha=None,
                 min_init_alpha=1e-3,
                 reset_alpha=True,
                 conjugate=False,
                 reset_conjugate=True,
                 gradients=None,
                 gradient_updates=None,
                 line_search_mode=None,
                 accumulate=False,
                 theano_function_mode=None):

        self.__dict__.update(locals())
        del self.self

        if line_search_mode is None:
            if init_alpha is None:
                init_alpha = (.001, .005, .01, .05, .1)
        else:
            assert line_search_mode == 'exhaustive'
            if init_alpha is None:
                init_alpha = (.5, 1.)

        self.init_alpha = tuple([float(elem) for elem in init_alpha])

        if inputs is None:
            inputs = []

        if param_constrainers is None:
            param_constrainers = []

        obj = objective

        self.verbose = verbose

        param_to_grad_sym = OrderedDict()
        param_to_grad_shared = OrderedDict()
        updates = OrderedDict()
        if self.gradient_updates is not None:
            updates.update(self.gradient_updates)

        self.params = [param for param in params]

        for param in params:
            if self.gradients is not None and param in self.gradients:
                g = self.gradients[param]
            else:
                g = grad(objective, param)
            param_to_grad_sym[param] = g
            if param.name is not None:
                param_name = param.name
            else:
                param_name = 'anon_param'
            grad_name = 'BatchGradientDescent.grad_' + param_name
            grad_shared = sharedX(param.get_value() * 0., name=grad_name)
            param_to_grad_shared[param] = grad_shared
            updates[grad_shared] = g

        self.param_to_grad_shared = param_to_grad_shared

        if self.verbose:
            logger.info('batch gradient class compiling gradient function')
        t1 = time.time()
        if self.accumulate:
            self._compute_grad = Accumulator(inputs, updates=updates)
        else:
            self._compute_grad = function(
                inputs,
                updates=updates,
                mode=self.theano_function_mode,
                name='BatchGradientDescent._compute_grad')
        if self.verbose:
            t2 = time.time()
            logger.info('done. Took {0}'.format(t2 - t1))

        if self.verbose:
            logger.info('batch gradient class compiling objective function')
        if self.accumulate:
            self.obj = Accumulator(inputs, obj)
        else:
            self.obj = function(inputs,
                                obj,
                                mode=self.theano_function_mode,
                                name='BatchGradientDescent.obj')

        if self.verbose:
            logger.info('done')

        self.param_to_cache = OrderedDict()
        alpha = T.scalar(name='alpha')
        alpha.tag.test_value = np.cast[alpha.dtype](.01)
        cache_updates = OrderedDict()
        goto_updates = OrderedDict()
        for param in params:
            if param.name is None:
                param_name = 'anon_param'
            else:
                param_name = param.name
            cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name
            self.param_to_cache[param] = sharedX(param.get_value(borrow=False),
                                                 name=cache_name)
            cache_updates[self.param_to_cache[param]] = param
            cached = self.param_to_cache[param]
            g = self.param_to_grad_shared[param]
            if lr_scalers is not None and param in lr_scalers:
                scaled_alpha = alpha * lr_scalers[param]
            else:
                scaled_alpha = alpha
            mul = scaled_alpha * g
            diff = cached - mul
            goto_updates[param] = diff
        self._cache_values = function(
            [],
            updates=cache_updates,
            mode=self.theano_function_mode,
            name='BatchGradientDescent._cache_values')
        assert isinstance(param_constrainers, (list, tuple))
        for param_constrainer in param_constrainers:
            param_constrainer(goto_updates)
        self._goto_alpha = function([alpha],
                                    updates=goto_updates,
                                    mode=self.theano_function_mode,
                                    name='BatchGradientDescent._goto_alpha')

        norm = T.sqrt(
            sum([
                T.sqr(elem).sum()
                for elem in self.param_to_grad_shared.values()
            ]))
        norm.name = 'BatchGradientDescent.norm'
        normalize_grad_updates = OrderedDict()
        for grad_shared in self.param_to_grad_shared.values():
            normalize_grad_updates[grad_shared] = grad_shared / norm

        # useful for monitoring
        self.ave_grad_size = sharedX(0.)
        self.new_weight = sharedX(1.)
        normalize_grad_updates[self.ave_grad_size] = \
            self.new_weight * norm + (1.-self.new_weight) * self.ave_grad_size

        self._normalize_grad = \
            function([],
                     norm,
                     updates=normalize_grad_updates,
                     mode=self.theano_function_mode,
                     name='BatchGradientDescent._normalize_grad')

        if self.conjugate:
            grad_shared = self.param_to_grad_shared.values()

            grad_to_old_grad = OrderedDict()
            for elem in grad_shared:
                grad_to_old_grad[elem] = \
                    sharedX(elem.get_value(), 'old_'+elem.name)

            self._store_old_grad = \
                function([norm],
                         updates=OrderedDict([(grad_to_old_grad[g_], g_ * norm)
                                             for g_ in grad_to_old_grad]),
                         mode=self.theano_function_mode,
                         name='BatchGradientDescent._store_old_grad')

            grad_ordered = list(grad_to_old_grad.keys())
            old_grad_ordered = [grad_to_old_grad[g_] for g_ in grad_ordered]

            def dot_product(x, y):
                return sum([(x_elem * y_elem).sum()
                            for x_elem, y_elem in safe_zip(x, y)])

            beta_pr = (dot_product(grad_ordered, grad_ordered) - dot_product(grad_ordered, old_grad_ordered)) / \
                (1e-7+dot_product(old_grad_ordered, old_grad_ordered))
            assert beta_pr.ndim == 0

            beta = T.maximum(beta_pr, 0.)

            # beta_pr is the Polak-Ribiere formula for beta.
            # According to wikipedia, the beta to use for NCG is "a matter of
            # heuristics or taste" but max(0, beta_pr) is "a popular choice...
            # which provides direction reset automatically." (ie, it is meant
            # to revert to steepest descent when you have traveled far enough
            # that the objective function is behaving non-quadratically enough
            # that the conjugate gradient formulas aren't working anymore)

            # http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method
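            # In symbols: beta_pr = g . (g - g_old) / (g_old . g_old), up to
            # the 1e-7 stabilizer in the denominator, where g are the current
            # shared gradients (grad_ordered) and g_old the previously stored
            # ones (old_grad_ordered); this restates the dot_product ratio
            # computed above.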

            assert grad not in grad_to_old_grad

            make_conjugate_updates = \
                [(g_, g_ + beta * grad_to_old_grad[g_]) for g_ in grad_ordered]

            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                for v, u in make_conjugate_updates:
                    mode.record.handle_line(
                        'BatchGradientDescent._make_conjugate var ' +
                        var_descriptor(v) + '\n')
                    mode.record.handle_line(
                        'BatchGradientDescent._make_conjugate update ' +
                        var_descriptor(u) + '\n')

            self._make_conjugate = \
                function([], updates=make_conjugate_updates,
                         mode=self.theano_function_mode,
                         name='BatchGradientDescent._make_conjugate')

            if mode is not None and hasattr(mode, 'record'):
                for output in self._make_conjugate.maker.fgraph.outputs:
                    mode.record.handle_line(
                        'BatchGradientDescent._make_conjugate output ' +
                        var_descriptor(output) + '\n')

        if tol is None:
            if objective.dtype == "float32":
                self.tol = 1e-6
            else:
                self.tol = 3e-7
        else:
            self.tol = tol

        self.ave_step_size = sharedX(0.)
        self.ave_grad_mult = sharedX(0.)
    def __init__(self, objective, params, inputs = None,
            param_constrainers = None, max_iter = -1,
            lr_scalers = None, verbose = 0, tol = None,
            init_alpha = None, min_init_alpha = 1e-3,
            reset_alpha = True, conjugate = False,
            reset_conjugate = True, gradients = None,
            gradient_updates = None, line_search_mode = None,
            accumulate = False, theano_function_mode=None):
        """
        objective: a theano expression to be minimized
                       should be a function of params and,
                       if provided, inputs
            params: A list of theano shared variables.
                    These are the optimization variables
            inputs: (Optional) A list of theano variables
                    to serve as inputs to the graph.
            param_constrainers: (Optional) A list of callables
                    to be called on all updates dictionaries to
                    be applied to params. This is how you implement
                    constrained optimization.
            reset_alpha: If True, reverts to using init_alpha after
                        each call. If False, the final set of alphas
                        is used at the start of the next call to minimize.
            conjugate: If True, tries to pick conjugate gradient directions.
                       For the directions to be truly conjugate, you must use
                       line_search_mode = 'exhaustive' and the objective function
                       must be quadratic.
                       Using line_search_mode = 'exhaustive' on a non-quadratic objective
                       function implements nonlinear conjugate gradient descent.
            reset_conjugate:
                    has no effect unless conjugate == True
                    if reset_conjugate == True,
                        reverts to direction of steepest descent for the first
                        step in each call to minimize.
                    otherwise, tries to make the new search direction
                    conjugate to the last one (even though the objective function
                    might be totally different on each call to minimize)
            gradients: if None, compute the gradients of obj using T.grad
                    otherwise, a dictionary mapping from params to expressions
                    for their gradients (this allows you to use approximate
                    gradients computed with something other than T.grad)
            gradient_updates: a dictionary of shared variable updates to run
                each time the gradient is computed

            Calling the ``minimize'' method with values for
            for ``inputs'' will update ``params'' to minimize
            ``objective''.
        """

        self.__dict__.update(locals())
        del self.self

        if line_search_mode is None:
            if init_alpha is None:
                init_alpha = (.001, .005, .01, .05, .1)
        else:
            assert line_search_mode == 'exhaustive'
            if init_alpha is None:
                init_alpha = (.5, 1.)

        self.init_alpha = tuple([float(elem) for elem in init_alpha])

        if inputs is None:
            inputs = []

        if param_constrainers is None:
            param_constrainers = []

        obj = objective

        self.verbose = verbose

        param_to_grad_sym = OrderedDict()
        param_to_grad_shared = OrderedDict()
        updates = OrderedDict()
        if self.gradient_updates is not None:
            updates.update(self.gradient_updates)

        self.params = [ param for param in params ]

        for param in params:
            if self.gradients is not None and param in self.gradients:
                g = self.gradients[param]
            else:
                g = grad(objective, param)
            param_to_grad_sym[param] = g
            if param.name is not None:
                param_name = param.name
            else:
                param_name = 'anon_param'
            grad_name = 'BatchGradientDescent.grad_' + param_name
            grad_shared = sharedX( param.get_value() * 0., name=grad_name)
            param_to_grad_shared[param] = grad_shared
            updates[grad_shared] = g

        self.param_to_grad_shared = param_to_grad_shared

        if self.verbose:
            print 'batch gradient class compiling gradient function'
        t1 = time.time()
        if self.accumulate:
            self._compute_grad = Accumulator(inputs, updates = updates)
        else:
            self._compute_grad = function(inputs, updates = updates,
                    mode=self.theano_function_mode,
                    name='BatchGradientDescent._compute_grad')
        if self.verbose:
            t2 = time.time()
            print 'done. Took ',t2-t1

        if self.verbose:
            print 'batch gradient class compiling objective function'
        if self.accumulate:
            self.obj = Accumulator(inputs, obj)
        else:
            self.obj = function(inputs, obj, mode=self.theano_function_mode,
                    name='BatchGradientDescent.obj')

        if self.verbose:
            print 'done'

        self.param_to_cache = OrderedDict()
        alpha = T.scalar(name = 'alpha')
        alpha.tag.test_value = np.cast[alpha.dtype](.01)
        cache_updates = OrderedDict()
        goto_updates = OrderedDict()
        for param in params:
            if param.name is None:
                param_name = 'anon_param'
            else:
                param_name = param.name
            cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name
            self.param_to_cache[param] = sharedX(param.get_value(borrow=False), name=cache_name)
            cache_updates[self.param_to_cache[param]] = param
            cached = self.param_to_cache[param]
            g = self.param_to_grad_shared[param]
            if lr_scalers is not None and param in lr_scalers:
                scaled_alpha = alpha * lr_scalers[param]
            else:
                scaled_alpha = alpha
            mul = scaled_alpha * g
            diff = cached - mul
            goto_updates[param] = diff
        self._cache_values = function([], updates = cache_updates, mode=self.theano_function_mode, name='BatchGradientDescent._cache_values')
        assert isinstance(param_constrainers, (list, tuple))
        for param_constrainer in param_constrainers:
            param_constrainer(goto_updates)
        self._goto_alpha = function([alpha], updates=goto_updates,
                mode=self.theano_function_mode, name='BatchGradientDescent._goto_alpha')

        norm = T.sqrt(sum([T.sqr(elem).sum() for elem in self.param_to_grad_shared.values()]))
        norm.name = 'BatchGradientDescent.norm'
        normalize_grad_updates = OrderedDict()
        for grad_shared in self.param_to_grad_shared.values():
            normalize_grad_updates[grad_shared] = grad_shared / norm

        # useful for monitoring
        self.ave_grad_size = sharedX(0.)
        self.new_weight = sharedX(1.)
        normalize_grad_updates[self.ave_grad_size] = self.new_weight * norm + (1.-self.new_weight) * self.ave_grad_size

        self._normalize_grad = function([], norm, updates=normalize_grad_updates, mode=self.theano_function_mode,
                name='BatchGradientDescent._normalize_grad')

        if self.conjugate:
            grad_shared = self.param_to_grad_shared.values()

            grad_to_old_grad = OrderedDict()
            for elem in grad_shared:
                grad_to_old_grad[elem] = sharedX(elem.get_value(), 'old_'+elem.name)

            self._store_old_grad = function([norm], updates = OrderedDict([(grad_to_old_grad[g], g * norm)
                for g in grad_to_old_grad]), mode=self.theano_function_mode,
                name='BatchGradientDescent._store_old_grad')

            grad_ordered = list(grad_to_old_grad.keys())
            old_grad_ordered = [ grad_to_old_grad[g] for g in grad_ordered]

            def dot_product(x, y):
                return sum([ (x_elem * y_elem).sum() for x_elem, y_elem in safe_zip(x, y) ])

            beta_pr = (dot_product(grad_ordered, grad_ordered) - dot_product(grad_ordered, old_grad_ordered)) / \
                    (1e-7+dot_product(old_grad_ordered, old_grad_ordered))
            assert beta_pr.ndim == 0

            beta = T.maximum(beta_pr, 0.)

            """

            beta_pr is the Polak-Ribiere formula for beta.
            According to wikipedia, the beta to use for NCG is "a matter of heuristics or taste"
            but max(0, beta_pr) is "a popular choice... which provides direction reset automatically."
            (ie, it is meant to revert to steepest descent when you have traveled far enough that
            the objective function is behaving non-quadratically enough that the conjugate gradient
            formulas aren't working anymore)

            http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method

            """

            assert grad not in grad_to_old_grad
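            # The updates below replace each stored gradient g with
            # g + beta * old_grad.  Because the descent step later moves
            # params by -alpha * g, this realizes the usual nonlinear-CG
            # rule: new direction = steepest-descent direction
            # + beta * previous direction.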

            make_conjugate_updates = [(g, g + beta * grad_to_old_grad[g]) for g in grad_ordered]

            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                for v, u in make_conjugate_updates:
                    mode.record.handle_line('BatchGradientDescent._make_conjugate var ' \
                            + var_descriptor(v) + '\n')
                    mode.record.handle_line('BatchGradientDescent._make_conjugate update ' \
                            + var_descriptor(u) + '\n')

            self._make_conjugate = function([], updates=make_conjugate_updates,
                    mode=self.theano_function_mode, name='BatchGradientDescent._make_conjugate')

            if mode is not None and hasattr(mode, 'record'):
                for output in self._make_conjugate.maker.fgraph.outputs:
                    mode.record.handle_line('BatchGradientDescent._make_conjugate output ' \
                            + var_descriptor(output) + '\n')


        if tol is None:
            if objective.dtype == "float32":
                self.tol = 1e-6
            else:
                self.tol = 3e-7
        else:
            self.tol = tol

        self.ave_step_size = sharedX(0.)
        self.ave_grad_mult = sharedX(0.)
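A minimal usage sketch follows (illustrative, not taken from the source): it builds a small quadratic objective over one shared variable and minimizes it with the class above. It assumes the import paths shown in Example no. 5 (pylearn2.utils.sharedX and pylearn2.optimization.batch_gradient_descent.BatchGradientDescent); the target values and settings are made up.

import numpy as np
import theano.tensor as T

from pylearn2.utils import sharedX
from pylearn2.optimization.batch_gradient_descent import BatchGradientDescent

# One optimization variable, initialized away from the optimum.
w = sharedX(np.zeros(3), name='w')
target = np.asarray([1., -2., 0.5], dtype=w.dtype)

# Quadratic objective ||w - target||^2, so conjugate directions are exact.
objective = T.sqr(w - target).sum()

minimizer = BatchGradientDescent(objective=objective,
                                 params=[w],
                                 inputs=None,
                                 conjugate=True,
                                 line_search_mode='exhaustive',
                                 verbose=1)

# ``minimize`` takes values for ``inputs``; this objective has none.
minimizer.minimize()
# w.get_value() should now be close to target.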
Example no. 5
    def setup_impl(self, model, dataset, algorithm):
        cost = algorithm.cost

        root = model.get_param_vector()

        dim = root.size

        rng = self.rng


        points = rng.randn(self.num_points, self.num_basis_vectors)
        points = points.astype(root.dtype)
        points *= self.scale

        if self.include_root:
            points[0, :] = 0.
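            # An all-zero coordinate row maps back to the unperturbed root
            # parameters, so the current parameter vector itself is one of
            # the evaluated points.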

        if not hasattr(self, 'cost_fn'):
            # Boilerplate needed to evaluate the cost function: set up the
            # data_specs machinery that the training algorithm normally
            # handles.
            # =======================================
            data_specs = cost.get_data_specs(model)
            mapping = DataSpecsMapping(data_specs)
            space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
            source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

            # Build a flat tuple of Theano Variables, one for each space.
            # We want that so that if the same space/source is specified
            # more than once in data_specs, only one Theano Variable
            # is generated for it, and the corresponding value is passed
            # only once to the compiled Theano function.
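            # For a typical supervised cost these flatten to something like
            # (VectorSpace(784), VectorSpace(10)) paired with the sources
            # ('features', 'targets'); the exact tuples depend on the cost
            # and the model.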
            theano_args = []
            for space, source in safe_zip(space_tuple, source_tuple):
                name = '%s[%s]' % (self.__class__.__name__, source)
                arg = space.make_theano_batch(name=name,
                                              batch_size=self.batch_size)
                theano_args.append(arg)
            theano_args = tuple(theano_args)

            # Methods of `cost` need args to be passed in a format compatible
            # with data_specs
            nested_args = mapping.nest(theano_args)
            fixed_var_descr = cost.get_fixed_var_descr(model, nested_args)
            self.on_load_batch = fixed_var_descr.on_load_batch

            cost_value = cost.expr(model, nested_args,
                                   **fixed_var_descr.fixed_vars)
            # End of data_specs boilerplate
            # ======================

            print "Compiling cost function..."
            cost_fn = function(theano_args, cost_value)
            self.cost_fn = cost_fn
        else:
            cost_fn = self.cost_fn

        cost_values = np.zeros(self.num_points)


        data = list(dataset.get_batch_design(self.batch_size,
            include_labels=True))
        from pylearn2.utils.one_hot import one_hot
        data[1] = one_hot(data[1])


        if self.method == 'gaussian':
            # rng.randn draws the full (dim, num_basis_vectors) matrix;
            # rng.normal(dim, self.num_basis_vectors) would instead return a
            # single scalar sample with those arguments as mean and std.
            basis = rng.randn(dim, self.num_basis_vectors).astype(root.dtype)
        elif self.method == 'element':
            basis = np.zeros((dim, self.num_basis_vectors)).astype(root.dtype)
            for i in xrange(self.num_basis_vectors):
                basis[rng.randint(dim), i] = 1.
        elif self.method == 'gradient':
            if not hasattr(self, 'grad_fn'):
                self.grad_fn = function(theano_args, grad(cost_value, model.get_params()))
            grad_fn = self.grad_fn

            basis = np.zeros((dim, self.num_basis_vectors)).astype(root.dtype)
            for i in xrange(self.num_basis_vectors):
                ipt = list(dataset.get_batch_design(1, include_labels=True))
                label = ipt[1]
                assert label.size == 1
                label = label[0]
                # 10 classes are hard-coded here; use a distinct local name
                # so the imported one_hot helper is not shadowed.
                label_one_hot = np.zeros((1, 10), dtype='float32')
                label_one_hot[0, label] = 1
                ipt[1] = label_one_hot
                g = grad_fn(*ipt)
                basis[:,i] = np.concatenate([e.reshape(e.size) for e in g], axis=0)
        else:
            assert False

        basis /= np.sqrt(np.square(basis).sum(axis=0))

        # Orthogonalize the basis with Gram-Schmidt. Each column is projected
        # against all previously processed columns (xrange(i), not
        # xrange(i - 1), which would skip the immediately preceding column)
        # and then renormalized.
        for i in xrange(self.num_basis_vectors):
            v = basis[:, i].copy()
            for j in xrange(i):
                u = basis[:, j].copy()
                v -= np.dot(u, v) * u
            norm = np.sqrt(np.square(v).sum())
            assert norm > 1e-4
            v /= norm
            basis[:, i] = v


        for i in xrange(self.num_points):
            print "Evaluating cost at point ", i

            point = points[i, :]
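            # Map the subspace coordinates back to the full parameter space:
            # full_point = root + basis . point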
            full_point = root + np.dot(basis, point)
            model.set_param_vector(full_point)

            cost_values[i] = cost_fn(*data)
            print cost_values[i]


        from pylearn2.utils import sharedX
        import theano.tensor as T

        print "!!!!!!!! FITTING THE QUADRATIC FUNCTION !!!!!!!!!!!!!!!!!!!"

        if not hasattr(self, 'fit_quad'):
            points = sharedX(points)
            #from theano import config
            #config.compute_test_value = 'raise'
            cost_values = sharedX(cost_values)
            A = sharedX(np.zeros((self.num_basis_vectors, self.num_basis_vectors)))
            if self.psd:
                mat = T.dot(A.T, A)
            else:
                mat = A
            b = sharedX(np.zeros(self.num_basis_vectors))
            c = sharedX(0.)
            half_quad = T.dot(points, mat)
            quad = (points * half_quad).sum(axis=1)
            lin = T.dot(points, b)
            pred = quad + lin + c
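            # Quadratic surrogate of the sampled costs over the subspace:
            #     pred_i = p_i^T M p_i + b^T p_i + c
            # where p_i is row i of points and M = A^T A when self.psd.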

            from pylearn2.optimization.batch_gradient_descent import BatchGradientDescent

            mse = T.square(pred - cost_values).mean()
            mae = abs(pred - cost_values).mean()

            obj = locals()[self.fitting_cost]

            fit_quad = BatchGradientDescent(obj, params = [A, b, c],
                    max_iter = self.num_basis_vectors ** 2,
                    verbose = 3, tol = None,
                    init_alpha = None, min_init_alpha = 1e-7,
                    reset_alpha = False, conjugate = True,
                    reset_conjugate = False,
                    line_search_mode = 'exhaustive')
            self.fit_quad = fit_quad
            self.A = A
            self.b = b
            self.c = c
            self.points = points
            self.cost_values = cost_values
        else:
            self.A.set_value(.001 * np.identity(self.A.get_value().shape[0], dtype=self.A.dtype))
            self.b.set_value(self.b.get_value() * 0.)
            self.c.set_value(self.c.get_value() * 0.)
            self.points.set_value(points)
            self.cost_values.set_value(cost_values.astype(self.cost_values.dtype))

        self.fit_quad.minimize()

        print "!!!!!!!!!!!!! FINDING ITS MINIMUM !!!!!!!!!!!!!!!!!!!!!!!!!!!"

        if self.use_solver:
            if self.psd:
                Av = self.A.get_value()
                mat_v = np.dot(Av.T, Av)
            else:
                mat_v = self.A.get_value()
            bv = self.b.get_value()

            # minimize for x^T A x + b^T x + c
            # -> solve 2 A x + b = 0
            # Ax = - b / 2
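            # For symmetric M (M = A^T A in the psd case) the gradient of
            # x^T M x + b^T x + c is 2 M x + b, so the minimizer solves
            # M x = -b / 2; that is the least-squares system set up below.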

            print "********** mat_v", mat_v.min(), mat_v.max()
            x, ignored_residuals, ignored_rank, ignored_singular_values = np.linalg.lstsq(mat_v, - 0.5 * bv)
            print "********** soln: ", x.min(), x.mean(), x.max()
            print "********** SVs: ", ignored_singular_values.min(), ignored_singular_values.max()
            assert x.ndim == 1, x.shape
            prod = np.dot(basis, x)
            norm = np.sqrt(np.square(prod).sum())
            print "*************** Moving params by ",norm
            vector = root + prod
            model.set_param_vector(vector)

        else: # use minimizer
            if not hasattr(self, 'fit_params'):
                self.vector = sharedX(points.get_value().mean(axis=0))
                vector = self.vector
                obj = T.dot(T.dot(mat, vector), vector) + T.dot(b, vector)

                def constrain(d):
                    assert vector in d
                    n = d[vector]
                    norm = T.sqrt(T.square(n).sum())
                    desired_norm = T.clip(norm, 0., self.max_jump_norm)
                    d[vector] = n * desired_norm / norm

                self.fit_params = BatchGradientDescent(obj, params=[vector],
                    max_iter = self.num_basis_vectors,
                    verbose = 3, tol=None,
                    param_constrainers = [constrain],
                    init_alpha = None, min_init_alpha = 1e-3,
                    reset_alpha=False, conjugate=True, reset_conjugate=False,
                    line_search_mode='exhaustive')
            else:
                self.vector.set_value(points.mean(axis=0).astype(self.vector.dtype))

            self.fit_params.minimize()

            model.set_param_vector(root + np.dot(basis, self.vector.get_value()))
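
A closing side note, not part of the original extension: the quadratic surrogate pred = p^T M p + b^T p + c fitted above with BatchGradientDescent can also be recovered in closed form by ordinary least squares over monomial features. The sketch below is plain numpy with made-up toy data; the helper name fit_quadratic_surrogate is hypothetical.

import numpy as np


def fit_quadratic_surrogate(points, cost_values):
    """Closed-form least-squares fit of c + b.p + p.M.p to sampled costs.

    points: (num_points, k) array of subspace coordinates.
    cost_values: (num_points,) array of sampled costs.
    Returns (M, b, c) with M symmetric.
    """
    n, k = points.shape
    # Design matrix: a constant column, the linear terms p_j, and the
    # quadratic monomials p_j * p_l for j <= l.
    cols = [np.ones(n)]
    cols += [points[:, j] for j in range(k)]
    idx = [(j, l) for j in range(k) for l in range(j, k)]
    cols += [points[:, j] * points[:, l] for j, l in idx]
    X = np.column_stack(cols)
    coef = np.linalg.lstsq(X, cost_values)[0]
    c = coef[0]
    b = coef[1:1 + k]
    M = np.zeros((k, k))
    for (j, l), w in zip(idx, coef[1 + k:]):
        # Split each monomial's weight across (j, l) and (l, j) so that
        # p.M.p reproduces it and M stays symmetric.
        M[j, l] += w / 2.
        M[l, j] += w / 2.
    return M, b, c


# Toy self-check against a known quadratic (values chosen arbitrarily).
rng = np.random.RandomState(0)
P = rng.randn(50, 3)
M_true = np.diag([1., 2., 3.])
b_true = np.array([0.5, -1., 2.])
y = (P.dot(M_true) * P).sum(axis=1) + P.dot(b_true) + 4.
M_est, b_est, c_est = fit_quadratic_surrogate(P, y)
assert np.allclose(M_est, M_true, atol=1e-6)
assert np.allclose(b_est, b_true, atol=1e-6)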