コード例 #1
0
    def compute_gradients(self, outer_objective, optimizer_dict, hyper_list=None):
        """
        Add to the graph the operations needed to compute hypergradients through
        the solution of a linear system.

        A solver closure (taking a tolerance) is appended to `self._lin_sys`, and one
        hypergradient tensor per hyperparameter is appended to `self._hypergrad_dictionary`.

        :param outer_objective: tensor that is differentiated w.r.t. the state and the hyperparameters
        :param optimizer_dict: object exposing `state` and `objective` (the inner objective)
        :param hyper_list: optional list of hyperparameters; it is resolved by the parent class
                            `compute_gradients`

        :return: list of hyperparameters involved in the computation
        :raises AssertionError: if a hyperparameter has neither a direct derivative nor a cross derivative
        """
        hyper_list = super(ImplicitHG, self).compute_gradients(outer_objective, optimizer_dict, hyper_list)
        state = list(optimizer_dict.state)

        with tf.variable_scope(outer_objective.op.name):
            g1 = utils.vectorize_all(tf.gradients(outer_objective, state))
            grads_inner_obj_vec = utils.vectorize_all(tf.gradients(optimizer_dict.objective, state))

            q = self._create_q(g1)
            obj = tf.norm(
                utils.vectorize_all(tf.gradients(utils.dot(grads_inner_obj_vec, q), state)) - g1
            )  # using the norm seems to produce better results than the squared norm...
            # (even though it is more costly)

            self._lin_sys.append(lambda _tolerance: self.linear_system_solver(obj, [q], _tolerance))

            g2s = tf.gradients(outer_objective, hyper_list)
            cross_ders = tf.gradients(utils.dot(grads_inner_obj_vec, q), hyper_list)
            for g2, cd, hyper in zip(g2s, cross_ders, hyper_list):
                assert g2 is not None or cd is not None, HyperGradient._ERROR_HYPER_DETACHED.format(hyper)
                # tf.gradients yields None for unconnected inputs: negating a missing
                # cross derivative would raise a TypeError, so it is handled explicitly.
                hg = None if cd is None else utils.maybe_add(-cd, g2)
                if hg is None:  # this would be strange...
                    # The two literals are joined BEFORE formatting, otherwise the
                    # placeholder in the first one is never filled.
                    print(('WARNING, outer objective is only directly dependent on hyperparameter {}. '
                           'Direct optimization would be better!').format(hyper))
                    hg = g2
                self._hypergrad_dictionary[hyper].append(hg)

            return hyper_list
コード例 #2
0
ファイル: optimizer.py プロジェクト: AmirooR/FAR-HO
    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
        """
        Build the gradient-descent dynamics for each (gradient, variable) pair.

        :param grads_and_vars: iterable of (gradient, variable) pairs
        :param global_step: not used by this implementation
        :param name: name handed to `tf.name_scope`; `self.get_name()` is passed as its second argument

        :return: tuple `(dynamics, m)`. `dynamics` is an OrderedDict mapping each variable to
                 `(gradient, step_fn)`, where `step_fn(eta, w, g)` returns `w - eta * g`; `m` is
                 `0.` minus the accumulated `utils.dot(g, g)` of all gradients.
        """
        super(BackTrackingGradientDescentOptimizer, self)._prepare()

        def _wk(_eta, _w, _g):
            # plain gradient-descent step
            return _w - _eta * _g

        with tf.name_scope(name, self.get_name()):
            dynamics = OrderedDict()
            m = 0.
            for grad, var in grads_and_vars:
                dynamics[var] = (grad, _wk)
                m -= utils.dot(grad, grad)

        return dynamics, m
コード例 #3
0
    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
        """
        Build the gradient-descent dynamics for each (gradient, variable) pair.

        :param grads_and_vars: iterable of (gradient, variable) pairs
        :param global_step: not used by this implementation
        :param name: name handed to `tf.name_scope`; `self.get_name()` is passed as its second argument

        :return: tuple `(dynamics, m)`. `dynamics` is a list of `(variable, gradient, step_fn)`
                 triples, where `step_fn(eta, w, g)` returns `w - eta * g`; `m` is `0.` minus the
                 accumulated `utils.dot(g, g)` of all gradients.
        """
        super(BackTrackingGradientDescentOptimizer, self)._prepare()

        def _wk(_eta, _w, _g):
            # plain gradient-descent step
            return _w - _eta * _g

        with tf.name_scope(name, self.get_name()):
            dynamics = []
            m = 0.
            for grad, var in grads_and_vars:
                dynamics.append((var, grad, _wk))
                m -= utils.dot(grad, grad)

        return dynamics, m
コード例 #4
0
    def compute_gradients(self, outer_objective, optimizer_dict, hyper_list=None):
        """
        Add to the graph the operations needed to compute hypergradients in forward mode.

        For every (scalar) hyperparameter this creates the variables `zs` through
        `ForwardHG._create_zs`, stores them in `self._zs`, groups their update into
        `self._z_iter` and their initializers into `self._forward_initializer`, and appends
        the hypergradient tensor to `self._hypergrad_dictionary`.

        :param outer_objective: tensor that is differentiated w.r.t. the state and the hyperparameters
        :param optimizer_dict: object exposing `state`, `dynamics` and `init_dynamics`
        :param hyper_list: optional list of hyperparameters; it is resolved by the parent class
                            `compute_gradients`

        :return: list of hyperparameters involved in the computation. When
                 `RAISE_ERROR_ON_DETACHED` is false, detached hyperparameters are removed
                 from this list (in place) and skipped.
        :raises AssertionError: if a hyperparameter is not of rank 0, or if it is detached and
                                `RAISE_ERROR_ON_DETACHED` is true
        """
        hyper_list = super(ForwardHG, self).compute_gradients(outer_objective, optimizer_dict, hyper_list)

        with tf.variable_scope(outer_objective.op.name):
            # No vectorization here: the computation works on lists of tensors, which avoids
            # many reshaping operations.
            d_oo_d_state = tf.gradients(outer_objective, list(optimizer_dict.state))

            with tf.name_scope('DUMMY'):  # variables to compute forward propagation
                # TODO avoid this computation if optimizer_dict has already been seen.
                aux_vs = [tf.zeros_like(v) for v in optimizer_dict.state]
                dynamics_dot_aux_v = reduce_all_sums(list(optimizer_dict.dynamics), aux_vs)

                der_dynamics_dot_aux_v = tf.gradients(dynamics_dot_aux_v, list(optimizer_dict.state))
                # this is a list of jacobians times aux_vs that have the same dimension of states variables.

                init_dynamics_dot_aux_v = None
                if optimizer_dict.init_dynamics:
                    init_dynamics_dot_aux_v = reduce_all_sums(
                        optimizer_dict.init_dynamics, aux_vs)

            # Iterate over a snapshot: detached hyperparameters are removed from `hyper_list`
            # inside the loop, and removing from the list being iterated skips elements.
            for hyp in list(hyper_list):
                assert hyp.shape.ndims == 0, ForwardHG._HYPER_RANK_ERROR_MESSAGE.format(hyp, hyp.shape.ndims)

                d_init_dyn_d_hyp = None if init_dynamics_dot_aux_v is None else \
                    tf.gradients(init_dynamics_dot_aux_v, hyp)[0]
                d_dyn_d_hyp = tf.gradients(dynamics_dot_aux_v, hyp)[0]
                d_oo_d_hyp = tf.gradients(outer_objective, hyp)[0]

                # ------------------------------------------------------------
                # check detached hyperparameters (for which hypergradient would be always null)
                hyper_ok = d_init_dyn_d_hyp is not None or d_dyn_d_hyp is not None or d_oo_d_hyp is not None
                if RAISE_ERROR_ON_DETACHED:
                    assert hyper_ok, HyperGradient._ERROR_HYPER_DETACHED.format(hyp)
                elif not hyper_ok:
                    print(HyperGradient._ERROR_HYPER_DETACHED.format(hyp), file=sys.stderr)
                    hyper_list.remove(hyp)
                    # nothing can be built for a detached hyperparameter: every derivative is None
                    continue
                # -------------------------------------------------------------

                # UPDATE OF TOTAL DERIVATIVE OF STATE W.R.T. HYPERPARAMETER
                zs = ForwardHG._create_zs(
                    optimizer_dict, hyp, None if d_init_dyn_d_hyp is None else tf.gradients(d_init_dyn_d_hyp, aux_vs)
                )  # this is one z for each variable
                self._zs[hyp] = zs  # store a reference for the total derivatives for easy access
                Bs = tf.gradients(d_dyn_d_hyp, aux_vs)

                A_dot_zs = tf.gradients(reduce_all_sums(der_dynamics_dot_aux_v, zs), aux_vs)

                self.A_dot_zs[hyp] = A_dot_zs

                _z_iter = tf.group(*[
                    z.assign(maybe_add(A_dot_z, B)) for z, A_dot_z, B
                    in zip(zs, A_dot_zs, Bs)
                ])
                self._z_iter = tf.group(self._z_iter, _z_iter)

                # -- HYPERGRADIENT -----
                d_E_T = [dot(d_oo_d_s, z) for d_oo_d_s, z in zip(d_oo_d_state, zs)
                         if d_oo_d_s is not None and z is not None]  # list of dot products
                # sum the partial dot products and possibly add the "direct derivative"
                # term d(E( . , \lambda))/d \lambda
                hg = maybe_add(tf.reduce_sum(d_E_T), d_oo_d_hyp)

                self._hypergrad_dictionary[hyp].append(hg)
                self._forward_initializer = tf.group(self._forward_initializer,
                                                     tf.variables_initializer(zs))
        return hyper_list
コード例 #5
0
    def compute_gradients(self, outer_objective, optimizer_dict, hyper_list=None):
        """
        Add to the computational graph the operations needed to compute the hypergradients
        in a "dynamic" way, i.e. without unrolling the entire optimization graph.
        Compared with unrolling the optimization dynamics the computation is roughly 2x more
        expensive, but it needs much less (GPU) memory and is more flexible, since a
        termination condition can be set on the parameter optimization routine.

        :param optimizer_dict: OptimizerDict object resulting from the inner objective optimization.
        :param outer_objective: A loss function for the hyperparameters (scalar tensor)
        :param hyper_list: Optional list of hyperparameters to consider. If not provided will get all variables in the
                            hyperparameter collection in the current scope.

        :return: list of hyperparameters involved in the computation
        """
        hyper_list = super(ReverseHG, self).compute_gradients(outer_objective, optimizer_dict, hyper_list)

        # The original author notes that without this variable scope there is a catastrophic failure.
        with tf.variable_scope(outer_objective.op.name):
            # derivative of outer objective w.r.t. state
            d_outer_d_state = tf.gradients(outer_objective, list(optimizer_dict.state))
            alphas = self._create_lagrangian_multipliers(optimizer_dict, d_outer_d_state)

            alpha_vec = utils.vectorize_all(alphas)
            dyn_vec = utils.vectorize_all(list(optimizer_dict.dynamics))
            lag_phi_t = utils.dot(alpha_vec, dyn_vec, name='iter_wise_lagrangian_part1')
            # TODO outer_objective might be a list... handle this case

            # iterative computation of hypergradients
            alpha_dot_B = tf.gradients(lag_phi_t, hyper_list)
            # the initial dynamics (phi_0) contribute only when the optimizer_dict has them
            if optimizer_dict.init_dynamics is None:
                alpha_dot_B0 = [None] * len(hyper_list)
            else:
                init_dyn_vec = utils.vectorize_all([d for (s, d) in optimizer_dict.init_dynamics])
                lag_phi0 = utils.dot(alpha_vec, init_dyn_vec)
                alpha_dot_B0 = tf.gradients(lag_phi0, hyper_list)

            # A None in alpha_dot_B may mean that the hyperparameter appears only inside phi_0:
            # if that is not the case either, an error is raised.
            hyper_grad_vars = []
            hyper_grad_step = tf.no_op()
            for dl_dh, a_d_b0, hyper in zip(alpha_dot_B, alpha_dot_B0, hyper_list):
                assert not (dl_dh is None and a_d_b0 is None), HyperGradient._ERROR_HYPER_DETACHED.format(hyper)
                if dl_dh is None:
                    # only the initial dynamics depend on this hyperparameter:
                    # hyper_grad_step has nothing to do here
                    hgv = a_d_b0
                else:  # "normal hyperparameter"
                    hgv = self._create_hypergradient(outer_objective, hyper)
                    hyper_grad_step = tf.group(hyper_grad_step, hgv.assign_add(dl_dh))
                    if a_d_b0 is not None:
                        hgv = hgv + a_d_b0
                hyper_grad_vars.append(hgv)  # save these...

            # first update the hypergradient, then the alphas
            with tf.control_dependencies([hyper_grad_step]):
                dl_d_state = tf.gradients(lag_phi_t, list(optimizer_dict.state))
                alpha_updates = [alpha.assign(dl_ds) for alpha, dl_ds in zip(alphas, dl_d_state)]
                _alpha_iter = tf.group(*alpha_updates)
            # put all the backward iterations together
            self._alpha_iter = tf.group(self._alpha_iter, _alpha_iter)

            for hyper, hgv in zip(hyper_list, hyper_grad_vars):
                self._hypergrad_dictionary[hyper].append(hgv)

            # some hypergradients (those coming from the initial dynamics) might be plain
            # tensors and not variables, hence the filter on the `initializer` attribute
            initializable = [h for h in hyper_grad_vars if hasattr(h, 'initializer')]
            self._reverse_initializer = tf.group(self._reverse_initializer,
                                                 tf.variables_initializer(alphas),
                                                 tf.variables_initializer(initializable))

            return hyper_list
コード例 #6
0
ファイル: hyper_gradients.py プロジェクト: habibrk/FAR-HO
    def compute_gradients(self,
                          outer_objective,
                          optimizer_dict,
                          hyper_list=None):
        """
        Add to the graph the operations needed to compute hypergradients in forward mode.

        For every (scalar) hyperparameter this creates the variables `zs` through
        `ForwardHG._create_z`, groups their update into `self._z_iter` and their initializers
        into `self._forward_initializer`, and appends the hypergradient tensor to
        `self._hypergrad_dictionary`.

        :param outer_objective: tensor that is differentiated w.r.t. the state and the hyperparameters
        :param optimizer_dict: object exposing `state`, `dynamics` and `init_dynamics`
        :param hyper_list: optional list of hyperparameters; it is resolved by the parent class
                            `compute_gradients`

        :return: list of hyperparameters involved in the computation. When
                 `RAISE_ERROR_ON_DETACHED` is false, detached hyperparameters are removed
                 from this list (in place) and skipped.
        :raises AssertionError: if a hyperparameter is not of rank 0, or if it is detached and
                                `RAISE_ERROR_ON_DETACHED` is true
        """
        hyper_list = super(ForwardHG,
                           self).compute_gradients(outer_objective,
                                                   optimizer_dict, hyper_list)

        with tf.variable_scope(outer_objective.op.name):
            # No vectorization here: the computation works on lists of tensors, which avoids
            # many reshaping operations.
            d_oo_d_state = tf.gradients(outer_objective, optimizer_dict.state)

            with tf.name_scope('DUMMY'):  # variables to compute forward propagation
                # TODO avoid this computation if optimizer_dict has already been seen.
                aux_v = [tf.zeros_like(v) for v in optimizer_dict.state]
                dynamics_dot_aux_v = reduce_all_sums(optimizer_dict.dynamics,
                                                     aux_v)

                der_dynamics_dot_aux_v = tf.gradients(dynamics_dot_aux_v,
                                                      optimizer_dict.state)
                # this is a list of jacobians times aux_v that have the same dimension of states variables.

                init_dynamics_dot_aux_v = None
                if optimizer_dict.init_dynamics:
                    init_dynamics_dot_aux_v = reduce_all_sums(
                        optimizer_dict.init_dynamics, aux_v)

            # Iterate over a snapshot: detached hyperparameters are removed from `hyper_list`
            # inside the loop, and removing from the list being iterated skips elements.
            for hyp in list(hyper_list):
                assert hyp.shape.ndims == 0, ForwardHG._HYPER_RANK_ERROR_MESSAGE.format(
                    hyp, hyp.shape.ndims)

                d_init_dyn_d_hyp = None if init_dynamics_dot_aux_v is None else \
                    tf.gradients(init_dynamics_dot_aux_v, hyp)[0]
                d_dyn_d_hyp = tf.gradients(dynamics_dot_aux_v, hyp)[0]
                d_oo_d_hyp = tf.gradients(outer_objective, hyp)[0]

                # A hyperparameter is detached when none of the three derivatives exists.
                hyper_ok = d_init_dyn_d_hyp is not None or d_dyn_d_hyp is not None or \
                    d_oo_d_hyp is not None
                if RAISE_ERROR_ON_DETACHED:
                    assert hyper_ok, HyperGradient._ERROR_HYPER_DETACHED.format(hyp)
                elif not hyper_ok:
                    # Report and drop ONLY the detached hyperparameters; attached ones must
                    # stay in the list and get their hypergradient.
                    print(HyperGradient._ERROR_HYPER_DETACHED.format(hyp),
                          file=sys.stderr)
                    hyper_list.remove(hyp)
                    continue

                # UPDATE OF TOTAL DERIVATIVE OF STATE W.R.T. HYPERPARAMETER
                zs = ForwardHG._create_z(
                    optimizer_dict, hyp,
                    None if d_init_dyn_d_hyp is None else tf.gradients(
                        d_init_dyn_d_hyp, aux_v))
                Bs = tf.gradients(d_dyn_d_hyp, aux_v)

                # Differentiating dot(dynamics, zs) w.r.t. the state would give
                # d Phi / d w * z for each w instead of d Phi_i / d s * z for each i;
                # hence the derivative is taken w.r.t. the auxiliary variables.
                A_dot_zs = tf.gradients(
                    reduce_all_sums(der_dynamics_dot_aux_v, zs), aux_v)

                self.A_dot_zs[hyp] = A_dot_zs

                _z_iter = tf.group(*[
                    z.assign(maybe_add(A_dot_z, B))
                    for z, A_dot_z, B in zip(zs, A_dot_zs, Bs)
                ])
                self._z_iter = tf.group(self._z_iter, _z_iter)

                # HYPERGRADIENT
                d_E_T = [
                    dot(d_oo_d_s, z) for d_oo_d_s, z in zip(d_oo_d_state, zs)
                    if d_oo_d_s is not None and z is not None
                ]  # list of dot products
                # sum the partial dot products and possibly add the direct derivative term
                hg = maybe_add(tf.reduce_sum(d_E_T), d_oo_d_hyp)

                self._hypergrad_dictionary[hyp].append(hg)

                self._forward_initializer = tf.group(
                    self._forward_initializer, tf.variables_initializer(zs))

        return hyper_list
コード例 #7
0
ファイル: hyper_gradients.py プロジェクト: codealphago/FAR-HO
    def compute_gradients(self, outer_objective, optimizer_dict, hyper_list=None):
        """
        Add to the graph the operations needed to compute hypergradients in forward mode.

        For every (scalar) hyperparameter this creates the variables `zs` through
        `ForwardHG._create_z`, groups their update into `self._z_iter` and their initializers
        into `self._forward_initializer`, and appends the hypergradient tensor to
        `self._hypergrad_dictionary`.

        :param outer_objective: tensor that is differentiated w.r.t. the state and the hyperparameters
        :param optimizer_dict: object exposing `state`, `dynamics` and `init_dynamics`
        :param hyper_list: optional list of hyperparameters; it is resolved by the parent class
                            `compute_gradients`

        :return: list of hyperparameters involved in the computation. When
                 `RAISE_ERROR_ON_DETACHED` is false, detached hyperparameters are removed
                 from this list (in place) and skipped.
        :raises AssertionError: if a hyperparameter is not of rank 0, or if it is detached and
                                `RAISE_ERROR_ON_DETACHED` is true
        """
        hyper_list = super().compute_gradients(outer_objective, optimizer_dict, hyper_list)

        with tf.variable_scope(outer_objective.op.name):
            # No vectorization here: the computation works on lists of tensors, which avoids
            # many reshaping operations.
            d_oo_d_state = tf.gradients(outer_objective, optimizer_dict.state)

            with tf.name_scope('DUMMY'):  # variables to compute forward propagation
                # TODO avoid this computation if optimizer_dict has already been seen.
                aux_v = [tf.zeros_like(v) for v in optimizer_dict.state]
                dynamics_dot_aux_v = reduce_all_sums(optimizer_dict.dynamics, aux_v)

                der_dynamics_dot_aux_v = tf.gradients(dynamics_dot_aux_v, optimizer_dict.state)
                # this is a list of jacobians times aux_v that have the same dimension of states variables.

                init_dynamics_dot_aux_v = None
                if optimizer_dict.init_dynamics:
                    init_dynamics_dot_aux_v = reduce_all_sums(
                        optimizer_dict.init_dynamics, aux_v)

            # Iterate over a snapshot: detached hyperparameters are removed from `hyper_list`
            # inside the loop, and removing from the list being iterated skips elements.
            for hyp in list(hyper_list):
                assert hyp.shape.ndims == 0, ForwardHG._HYPER_RANK_ERROR_MESSAGE.format(hyp, hyp.shape.ndims)

                d_init_dyn_d_hyp = None if init_dynamics_dot_aux_v is None else \
                    tf.gradients(init_dynamics_dot_aux_v, hyp)[0]
                d_dyn_d_hyp = tf.gradients(dynamics_dot_aux_v, hyp)[0]
                d_oo_d_hyp = tf.gradients(outer_objective, hyp)[0]

                # A hyperparameter is detached when none of the three derivatives exists.
                hyper_ok = d_init_dyn_d_hyp is not None or d_dyn_d_hyp is not None or \
                    d_oo_d_hyp is not None
                if RAISE_ERROR_ON_DETACHED:
                    assert hyper_ok, HyperGradient._ERROR_HYPER_DETACHED.format(hyp)
                elif not hyper_ok:
                    # Report and drop ONLY the detached hyperparameters; attached ones must
                    # stay in the list and get their hypergradient.
                    print(HyperGradient._ERROR_HYPER_DETACHED.format(hyp), file=sys.stderr)
                    hyper_list.remove(hyp)
                    continue

                # UPDATE OF TOTAL DERIVATIVE OF STATE W.R.T. HYPERPARAMETER
                zs = ForwardHG._create_z(
                    optimizer_dict, hyp, None if d_init_dyn_d_hyp is None else tf.gradients(d_init_dyn_d_hyp, aux_v)
                )
                Bs = tf.gradients(d_dyn_d_hyp, aux_v)

                # Differentiating dot(dynamics, zs) w.r.t. the state would give
                # d Phi / d w * z for each w instead of d Phi_i / d s * z for each i;
                # hence the derivative is taken w.r.t. the auxiliary variables.
                A_dot_zs = tf.gradients(reduce_all_sums(der_dynamics_dot_aux_v, zs), aux_v)

                self.A_dot_zs[hyp] = A_dot_zs

                _z_iter = tf.group(*[
                    z.assign(maybe_add(A_dot_z, B)) for z, A_dot_z, B
                    in zip(zs, A_dot_zs, Bs)
                ])
                self._z_iter = tf.group(self._z_iter, _z_iter)

                # HYPERGRADIENT
                d_E_T = [dot(d_oo_d_s, z) for d_oo_d_s, z in zip(d_oo_d_state, zs)
                         if d_oo_d_s is not None and z is not None]  # list of dot products
                # sum the partial dot products and possibly add the direct derivative term
                hg = maybe_add(tf.reduce_sum(d_E_T), d_oo_d_hyp)

                self._hypergrad_dictionary[hyp].append(hg)

                self._forward_initializer = tf.group(self._forward_initializer,
                                                     tf.variables_initializer(zs))

        return hyper_list
コード例 #8
0
ファイル: hyper_gradients.py プロジェクト: codealphago/FAR-HO
    def compute_gradients(self, outer_objective, optimizer_dict, hyper_list=None):
        """
        Add to the computational graph the operations needed to compute the hypergradients
        in a "dynamic" way, i.e. without unrolling the entire optimization graph.
        Compared with unrolling the optimization dynamics the computation is roughly 2x more
        expensive, but it needs much less (GPU) memory and is more flexible, since a
        termination condition can be set on the parameter optimization routine.

        :param optimizer_dict: OptimizerDict object resulting from the inner objective optimization.
        :param outer_objective: A loss function for the hyperparameters (scalar tensor)
        :param hyper_list: Optional list of hyperparameters to consider. If not provided will get all variables in the
                            hyperparameter collection in the current scope.

        :return: list of hyperparameters involved in the computation
        """
        hyper_list = super().compute_gradients(outer_objective, optimizer_dict, hyper_list)

        # The original author notes that without this variable scope there is a catastrophic failure.
        with tf.variable_scope(outer_objective.op.name):
            # derivative of outer objective w.r.t. state
            d_outer_d_state = tf.gradients(outer_objective, optimizer_dict.state)
            alphas = self._create_lagrangian_multipliers(optimizer_dict, d_outer_d_state)

            alpha_vec = utils.vectorize_all(alphas)
            dyn_vec = utils.vectorize_all(optimizer_dict.dynamics)
            lag_phi_t = utils.dot(alpha_vec, dyn_vec, name='iter_wise_lagrangian_part1')
            # TODO outer_objective might be a list... handle this case

            # iterative computation of hypergradients
            # (direct) derivative of outer objective w.r.t. the hyperparameters
            doo_dypers = tf.gradients(outer_objective, hyper_list)
            alpha_dot_B = tf.gradients(lag_phi_t, hyper_list)
            # the initial dynamics (phi_0) contribute only when the optimizer_dict has them
            if optimizer_dict.init_dynamics is None:
                alpha_dot_B0 = [None] * len(hyper_list)
            else:
                init_dyn_vec = utils.vectorize_all([d for (s, d) in optimizer_dict.init_dynamics])
                lag_phi0 = utils.dot(alpha_vec, init_dyn_vec)
                alpha_dot_B0 = tf.gradients(lag_phi0, hyper_list)

            # A None in alpha_dot_B may mean that the hyperparameter appears only inside phi_0:
            # if that is not the case either, an error is raised.
            # NOTE(review): the check below does not look at doo_dh - confirm that a hyperparameter
            # reached only through the direct derivative is meant to be rejected.
            hyper_grad_vars = []
            hyper_grad_step = tf.no_op()
            for dl_dh, doo_dh, a_d_b0, hyper in zip(alpha_dot_B, doo_dypers, alpha_dot_B0, hyper_list):
                assert not (dl_dh is None and a_d_b0 is None), HyperGradient._ERROR_HYPER_DETACHED.format(hyper)
                if dl_dh is None:
                    # only the initial dynamics depend on this hyperparameter:
                    # hyper_grad_step has nothing to do here
                    hgv = a_d_b0
                else:  # "normal hyperparameter"
                    hgv = self._create_hypergradient(hyper, doo_dh)
                    hyper_grad_step = tf.group(hyper_grad_step, hgv.assign_add(dl_dh))
                    if a_d_b0 is not None:
                        hgv = hgv + a_d_b0
                hyper_grad_vars.append(hgv)  # save these...

            # first update the hypergradient, then the alphas
            with tf.control_dependencies([hyper_grad_step]):
                dl_d_state = tf.gradients(lag_phi_t, optimizer_dict.state)
                alpha_updates = [alpha.assign(dl_ds) for alpha, dl_ds in zip(alphas, dl_d_state)]
                _alpha_iter = tf.group(*alpha_updates)
            # put all the backward iterations together
            self._alpha_iter = tf.group(self._alpha_iter, _alpha_iter)

            for hyper, hgv in zip(hyper_list, hyper_grad_vars):
                self._hypergrad_dictionary[hyper].append(hgv)

            # some hypergradients (those coming from the initial dynamics) might be plain
            # tensors and not variables, hence the filter on the `initializer` attribute
            initializable = [h for h in hyper_grad_vars if hasattr(h, 'initializer')]
            self._reverse_initializer = tf.group(self._reverse_initializer,
                                                 tf.variables_initializer(alphas),
                                                 tf.variables_initializer(initializable))

            return hyper_list