Exemple #1
0
    def compute_gradients(
        self, outer_objective, inner_grad, meta_param=None, param_dict=OrderedDict()
    ):
        """
        Add to the graph the operations used to compute the outer gradient
        through the solution of a linear system.

        An auxiliary variable ``q`` is created with ``self._create_q``. The
        residual ``|| d<grad_state(inner objective), q>/d state - g1 ||`` is
        built, where ``g1`` is the gradient of ``outer_objective`` w.r.t. the
        inner state, and a callable that runs ``self.linear_system_solver`` on
        that residual is appended to ``self._lin_sys``. For every outer
        parameter, ``g2 - cd`` is appended to ``self._hypergrad_dictionary``,
        with ``g2`` the direct gradient of ``outer_objective`` and ``cd`` the
        gradient of ``<grad_state(inner objective), q>`` w.r.t. that parameter.

        :param outer_objective: tensor holding the outer objective; its op name
                                is used as the variable scope
        :param inner_grad: object exposing ``state`` and ``objective``
        :param meta_param: optional list of outer parameters, forwarded to the
                           parent ``compute_gradients``
        :param param_dict: not read by this method

        :return: the outer parameters returned by the parent ``compute_gradients``
        """
        meta_param = super(BOMLOuterGradImplicit, self).compute_gradients(
            outer_objective, inner_grad, meta_param
        )
        state = list(inner_grad.state)

        with tf.variable_scope(outer_objective.op.name):
            g1 = utils.vectorize_all(
                tf.gradients(outer_objective, state)
            )  # Lower Level gradient of UL objective  w.r.t task parameters
            grads_inner_obj_vec = utils.vectorize_all(
                tf.gradients(inner_grad.objective, state)
            )  #  Lower Level gradient of LL objective  w.r.t task parameters

            q = self._create_q(g1)
            # Residual of the linear system in q; the solver registered below
            # receives this residual together with [q].
            obj = tf.norm(
                utils.vectorize_all(
                    tf.gradients(utils.dot(grads_inner_obj_vec, q), state)
                )
                - g1
            )  # using the norm seems to produce better results than squared norm...
            # (even though it is more costly)

            self._lin_sys.append(
                lambda _tolerance: self.linear_system_solver(obj, [q], _tolerance)
            )

            g2s = tf.gradients(outer_objective, meta_param)
            cross_ders = tf.gradients(utils.dot(grads_inner_obj_vec, q), meta_param)
            for g2, cd, hyper in zip(g2s, cross_ders, meta_param):
                assert (
                    g2 is not None or cd is not None
                ), BOMLOuterGrad._ERROR_HYPER_DETACHED.format(hyper)
                # cd may legitimately be None (the assert only requires one of
                # g2 / cd); negating None would raise TypeError, so only
                # combine the two terms when the cross term exists.
                hg = None if cd is None else utils.maybe_add(-cd, g2)
                if hg is None:  # this would be strange...
                    # Format the *whole* message: a bare
                    # `"a {}" + "b".format(x)` formats only the second piece.
                    print(
                        (
                            "WARNING, outer objective is only directly dependent on hyperparameter {}. "
                            "Direct optimization would be better!"
                        ).format(hyper)
                    )
                    hg = g2
                self._hypergrad_dictionary[hyper].append(hg)

            return meta_param
Exemple #2
0
    def compute_gradients(self,
                          outer_objective,
                          inner_grad,
                          meta_param=None,
                          param_dict=OrderedDict()):
        """
        Add to the computational graph the operations for an outer gradient made
        of a first-order term plus a central finite-difference correction.

        For every outer parameter a gradient variable is created with
        ``self._create_outergradient``. The gradient of ``outer_objective``
        w.r.t. the inner state is scaled by ``epsilon = 0.01 / ||gradient||_2``
        and used to build two perturbed copies of ``model.task_parameter``
        (plus / minus the perturbation). The loss is re-evaluated on both copies,
        and ``learning_rate * (right - left) / (2 * epsilon)`` is subtracted from
        the gradient variable through ``self._darts_initializer``.

        :param outer_objective: tensor holding the outer objective; its op name
                                is used as the variable scope
        :param inner_grad: object exposing ``state``
        :param meta_param: optional list of outer parameters, forwarded to the
                           parent ``compute_gradients``
        :param param_dict: read only when ``self._inner_method == 'Aggr'``
                           (keys ``'meta_learner'`` and ``'alpha'``). The
                           experiment, loss function and learning rate are read
                           from ``self.param_dict`` instead.

        :return: the outer parameters returned by the parent ``compute_gradients``
        """
        meta_param = super(BOMLOuterGradDarts,
                           self).compute_gradients(outer_objective, inner_grad,
                                                   meta_param)

        def blend_with_outer(inner_grads, outer_grads):
            # In-place mix of the two gradient lists, entry by entry:
            #   both present -> (1 - alpha) * inner + alpha * outer
            #   only outer   -> alpha * outer
            #   outer absent -> entry left untouched
            for idx, (inner_g, outer_g) in enumerate(
                    zip(inner_grads, outer_grads)):
                if outer_g is None:
                    continue
                if inner_g is None:
                    inner_grads[idx] = param_dict['alpha'] * outer_g
                else:
                    inner_grads[idx] = (
                        1 - param_dict['alpha']
                    ) * inner_g + param_dict['alpha'] * outer_g

        with tf.variable_scope(outer_objective.op.name):

            ex = self.param_dict["experiment"]
            model = ex.model
            loss_func = self.param_dict["loss_func"]

            # first-order term: one gradient variable per outer parameter
            grads_outer = [
                self._create_outergradient(outer_objective, hyper)
                for hyper in meta_param
            ]

            # gradient of the outer objective w.r.t. the task parameters
            darts_derivatives = tf.gradients(outer_objective,
                                             list(inner_grad.state))

            # perturbation = epsilon * gradient, epsilon = 0.01 / ||gradient||_2
            darts_vector = tf.concat(
                axis=0, values=utils.vectorize_all(darts_derivatives))
            epsilon = 0.01 / tf.norm(tensor=darts_vector, ord=2)
            self.epsilon = epsilon
            # NOTE(review): this used to scale by `self.Epsilon`, an attribute
            # this method never assigns, while the finite difference below
            # divides by `self.epsilon`. Both now use the value computed above.
            darts_derivatives = [
                epsilon * darts_derivative
                for darts_derivative in darts_derivatives
            ]
            fin_diff_part = self._create_darts_derivatives(
                var_list=inner_grad.state, darts_derivatives=darts_derivatives)
            self._diff_initializer = tf.group(
                self._diff_initializer,
                tf.variables_initializer(fin_diff_part),
                tf.variables_initializer(grads_outer),
            )

            # task parameters shifted by +perturbation / -perturbation
            right_diff_0 = dict(
                zip(
                    model.task_parameter.keys(),
                    [
                        tf.add(state, fin_diff) for state, fin_diff in zip(
                            model.task_parameter.values(), fin_diff_part)
                    ],
                ))
            left_diff_0 = dict(
                zip(
                    model.task_parameter.keys(),
                    [
                        tf.subtract(state, fin_diff)
                        for state, fin_diff in zip(
                            model.task_parameter.values(), fin_diff_part)
                    ],
                ))

            # gradients w.r.t. the outer parameters of the loss evaluated at
            # the two perturbed task-parameter sets
            left_diff = tf.gradients(
                loss_func(
                    pred=model.re_forward(task_parameter=left_diff_0).out,
                    label=ex.y),
                xs=meta_param,
            )

            right_diff = tf.gradients(
                loss_func(
                    pred=model.re_forward(task_parameter=right_diff_0).out,
                    label=ex.y),
                xs=meta_param,
            )

            if self._inner_method == 'Aggr':
                # Same two evaluations, but on the meta-learner features of
                # ex.x_ with labels ex.y_, blended into the lists above.
                left_diff_outer = tf.gradients(
                    loss_func(pred=model.re_forward(
                        new_input=param_dict['meta_learner'].re_forward(
                            ex.x_).out,
                        task_parameter=left_diff_0).out,
                              label=ex.y_),
                    xs=meta_param,
                )
                blend_with_outer(left_diff, left_diff_outer)

                right_diff_outer = tf.gradients(
                    loss_func(pred=model.re_forward(
                        new_input=param_dict['meta_learner'].re_forward(
                            ex.x_).out,
                        task_parameter=right_diff_0).out,
                              label=ex.y_),
                    xs=meta_param,
                )
                blend_with_outer(right_diff, right_diff_outer)

            # compute the second-order part and subtract it from the
            # first-order term
            for grad_outer, left_dif, right_dif in zip(grads_outer, left_diff,
                                                       right_diff):
                if right_dif is not None and left_dif is not None:
                    grad_param = tf.divide(tf.subtract(right_dif, left_dif),
                                           2 * epsilon)
                    meta_grad = self.param_dict["learning_rate"] * grad_param
                    self._darts_initializer = tf.group(
                        self._darts_initializer,
                        grad_outer.assign_sub(meta_grad))

            for h, doo_dh in zip(meta_param, grads_outer):
                # Report the detached parameter itself; doo_dh is None whenever
                # this assertion fires.
                assert doo_dh is not None, BOMLOuterGrad._ERROR_HYPER_DETACHED.format(
                    h)
                self._outer_grads_dict[h].append(doo_dh)
            return meta_param
Exemple #3
0
    def compute_gradients(self,
                          outer_objective,
                          inner_grad,
                          meta_param=None,
                          param_dict=OrderedDict()):
        """
        Add to the computational graph the operations of one backward step used
        to accumulate the outer gradients.

        Multipliers are created with ``self._create_lagrangian_multipliers``
        from the gradient of ``outer_objective`` w.r.t. the inner state. Their
        dot product with the vectorized ``inner_grad.dynamics`` is
        differentiated w.r.t. the outer parameters (added to per-parameter
        gradient variables) and w.r.t. the state (assigned back to the
        multipliers, only after the gradient variables have been updated).

        :param outer_objective: A loss function for the outer parameters (scalar tensor)
        :param inner_grad: object exposing ``state`` and ``dynamics``
        :param meta_param: Optional list of outer parameters, forwarded to the
                           parent ``compute_gradients``
        :param param_dict: not read by this method

        :return: list of outer parameters involved in the computation
        """
        meta_param = super(BOMLOuterGradReverse, self).compute_gradients(
            outer_objective, inner_grad, meta_param)

        with tf.variable_scope(outer_objective.op.name):
            outer_wrt_state = tf.gradients(outer_objective,
                                           list(inner_grad.state))
            multipliers = self._create_lagrangian_multipliers(
                inner_grad, outer_wrt_state)

            lagrangian = utils.dot(
                utils.vectorize_all(multipliers),
                utils.vectorize_all(list(inner_grad.dynamics)),
                name="iter_wise_lagrangian_part1",
            )

            lagrangian_wrt_meta = tf.gradients(lagrangian, meta_param)

            accumulators = []
            accumulate_op = tf.no_op()
            for meta_grad, hyper in zip(lagrangian_wrt_meta, meta_param):
                assert meta_grad is not None, BOMLOuterGrad._ERROR_HYPER_DETACHED.format(
                    hyper)
                if meta_grad is None:
                    # reachable only when assertions are disabled: keep a
                    # None placeholder aligned with meta_param
                    accumulators.append(None)
                    continue
                accumulator = self._create_outergradient(outer_objective,
                                                         hyper)
                accumulate_op = tf.group(accumulate_op,
                                         accumulator.assign_add(meta_grad))
                accumulators.append(accumulator)

            # the outer-gradient accumulation must run before the multipliers
            # are overwritten
            with tf.control_dependencies([accumulate_op]):
                lagrangian_wrt_state = tf.gradients(lagrangian,
                                                    list(inner_grad.state))
                multiplier_updates = [
                    multiplier.assign(state_grad)
                    for multiplier, state_grad in zip(multipliers,
                                                      lagrangian_wrt_state)
                ]
                backward_step = tf.group(*multiplier_updates)
            # chain this backward step with the ones already registered
            self._alpha_iter = tf.group(self._alpha_iter, backward_step)

            for hyper, accumulator in zip(meta_param, accumulators):
                self._hypergrad_dictionary[hyper].append(accumulator)

            # None placeholders have no initializer and are skipped
            initializable = [
                accumulator for accumulator in accumulators
                if hasattr(accumulator, "initializer")
            ]
            self._reverse_initializer = tf.group(
                self._reverse_initializer,
                tf.variables_initializer(multipliers),
                tf.variables_initializer(initializable),
            )
            return meta_param
Exemple #4
0
    def compute_gradients(self,
                          outer_objective,
                          inner_grad,
                          meta_param=None,
                          param_dict=OrderedDict()):
        """
        Register in the graph the operations that accumulate the outer
        gradients along the inner dynamics.

        The steps, in the order the operations are created:

        1. differentiate ``outer_objective`` w.r.t. ``inner_grad.state`` and
           create the multipliers via ``self._create_lagrangian_multipliers``;
        2. form the dot product between the vectorized multipliers and the
           vectorized ``inner_grad.dynamics``;
        3. add the gradient of that product w.r.t. each outer parameter to a
           variable created by ``self._create_outergradient``;
        4. once step 3 has run, assign the gradient of the product w.r.t. the
           state back to the multipliers.

        :param outer_objective: A loss function for the outer parameters (scalar tensor)
        :param inner_grad: object exposing ``state`` and ``dynamics``
        :param meta_param: Optional list of outer parameters, forwarded to the
                           parent ``compute_gradients``
        :param param_dict: not read by this method

        :return: list of outer parameters involved in the computation
        """
        meta_param = super(BOMLOuterGradReverse, self).compute_gradients(
            outer_objective, inner_grad, meta_param)

        scope_name = outer_objective.op.name
        with tf.variable_scope(scope_name):
            grads_outer_state = tf.gradients(outer_objective,
                                             list(inner_grad.state))
            alphas = self._create_lagrangian_multipliers(
                inner_grad, grads_outer_state)

            flat_alphas = utils.vectorize_all(alphas)
            flat_dynamics = utils.vectorize_all(list(inner_grad.dynamics))
            lagrangian_term = utils.dot(flat_alphas,
                                        flat_dynamics,
                                        name="iter_wise_lagrangian_part1")

            grads_lagrangian_meta = tf.gradients(lagrangian_term, meta_param)

            grad_vars = []
            grad_update = tf.no_op()
            for grad_meta, hyper in zip(grads_lagrangian_meta, meta_param):
                assert grad_meta is not None, BOMLOuterGrad._ERROR_HYPER_DETACHED.format(
                    hyper)
                grad_var = None
                if grad_meta is not None:
                    grad_var = self._create_outergradient(
                        outer_objective, hyper)
                    increment = grad_var.assign_add(grad_meta)
                    grad_update = tf.group(grad_update, increment)
                grad_vars.append(grad_var)

            # first update the outer gradients, then the alphas
            with tf.control_dependencies([grad_update]):
                grads_lagrangian_state = tf.gradients(
                    lagrangian_term, list(inner_grad.state))
                alpha_assignments = []
                for alpha, grad_state in zip(alphas, grads_lagrangian_state):
                    alpha_assignments.append(alpha.assign(grad_state))
                alpha_step = tf.group(*alpha_assignments)
            # put all the backward iterations together
            self._alpha_iter = tf.group(self._alpha_iter, alpha_step)

            for hyper, grad_var in zip(meta_param, grad_vars):
                self._outer_grads_dict[hyper].append(grad_var)

            # entries without an `initializer` attribute (e.g. None) are skipped
            vars_to_init = [
                grad_var for grad_var in grad_vars
                if hasattr(grad_var, "initializer")
            ]
            self._reverse_initializer = tf.group(
                self._reverse_initializer,
                tf.variables_initializer(alphas),
                tf.variables_initializer(vars_to_init),
            )
            return meta_param