def compute_gradients(self, outer_objective, optimizer_dict, hyper_list=None):
        """
        Build the graph operations for the implicit-differentiation hypergradient.

        The hyperparameter list is first resolved by the parent class. Then, inside a
        variable scope named after the outer objective's op, the method:

        * vectorizes the gradient of the outer objective w.r.t. the optimizer state (``g1``);
        * creates ``q`` from ``g1`` through ``self._create_q``;
        * builds ``obj``, the norm of ``d<grad_inner, q>/d state - g1`` (the residual of the
          linear system solved for ``q``; presumably ``q`` does not depend on the state, so
          the first term is a Hessian-vector product -- TODO confirm);
        * appends to ``self._lin_sys`` a callable that, given a tolerance, invokes
          ``self.linear_system_solver(obj, [q], tolerance)``;
        * appends one hypergradient per hyperparameter to ``self._hypergrad_dictionary``.

        :param outer_objective: tensor whose ``op.name`` names the variable scope and whose
                                gradients w.r.t. state and hyperparameters are taken.
        :param optimizer_dict: object exposing ``state`` and ``objective``.
        :param hyper_list: optional hyperparameters, forwarded to the parent implementation.
        :return: the hyperparameter list returned by the parent implementation.
        :raises AssertionError: when a hyperparameter has neither a direct gradient nor a
                                cross-derivative term.
        """
        hyper_list = super(ImplicitHG, self).compute_gradients(outer_objective, optimizer_dict, hyper_list)
        state = list(optimizer_dict.state)

        with tf.variable_scope(outer_objective.op.name):
            g1 = utils.vectorize_all(tf.gradients(outer_objective, state))
            grads_inner_obj_vec = utils.vectorize_all(tf.gradients(optimizer_dict.objective, state))

            q = self._create_q(g1)
            obj = tf.norm(
                utils.vectorize_all(tf.gradients(utils.dot(grads_inner_obj_vec, q), state)) - g1
            )  # using the norm seems to produce better results than the squared norm
            # (even though it is more costly)

            self._lin_sys.append(lambda _tolerance: self.linear_system_solver(obj, [q], _tolerance))

            g2s = tf.gradients(outer_objective, hyper_list)
            cross_ders = tf.gradients(utils.dot(grads_inner_obj_vec, q), hyper_list)
            for g2, cd, hyper in zip(g2s, cross_ders, hyper_list):
                assert g2 is not None or cd is not None, HyperGradient._ERROR_HYPER_DETACHED.format(hyper)
                # tf.gradients yields None for unconnected inputs; negating None would raise
                # a TypeError, so the cross-derivative term is only used when it exists.
                hg = None if cd is None else utils.maybe_add(-cd, g2)
                if hg is None:  # only the direct term g2 is available
                    # Parenthesized so that .format fills the placeholder of the whole message.
                    print(('WARNING, outer objective is only directly dependent on hyperparameter {}. '
                           'Direct optimization would be better!').format(hyper))
                    hg = g2
                self._hypergrad_dictionary[hyper].append(hg)

            return hyper_list
Exemple #2
0
    def compute_gradients(self,
                          outer_objective,
                          optimizer_dict,
                          hyper_list=None):
        """
        Build the graph operations for forward-mode hypergradient computation.

        For every (scalar) hyperparameter the method creates the ``z`` variables through
        ``ForwardHG._create_z``, an op that updates them (grouped into ``self._z_iter``),
        their initializer (grouped into ``self._forward_initializer``) and the
        hypergradient tensor, appended to ``self._hypergrad_dictionary[hyp]``.

        :param outer_objective: tensor whose ``op.name`` names the variable scope and which
                                is differentiated w.r.t. state and hyperparameters.
        :param optimizer_dict: object exposing ``state``, ``dynamics`` and ``init_dynamics``.
        :param hyper_list: optional hyperparameters, forwarded to the parent implementation.
        :return: the hyperparameter list returned by the parent implementation; detached
                 hyperparameters are removed from it when ``RAISE_ERROR_ON_DETACHED`` is false.
        :raises AssertionError: if a hyperparameter is not rank 0, or if it is detached and
                                ``RAISE_ERROR_ON_DETACHED`` is true.
        """
        hyper_list = super(ForwardHG,
                           self).compute_gradients(outer_objective,
                                                   optimizer_dict, hyper_list)

        with tf.variable_scope(outer_objective.op.name):
            # No vectorization of the dynamics here: working on the lists of tensors
            # directly avoids many reshaping operations.
            d_oo_d_state = tf.gradients(outer_objective, optimizer_dict.state)

            with tf.name_scope(
                    'DUMMY'):  # variables to compute forward propagation
                # TODO avoid this computation if optimizer_dict has already been seen.
                aux_v = [tf.zeros_like(v) for v in optimizer_dict.state]
                # presumably sum_i <dynamics_i, aux_v_i> (the old implementation used a dot
                # product of the vectorized lists) -- TODO confirm against reduce_all_sums
                dynamics_dot_aux_v = reduce_all_sums(optimizer_dict.dynamics,
                                                     aux_v)

                # list of jacobians times aux_v, with the same dimensions as the state variables
                der_dynamics_dot_aux_v = tf.gradients(dynamics_dot_aux_v,
                                                      optimizer_dict.state)

                init_dynamics_dot_aux_v = None
                if optimizer_dict.init_dynamics:
                    init_dynamics_dot_aux_v = reduce_all_sums(
                        optimizer_dict.init_dynamics, aux_v)

            # Iterate over a snapshot: detached hyperparameters are removed from
            # hyper_list below, and mutating a list while iterating it skips elements.
            for hyp in list(hyper_list):
                assert hyp.shape.ndims == 0, ForwardHG._HYPER_RANK_ERROR_MESSAGE.format(
                    hyp, hyp.shape.ndims)

                d_init_dyn_d_hyp = None if init_dynamics_dot_aux_v is None else \
                    tf.gradients(init_dynamics_dot_aux_v, hyp)[0]
                d_dyn_d_hyp = tf.gradients(dynamics_dot_aux_v, hyp)[0]
                d_oo_d_hyp = tf.gradients(outer_objective, hyp)[0]

                # A hyperparameter is detached when none of the three derivatives exists.
                hyper_ok = (d_init_dyn_d_hyp is not None or d_dyn_d_hyp is not None
                            or d_oo_d_hyp is not None)
                if RAISE_ERROR_ON_DETACHED:
                    assert hyper_ok, HyperGradient._ERROR_HYPER_DETACHED.format(hyp)
                elif not hyper_ok:
                    print(HyperGradient._ERROR_HYPER_DETACHED.format(hyp),
                          file=sys.stderr)
                    hyper_list.remove(hyp)
                    # Every derivative is None: there is nothing to build for this one.
                    continue

                # UPDATE OF TOTAL DERIVATIVE OF STATE W.R.T. HYPERPARAMETER
                zs = ForwardHG._create_z(
                    optimizer_dict, hyp,
                    None if d_init_dyn_d_hyp is None else tf.gradients(
                        d_init_dyn_d_hyp, aux_v))
                Bs = tf.gradients(d_dyn_d_hyp, aux_v)
                # Differentiating <der_dynamics_dot_aux_v, zs> w.r.t. aux_v (rather than
                # <dynamics, zs> w.r.t. the state) is what gives d Phi_i / d s * z for each i.
                A_dot_zs = tf.gradients(
                    reduce_all_sums(der_dynamics_dot_aux_v, zs), aux_v)

                self.A_dot_zs[hyp] = A_dot_zs

                _z_iter = tf.group(*[
                    z.assign(maybe_add(A_dot_z, B))
                    for z, A_dot_z, B in zip(zs, A_dot_zs, Bs)
                ])
                self._z_iter = tf.group(self._z_iter, _z_iter)

                # HYPERGRADIENT: partial dot products, skipping state variables the outer
                # objective is not connected to, plus the direct derivative term if any.
                d_E_T = [
                    dot(d_oo_d_s, z) for d_oo_d_s, z in zip(d_oo_d_state, zs)
                    if d_oo_d_s is not None and z is not None
                ]
                hg = maybe_add(tf.reduce_sum(d_E_T), d_oo_d_hyp)

                self._hypergrad_dictionary[hyp].append(hg)

                self._forward_initializer = tf.group(
                    self._forward_initializer, tf.variables_initializer(zs))

        return hyper_list
    def compute_gradients(self, outer_objective, optimizer_dict, hyper_list=None):
        """
        Build the graph operations for forward-mode hypergradient computation.

        For every (scalar) hyperparameter the method creates the ``z`` variables through
        ``ForwardHG._create_zs`` (stored in ``self._zs[hyp]``), an op that updates them
        (grouped into ``self._z_iter``), their initializer (grouped into
        ``self._forward_initializer``) and the hypergradient tensor, appended to
        ``self._hypergrad_dictionary[hyp]``.

        :param outer_objective: tensor whose ``op.name`` names the variable scope and which
                                is differentiated w.r.t. state and hyperparameters.
        :param optimizer_dict: object exposing ``state``, ``dynamics`` and ``init_dynamics``.
        :param hyper_list: optional hyperparameters, forwarded to the parent implementation.
        :return: the hyperparameter list returned by the parent implementation; detached
                 hyperparameters are removed from it when ``RAISE_ERROR_ON_DETACHED`` is false.
        :raises AssertionError: if a hyperparameter is not rank 0, or if it is detached and
                                ``RAISE_ERROR_ON_DETACHED`` is true.
        """
        hyper_list = super(ForwardHG, self).compute_gradients(outer_objective, optimizer_dict, hyper_list)

        with tf.variable_scope(outer_objective.op.name):
            # No vectorization of the dynamics here: working on the lists of tensors
            # directly avoids many reshaping operations.
            d_oo_d_state = tf.gradients(outer_objective, list(optimizer_dict.state))

            with tf.name_scope('DUMMY'):  # variables to compute forward propagation
                # TODO avoid this computation if optimizer_dict has already been seen.
                aux_vs = [tf.zeros_like(v) for v in optimizer_dict.state]
                dynamics_dot_aux_v = reduce_all_sums(list(optimizer_dict.dynamics), aux_vs)

                # list of jacobians times aux_vs, with the same dimensions as the state variables
                der_dynamics_dot_aux_v = tf.gradients(dynamics_dot_aux_v, list(optimizer_dict.state))

                init_dynamics_dot_aux_v = None
                if optimizer_dict.init_dynamics:
                    init_dynamics_dot_aux_v = reduce_all_sums(
                        optimizer_dict.init_dynamics, aux_vs)

            # Iterate over a snapshot: detached hyperparameters are removed from
            # hyper_list below, and mutating a list while iterating it skips elements.
            for hyp in list(hyper_list):
                assert hyp.shape.ndims == 0, ForwardHG._HYPER_RANK_ERROR_MESSAGE.format(hyp, hyp.shape.ndims)

                d_init_dyn_d_hyp = None if init_dynamics_dot_aux_v is None else \
                    tf.gradients(init_dynamics_dot_aux_v, hyp)[0]
                d_dyn_d_hyp = tf.gradients(dynamics_dot_aux_v, hyp)[0]
                d_oo_d_hyp = tf.gradients(outer_objective, hyp)[0]

                # ------------------------------------------------------------
                # check detached hyperparameters (for which hypergradient would be always null)
                hyper_ok = d_init_dyn_d_hyp is not None or d_dyn_d_hyp is not None or d_oo_d_hyp is not None
                if RAISE_ERROR_ON_DETACHED:
                    assert hyper_ok, HyperGradient._ERROR_HYPER_DETACHED.format(hyp)
                elif not hyper_ok:
                    print(HyperGradient._ERROR_HYPER_DETACHED.format(hyp), file=sys.stderr)
                    hyper_list.remove(hyp)
                    # Every derivative is None: there is nothing to build for this one.
                    continue
                # -------------------------------------------------------------

                # UPDATE OF TOTAL DERIVATIVE OF STATE W.R.T. HYPERPARAMETER
                zs = ForwardHG._create_zs(
                    optimizer_dict, hyp, None if d_init_dyn_d_hyp is None else tf.gradients(d_init_dyn_d_hyp, aux_vs)
                )  # this is one z for each variable
                self._zs[hyp] = zs  # store a reference for the total derivatives for easy access
                Bs = tf.gradients(d_dyn_d_hyp, aux_vs)

                A_dot_zs = tf.gradients(reduce_all_sums(der_dynamics_dot_aux_v, zs), aux_vs)

                self.A_dot_zs[hyp] = A_dot_zs

                _z_iter = tf.group(*[
                    z.assign(maybe_add(A_dot_z, B)) for z, A_dot_z, B
                    in zip(zs, A_dot_zs, Bs)
                ])
                self._z_iter = tf.group(self._z_iter, _z_iter)

                # -- HYPERGRADIENT -----
                # list of dot products, skipping state variables the outer objective is
                # not connected to
                d_E_T = [dot(d_oo_d_s, z) for d_oo_d_s, z in zip(d_oo_d_state, zs)
                         if d_oo_d_s is not None and z is not None]
                # sum the partial dot products and possibly add the ''direct derivative''
                # term d(E( . , \lambda))/d \lambda
                hg = maybe_add(tf.reduce_sum(d_E_T), d_oo_d_hyp)

                self._hypergrad_dictionary[hyp].append(hg)
                self._forward_initializer = tf.group(self._forward_initializer,
                                                     tf.variables_initializer(zs))
        return hyper_list
    def compute_gradients(self, outer_objective, optimizer_dict, hyper_list=None):
        """
        Build the graph operations for forward-mode hypergradient computation.

        For every (scalar) hyperparameter the method creates the ``z`` variables through
        ``ForwardHG._create_z``, an op that updates them (grouped into ``self._z_iter``),
        their initializer (grouped into ``self._forward_initializer``) and the
        hypergradient tensor, appended to ``self._hypergrad_dictionary[hyp]``.

        :param outer_objective: tensor whose ``op.name`` names the variable scope and which
                                is differentiated w.r.t. state and hyperparameters.
        :param optimizer_dict: object exposing ``state``, ``dynamics`` and ``init_dynamics``.
        :param hyper_list: optional hyperparameters, forwarded to the parent implementation.
        :return: the hyperparameter list returned by the parent implementation; detached
                 hyperparameters are removed from it when ``RAISE_ERROR_ON_DETACHED`` is false.
        :raises AssertionError: if a hyperparameter is not rank 0, or if it is detached and
                                ``RAISE_ERROR_ON_DETACHED`` is true.
        """
        hyper_list = super().compute_gradients(outer_objective, optimizer_dict, hyper_list)

        with tf.variable_scope(outer_objective.op.name):
            # No vectorization of the dynamics here: working on the lists of tensors
            # directly avoids many reshaping operations.
            d_oo_d_state = tf.gradients(outer_objective, optimizer_dict.state)

            with tf.name_scope('DUMMY'):  # variables to compute forward propagation
                # TODO avoid this computation if optimizer_dict has already been seen.
                aux_v = [tf.zeros_like(v) for v in optimizer_dict.state]
                # presumably sum_i <dynamics_i, aux_v_i> (the old implementation used a dot
                # product of the vectorized lists) -- TODO confirm against reduce_all_sums
                dynamics_dot_aux_v = reduce_all_sums(optimizer_dict.dynamics, aux_v)

                # list of jacobians times aux_v, with the same dimensions as the state variables
                der_dynamics_dot_aux_v = tf.gradients(dynamics_dot_aux_v, optimizer_dict.state)

                init_dynamics_dot_aux_v = None
                if optimizer_dict.init_dynamics:
                    init_dynamics_dot_aux_v = reduce_all_sums(
                        optimizer_dict.init_dynamics, aux_v)

            # Iterate over a snapshot: detached hyperparameters are removed from
            # hyper_list below, and mutating a list while iterating it skips elements.
            for hyp in list(hyper_list):
                assert hyp.shape.ndims == 0, ForwardHG._HYPER_RANK_ERROR_MESSAGE.format(hyp, hyp.shape.ndims)

                d_init_dyn_d_hyp = None if init_dynamics_dot_aux_v is None else \
                    tf.gradients(init_dynamics_dot_aux_v, hyp)[0]
                d_dyn_d_hyp = tf.gradients(dynamics_dot_aux_v, hyp)[0]
                d_oo_d_hyp = tf.gradients(outer_objective, hyp)[0]

                # A hyperparameter is detached when none of the three derivatives exists.
                hyper_ok = d_init_dyn_d_hyp is not None or d_dyn_d_hyp is not None or d_oo_d_hyp is not None
                if RAISE_ERROR_ON_DETACHED:
                    assert hyper_ok, HyperGradient._ERROR_HYPER_DETACHED.format(hyp)
                elif not hyper_ok:
                    print( HyperGradient._ERROR_HYPER_DETACHED.format(hyp), file=sys.stderr)
                    hyper_list.remove(hyp)
                    # Every derivative is None: there is nothing to build for this one.
                    continue

                # UPDATE OF TOTAL DERIVATIVE OF STATE W.R.T. HYPERPARAMETER
                zs = ForwardHG._create_z(
                    optimizer_dict, hyp, None if d_init_dyn_d_hyp is None else tf.gradients(d_init_dyn_d_hyp, aux_v)
                )
                Bs = tf.gradients(d_dyn_d_hyp, aux_v)
                # Differentiating <der_dynamics_dot_aux_v, zs> w.r.t. aux_v (rather than
                # <dynamics, zs> w.r.t. the state) is what gives d Phi_i / d s * z for each i.
                A_dot_zs = tf.gradients(reduce_all_sums(der_dynamics_dot_aux_v, zs), aux_v)

                self.A_dot_zs[hyp] = A_dot_zs

                _z_iter = tf.group(*[
                    z.assign(maybe_add(A_dot_z, B)) for z, A_dot_z, B
                    in zip(zs, A_dot_zs, Bs)
                ])
                self._z_iter = tf.group(self._z_iter, _z_iter)

                # HYPERGRADIENT: partial dot products, skipping state variables the outer
                # objective is not connected to, plus the direct derivative term if any.
                d_E_T = [dot(d_oo_d_s, z) for d_oo_d_s, z in zip(d_oo_d_state, zs)
                         if d_oo_d_s is not None and z is not None]
                hg = maybe_add(tf.reduce_sum(d_E_T), d_oo_d_hyp)

                self._hypergrad_dictionary[hyp].append(hg)

                self._forward_initializer = tf.group(self._forward_initializer,
                                                     tf.variables_initializer(zs))

        return hyper_list