def compute_gradients(self, outer_objective, optimizer_dict, hyper_list=None):
    """Build graph nodes for the implicit (fixed-point) hypergradient.

    Sets up the residual of the linear system ``(d^2 L / d s^2) q = d E / d s``
    (solved later by ``self.linear_system_solver``) and, for each
    hyperparameter, the hypergradient ``-cross_derivative + direct_gradient``.

    Args:
        outer_objective: scalar tensor of the outer objective ``E``.
        optimizer_dict: inner-optimization bookkeeping object; this method
            reads its ``state`` (iterable of state variables) and
            ``objective`` (inner objective tensor).
        hyper_list: optional list of hyperparameter variables; when ``None``
            the superclass supplies the default collection.

    Returns:
        The list of hyperparameters for which hypergradient nodes were built.
    """
    hyper_list = super(ImplicitHG, self).compute_gradients(outer_objective, optimizer_dict, hyper_list)
    state = list(optimizer_dict.state)

    with tf.variable_scope(outer_objective.op.name):
        # d E / d state and d L / d state, both flattened to vectors.
        g1 = utils.vectorize_all(tf.gradients(outer_objective, state))
        grads_inner_obj_vec = utils.vectorize_all(tf.gradients(optimizer_dict.objective, state))

        q = self._create_q(g1)
        # Residual of the linear system in q. Using the norm seems to produce
        # better results than the squared norm (even though it is more costly).
        obj = tf.norm(
            utils.vectorize_all(tf.gradients(utils.dot(grads_inner_obj_vec, q), state)) - g1
        )

        self._lin_sys.append(lambda _tolerance: self.linear_system_solver(obj, [q], _tolerance))

        g2s = tf.gradients(outer_objective, hyper_list)
        cross_ders = tf.gradients(utils.dot(grads_inner_obj_vec, q), hyper_list)
        for g2, cd, hyper in zip(g2s, cross_ders, hyper_list):
            assert g2 is not None or cd is not None, HyperGradient._ERROR_HYPER_DETACHED.format(hyper)
            hg = utils.maybe_add(-cd, g2)
            if hg is None:  # this would be strange...
                # FIX: previously ``'... {} ' + '...'.format(hyper)`` applied
                # ``format`` only to the second literal (which has no
                # placeholder), so the hyperparameter was never interpolated.
                print('WARNING, outer objective is only directly dependent on hyperparameter {}. '
                      'Direct optimization would be better!'.format(hyper))
                hg = g2
            self._hypergrad_dictionary[hyper].append(hg)

        return hyper_list
def compute_gradients(self, outer_objective, optimizer_dict, hyper_list=None):
    """Build graph nodes for the forward-mode (real-time) hypergradient.

    For every scalar hyperparameter ``hyp`` this creates the total-derivative
    variables ``zs`` (one per state variable), the update
    ``z <- A·z + B`` (propagated via double gradients through dummy ``aux_v``
    tensors), and the hypergradient ``sum_s <dE/ds, z_s> + dE/d hyp``.

    Args:
        outer_objective: scalar tensor of the outer objective ``E``.
        optimizer_dict: inner-optimization bookkeeping; this method reads its
            ``state``, ``dynamics`` and (optionally) ``init_dynamics``.
        hyper_list: optional list of scalar hyperparameter variables; when
            ``None`` the superclass supplies the default collection.

    Returns:
        The list of hyperparameters, with detached ones removed when
        ``RAISE_ERROR_ON_DETACHED`` is false.
    """
    hyper_list = super(ForwardHG, self).compute_gradients(outer_objective, optimizer_dict, hyper_list)

    # scalar_hyper_list
    with tf.variable_scope(outer_objective.op.name):
        # NOTE: no need to vectorize in this implementation; it might be more
        # efficient since it avoids many reshaping operations.
        d_oo_d_state = tf.gradients(outer_objective, optimizer_dict.state)

        with tf.name_scope('DUMMY'):  # variables to compute forward propagation
            # TODO avoid this computation if optimizer_dict has already been seen.
            aux_v = [tf.zeros_like(v) for v in optimizer_dict.state]
            dynamics_dot_aux_v = reduce_all_sums(optimizer_dict.dynamics, aux_v)

            # List of jacobian-times-aux_v products, one per state variable,
            # each with the same shape as the corresponding state variable.
            der_dynamics_dot_aux_v = tf.gradients(dynamics_dot_aux_v, optimizer_dict.state)

            init_dynamics_dot_aux_v = None
            if optimizer_dict.init_dynamics:
                init_dynamics_dot_aux_v = reduce_all_sums(optimizer_dict.init_dynamics, aux_v)

            # FIX: iterate over a snapshot — the original iterated hyper_list
            # while removing from it, which skips the element that follows a
            # removed one.
            for hyp in list(hyper_list):
                assert hyp.shape.ndims == 0, ForwardHG._HYPER_RANK_ERROR_MESSAGE.format(hyp, hyp.shape.ndims)

                d_init_dyn_d_hyp = None if init_dynamics_dot_aux_v is None else \
                    tf.gradients(init_dynamics_dot_aux_v, hyp)[0]
                d_dyn_d_hyp = tf.gradients(dynamics_dot_aux_v, hyp)[0]
                d_oo_d_hyp = tf.gradients(outer_objective, hyp)[0]

                # A hyperparameter is "detached" when nothing depends on it:
                # its hypergradient would always be null.
                hyper_ok = (d_init_dyn_d_hyp is not None or d_dyn_d_hyp is not None
                            or d_oo_d_hyp is not None)
                if RAISE_ERROR_ON_DETACHED:
                    assert hyper_ok, HyperGradient._ERROR_HYPER_DETACHED.format(hyp)
                elif not hyper_ok:
                    # FIX: the original printed the warning and removed the
                    # hyperparameter unconditionally (for every hyp), and then
                    # fell through to tf.gradients(None, ...), which raises.
                    print(HyperGradient._ERROR_HYPER_DETACHED.format(hyp), file=sys.stderr)
                    hyper_list.remove(hyp)
                    continue  # skip graph construction for the detached hyper

                # UPDATE OF TOTAL DERIVATIVE OF STATE W.R.T. HYPERPARAMETER
                zs = ForwardHG._create_z(
                    optimizer_dict, hyp,
                    None if d_init_dyn_d_hyp is None else tf.gradients(d_init_dyn_d_hyp, aux_v))

                Bs = tf.gradients(d_dyn_d_hyp, aux_v)
                # Computes d Phi_i / d s * z for each state component i (the
                # per-variable jacobian-vector products), not d Phi / d w * z.
                A_dot_zs = tf.gradients(reduce_all_sums(der_dynamics_dot_aux_v, zs), aux_v)

                self.A_dot_zs[hyp] = A_dot_zs

                _z_iter = tf.group(*[
                    z.assign(maybe_add(A_dot_z, B))
                    for z, A_dot_z, B in zip(zs, A_dot_zs, Bs)
                ])
                self._z_iter = tf.group(self._z_iter, _z_iter)

                # HYPERGRADIENT: sum of per-variable dot products plus the
                # direct derivative d E / d hyp (when present).
                d_E_T = [dot(d_oo_d_s, z)
                         for d_oo_d_s, z in zip(d_oo_d_state, zs)
                         if d_oo_d_s is not None and z is not None]
                hg = maybe_add(tf.reduce_sum(d_E_T), d_oo_d_hyp)

                self._hypergrad_dictionary[hyp].append(hg)
                self._forward_initializer = tf.group(self._forward_initializer,
                                                    tf.variables_initializer(zs))
    return hyper_list
def compute_gradients(self, outer_objective, optimizer_dict, hyper_list=None):
    """Build graph nodes for the forward-mode (real-time) hypergradient.

    For every scalar hyperparameter ``hyp`` this creates the total-derivative
    variables ``zs`` (one per state variable, stored in ``self._zs[hyp]``),
    the update ``z <- A·z + B`` (propagated via double gradients through the
    dummy ``aux_vs`` tensors), and the hypergradient
    ``sum_s <dE/ds, z_s> + dE/d hyp``.

    Args:
        outer_objective: scalar tensor of the outer objective ``E``.
        optimizer_dict: inner-optimization bookkeeping; this method reads its
            ``state``, ``dynamics`` and (optionally) ``init_dynamics``.
        hyper_list: optional list of scalar hyperparameter variables; when
            ``None`` the superclass supplies the default collection.

    Returns:
        The list of hyperparameters, with detached ones removed when
        ``RAISE_ERROR_ON_DETACHED`` is false.
    """
    hyper_list = super(ForwardHG, self).compute_gradients(outer_objective, optimizer_dict, hyper_list)

    # scalar_hyper_list
    with tf.variable_scope(outer_objective.op.name):
        # NOTE: no need to vectorize in this implementation; it might be more
        # efficient since it avoids many reshaping operations.
        d_oo_d_state = tf.gradients(outer_objective, list(optimizer_dict.state))

        with tf.name_scope('DUMMY'):  # variables to compute forward propagation
            # TODO avoid this computation if optimizer_dict has already been seen.
            aux_vs = [tf.zeros_like(v) for v in optimizer_dict.state]
            dynamics_dot_aux_v = reduce_all_sums(list(optimizer_dict.dynamics), aux_vs)

            # List of jacobian-times-aux_vs products, one per state variable,
            # each with the same shape as the corresponding state variable.
            der_dynamics_dot_aux_v = tf.gradients(dynamics_dot_aux_v, list(optimizer_dict.state))

            init_dynamics_dot_aux_v = None
            if optimizer_dict.init_dynamics:
                init_dynamics_dot_aux_v = reduce_all_sums(optimizer_dict.init_dynamics, aux_vs)

            # FIX: iterate over a snapshot — the original iterated hyper_list
            # while removing from it, which skips the element that follows a
            # removed one.
            for hyp in list(hyper_list):
                assert hyp.shape.ndims == 0, ForwardHG._HYPER_RANK_ERROR_MESSAGE.format(hyp, hyp.shape.ndims)

                d_init_dyn_d_hyp = None if init_dynamics_dot_aux_v is None else \
                    tf.gradients(init_dynamics_dot_aux_v, hyp)[0]
                d_dyn_d_hyp = tf.gradients(dynamics_dot_aux_v, hyp)[0]
                d_oo_d_hyp = tf.gradients(outer_objective, hyp)[0]

                # ------------------------------------------------------------
                # Check detached hyperparameters (for which the hypergradient
                # would be always null).
                hyper_ok = (d_init_dyn_d_hyp is not None or d_dyn_d_hyp is not None
                            or d_oo_d_hyp is not None)
                if RAISE_ERROR_ON_DETACHED:
                    assert hyper_ok, HyperGradient._ERROR_HYPER_DETACHED.format(hyp)
                elif not hyper_ok:
                    print(HyperGradient._ERROR_HYPER_DETACHED.format(hyp), file=sys.stderr)
                    hyper_list.remove(hyp)
                    # FIX: the original fell through after removal and called
                    # tf.gradients(d_dyn_d_hyp, aux_vs) with d_dyn_d_hyp None,
                    # which raises; skip graph construction instead.
                    continue
                # -------------------------------------------------------------

                # UPDATE OF TOTAL DERIVATIVE OF STATE W.R.T. HYPERPARAMETER
                zs = ForwardHG._create_zs(
                    optimizer_dict, hyp,
                    None if d_init_dyn_d_hyp is None else tf.gradients(d_init_dyn_d_hyp, aux_vs)
                )  # this is one z for each variable
                self._zs[hyp] = zs  # store a reference for the total derivatives for easy access

                Bs = tf.gradients(d_dyn_d_hyp, aux_vs)
                A_dot_zs = tf.gradients(reduce_all_sums(der_dynamics_dot_aux_v, zs), aux_vs)

                self.A_dot_zs[hyp] = A_dot_zs

                _z_iter = tf.group(*[
                    z.assign(maybe_add(A_dot_z, B))
                    for z, A_dot_z, B in zip(zs, A_dot_zs, Bs)
                ])
                self._z_iter = tf.group(self._z_iter, _z_iter)

                # -- HYPERGRADIENT -----
                # Sum the partial dot products and possibly add the ''direct
                # derivative'' term d(E( . , \lambda))/d \lambda.
                d_E_T = [dot(d_oo_d_s, z)
                         for d_oo_d_s, z in zip(d_oo_d_state, zs)
                         if d_oo_d_s is not None and z is not None]  # list of dot products
                hg = maybe_add(tf.reduce_sum(d_E_T), d_oo_d_hyp)

                self._hypergrad_dictionary[hyp].append(hg)
                self._forward_initializer = tf.group(self._forward_initializer,
                                                    tf.variables_initializer(zs))
    return hyper_list
def compute_gradients(self, outer_objective, optimizer_dict, hyper_list=None):
    """Build graph nodes for the forward-mode (real-time) hypergradient.

    For every scalar hyperparameter ``hyp`` this creates the total-derivative
    variables ``zs`` (one per state variable), the update ``z <- A·z + B``
    (propagated via double gradients through dummy ``aux_v`` tensors), and
    the hypergradient ``sum_s <dE/ds, z_s> + dE/d hyp``.

    Args:
        outer_objective: scalar tensor of the outer objective ``E``.
        optimizer_dict: inner-optimization bookkeeping; this method reads its
            ``state``, ``dynamics`` and (optionally) ``init_dynamics``.
        hyper_list: optional list of scalar hyperparameter variables; when
            ``None`` the superclass supplies the default collection.

    Returns:
        The list of hyperparameters, with detached ones removed when
        ``RAISE_ERROR_ON_DETACHED`` is false.
    """
    hyper_list = super().compute_gradients(outer_objective, optimizer_dict, hyper_list)

    # scalar_hyper_list
    with tf.variable_scope(outer_objective.op.name):
        # NOTE: no need to vectorize in this implementation; it might be more
        # efficient since it avoids many reshaping operations.
        d_oo_d_state = tf.gradients(outer_objective, optimizer_dict.state)

        with tf.name_scope('DUMMY'):  # variables to compute forward propagation
            # TODO avoid this computation if optimizer_dict has already been seen.
            aux_v = [tf.zeros_like(v) for v in optimizer_dict.state]
            dynamics_dot_aux_v = reduce_all_sums(optimizer_dict.dynamics, aux_v)

            # List of jacobian-times-aux_v products, one per state variable,
            # each with the same shape as the corresponding state variable.
            der_dynamics_dot_aux_v = tf.gradients(dynamics_dot_aux_v, optimizer_dict.state)

            init_dynamics_dot_aux_v = None
            if optimizer_dict.init_dynamics:
                init_dynamics_dot_aux_v = reduce_all_sums(optimizer_dict.init_dynamics, aux_v)

            # FIX: iterate over a snapshot — the original iterated hyper_list
            # while removing from it, which skips the element that follows a
            # removed one.
            for hyp in list(hyper_list):
                assert hyp.shape.ndims == 0, ForwardHG._HYPER_RANK_ERROR_MESSAGE.format(hyp, hyp.shape.ndims)

                d_init_dyn_d_hyp = None if init_dynamics_dot_aux_v is None else \
                    tf.gradients(init_dynamics_dot_aux_v, hyp)[0]
                d_dyn_d_hyp = tf.gradients(dynamics_dot_aux_v, hyp)[0]
                d_oo_d_hyp = tf.gradients(outer_objective, hyp)[0]

                # A hyperparameter is "detached" when nothing depends on it:
                # its hypergradient would always be null.
                hyper_ok = (d_init_dyn_d_hyp is not None or d_dyn_d_hyp is not None
                            or d_oo_d_hyp is not None)
                if RAISE_ERROR_ON_DETACHED:
                    assert hyper_ok, HyperGradient._ERROR_HYPER_DETACHED.format(hyp)
                elif not hyper_ok:
                    # FIX: the original printed the warning and removed the
                    # hyperparameter unconditionally (for every hyp), and then
                    # fell through to tf.gradients(None, ...), which raises.
                    print(HyperGradient._ERROR_HYPER_DETACHED.format(hyp), file=sys.stderr)
                    hyper_list.remove(hyp)
                    continue  # skip graph construction for the detached hyper

                # UPDATE OF TOTAL DERIVATIVE OF STATE W.R.T. HYPERPARAMETER
                zs = ForwardHG._create_z(
                    optimizer_dict, hyp,
                    None if d_init_dyn_d_hyp is None else tf.gradients(d_init_dyn_d_hyp, aux_v))

                Bs = tf.gradients(d_dyn_d_hyp, aux_v)
                # Computes d Phi_i / d s * z for each state component i (the
                # per-variable jacobian-vector products), not d Phi / d w * z.
                A_dot_zs = tf.gradients(reduce_all_sums(der_dynamics_dot_aux_v, zs), aux_v)

                self.A_dot_zs[hyp] = A_dot_zs

                _z_iter = tf.group(*[
                    z.assign(maybe_add(A_dot_z, B))
                    for z, A_dot_z, B in zip(zs, A_dot_zs, Bs)
                ])
                self._z_iter = tf.group(self._z_iter, _z_iter)

                # HYPERGRADIENT: sum of per-variable dot products plus the
                # direct derivative d E / d hyp (when present).
                d_E_T = [dot(d_oo_d_s, z)
                         for d_oo_d_s, z in zip(d_oo_d_state, zs)
                         if d_oo_d_s is not None and z is not None]
                hg = maybe_add(tf.reduce_sum(d_E_T), d_oo_d_hyp)

                self._hypergrad_dictionary[hyp].append(hg)
                self._forward_initializer = tf.group(self._forward_initializer,
                                                    tf.variables_initializer(zs))
    return hyper_list