def compute_gradients(self, outer_objective, optimizer_dict, hyper_list=None):
    """Add to the graph the operations for computing the implicit hypergradient.

    Builds the residual of the linear system whose solution ``q`` approximates
    the inverse-Hessian-vector product, registers a solver closure for it, and
    accumulates the per-hyperparameter hypergradient tensors.

    :param outer_objective: a loss function for the hyperparameters (scalar tensor)
    :param optimizer_dict: OptimizerDict object resulting from the inner objective optimization
    :param hyper_list: optional list of hyperparameters to consider; if not provided,
                       resolved by the superclass from the hyperparameter collection
                       in the current scope
    :return: list of hyperparameters involved in the computation
    """
    hyper_list = super(ImplicitHG, self).compute_gradients(outer_objective, optimizer_dict, hyper_list)
    state = list(optimizer_dict.state)

    with tf.variable_scope(outer_objective.op.name):
        # gradient of outer objective and of inner objective w.r.t. the inner state
        g1 = utils.vectorize_all(tf.gradients(outer_objective, state))
        grads_inner_obj_vec = utils.vectorize_all(tf.gradients(optimizer_dict.objective, state))

        q = self._create_q(g1)
        # residual of the linear system: Hessian-vector product minus g1.
        # using the norm seems to produce better results than the squared norm...
        # (even though it is more costly)
        obj = tf.norm(
            utils.vectorize_all(tf.gradients(utils.dot(grads_inner_obj_vec, q), state)) - g1
        )

        self._lin_sys.append(lambda _tolerance: self.linear_system_solver(obj, [q], _tolerance))

        # direct gradient w.r.t. hyperparameters and cross second derivatives
        g2s = tf.gradients(outer_objective, hyper_list)
        cross_ders = tf.gradients(utils.dot(grads_inner_obj_vec, q), hyper_list)
        for g2, cd, hyper in zip(g2s, cross_ders, hyper_list):
            assert g2 is not None or cd is not None, HyperGradient._ERROR_HYPER_DETACHED.format(hyper)
            hg = utils.maybe_add(-cd, g2)
            if hg is None:  # this would be strange...
                # BUG FIX: the original concatenated an unformatted literal with a
                # formatted tail ('... {}. ' + '...'.format(hyper)), so .format bound
                # only to the second string and the {} placeholder was never filled.
                print('WARNING, outer objective is only directly dependent on hyperparameter {}. '
                      'Direct optimization would be better!'.format(hyper))
                hg = g2
            self._hypergrad_dictionary[hyper].append(hg)

        return hyper_list
def z_callback(self, hyperparameter=None, flatten=True):
    """Build a callback that records the values of the z tensors at each call.

    :param hyperparameter: if given, record only the zs associated with this
                           hyperparameter; otherwise record all of them
    :param flatten: if True, vectorize the recorded tensors into a single vector
    :return: pair (list that will collect the recorded values, callback)
    """
    if hyperparameter is None:
        targets = list(self._zs.values())
    else:
        targets = self._zs[hyperparameter]
    if flatten:
        targets = utils.vectorize_all(targets)

    collected = []

    # noinspection PyUnusedLocal
    def _callback(_, __, ss):
        collected.append(ss.run(targets))  # these should not depend from any feed dictionary

    return collected, _callback
def hypergrad_callback(self, hyperparameter=None, flatten=True):
    """callback that records the partial hypergradients on the reverse pass"""
    recorded = []

    if hyperparameter is None:
        targets = list(self._hypergrad_dictionary.values())
    else:
        targets = self._hypergrad_dictionary[hyperparameter]
    if flatten:
        targets = utils.vectorize_all(targets)

    # noinspection PyUnusedLocal
    def _callback(_, __, ss):
        recorded.append(ss.run(targets))  # these should not depend from any feed dictionary

    return recorded, _callback
def compute_gradients(self, outer_objective, optimizer_dict, hyper_list=None):
    """
    Function that adds to the computational graph all the operations needed for computing
    the hypergradients in a "dynamic" way, without unrolling the entire optimization graph.
    The resulting computation, while being roughly 2x more expensive than unrolling the
    optimization dynamics, requires much less (GPU) memory and is more flexible, allowing
    to set a termination condition to the parameters optimization routine.

    :param optimizer_dict: OptimizerDict object resulting from the inner objective optimization.
    :param outer_objective: A loss function for the hyperparameters (scalar tensor)
    :param hyper_list: Optional list of hyperparameters to consider. If not provided will get all
                       variables in the hyperparameter collection in the current scope.

    :return: list of hyperparameters involved in the computation
    """
    hyper_list = super(ReverseHG, self).compute_gradients(outer_objective, optimizer_dict, hyper_list)

    # derivative of outer objective w.r.t. state
    with tf.variable_scope(outer_objective.op.name):  # for some reason without this there is a catastrophic failure...
        doo_ds = tf.gradients(outer_objective, list(optimizer_dict.state))

        # Lagrangian multipliers (alphas), initialized from d(outer objective)/d(state)
        alphas = self._create_lagrangian_multipliers(optimizer_dict, doo_ds)

        alpha_vec = utils.vectorize_all(alphas)
        dyn_vec = utils.vectorize_all(list(optimizer_dict.dynamics))
        # per-iteration Lagrangian term: alpha^T Phi_t
        lag_phi_t = utils.dot(alpha_vec, dyn_vec, name='iter_wise_lagrangian_part1')
        # TODO outer_objective might be a list... handle this case

        # iterative computation of hypergradients
        alpha_dot_B = tf.gradients(lag_phi_t, hyper_list)
        # check that optimizer_dict has initial ops (phi_0)
        if optimizer_dict.init_dynamics is not None:
            # contribution of the initial dynamics: alpha^T Phi_0
            lag_phi0 = utils.dot(alpha_vec, utils.vectorize_all([d for (s, d) in optimizer_dict.init_dynamics]))
            alpha_dot_B0 = tf.gradients(lag_phi0, hyper_list)
        else:
            alpha_dot_B0 = [None] * len(hyper_list)

        # here, if some of this is None it may mean that the hyperparameter appears only inside phi_0:
        # check that and if it is not the case raise error...
        hyper_grad_vars, hyper_grad_step = [], tf.no_op()
        for dl_dh, a_d_b0, hyper in zip(alpha_dot_B, alpha_dot_B0, hyper_list):
            assert dl_dh is not None or a_d_b0 is not None, HyperGradient._ERROR_HYPER_DETACHED.format(hyper)
            hgv = None
            if dl_dh is not None:  # "normal hyperparameter"
                # NOTE(review): the sibling variant of this method in this file calls
                # _create_hypergradient(hyper, doo_dh) — confirm this class's
                # _create_hypergradient indeed takes (outer_objective, hyper).
                hgv = self._create_hypergradient(outer_objective, hyper)

                # accumulate alpha^T B into the hypergradient variable at every backward step
                hyper_grad_step = tf.group(hyper_grad_step, hgv.assign_add(dl_dh))
            if a_d_b0 is not None:
                hgv = hgv + a_d_b0 if hgv is not None else a_d_b0
                # here hyper_grad_step has nothing to do...
            hyper_grad_vars.append(hgv)  # save these...

        with tf.control_dependencies([hyper_grad_step]):  # first update hypergradient then alphas.
            # backward recursion for the multipliers: alpha <- d(alpha^T Phi_t)/d(state)
            _alpha_iter = tf.group(*[alpha.assign(dl_ds) for alpha, dl_ds
                                     in zip(alphas, tf.gradients(lag_phi_t, list(optimizer_dict.state)))])
        self._alpha_iter = tf.group(self._alpha_iter, _alpha_iter)  # put all the backward iterations together

        [self._hypergrad_dictionary[h].append(hg) for h, hg in zip(hyper_list, hyper_grad_vars)]

        self._reverse_initializer = tf.group(self._reverse_initializer,
                                             tf.variables_initializer(alphas),
                                             tf.variables_initializer([h for h in hyper_grad_vars
                                                                       if hasattr(h, 'initializer')]))  # some
        # hypergradients (those coming from initial dynamics) might be just tensors and not variables...

        return hyper_list
def compute_gradients(self, outer_objective, optimizer_dict, hyper_list=None):
    """
    Function that adds to the computational graph all the operations needed for computing
    the hypergradients in a "dynamic" way, without unrolling the entire optimization graph.
    The resulting computation, while being roughly 2x more expensive than unrolling the
    optimization dynamics, requires much less (GPU) memory and is more flexible, allowing
    to set a termination condition to the parameters optimization routine.

    :param optimizer_dict: OptimizerDict object resulting from the inner objective optimization.
    :param outer_objective: A loss function for the hyperparameters (scalar tensor)
    :param hyper_list: Optional list of hyperparameters to consider. If not provided will get all
                       variables in the hyperparameter collection in the current scope.

    :return: list of hyperparameters involved in the computation
    """
    hyper_list = super().compute_gradients(outer_objective, optimizer_dict, hyper_list)

    # derivative of outer objective w.r.t. state
    with tf.variable_scope(outer_objective.op.name):  # for some reason without this there is a catastrophic failure...
        doo_ds = tf.gradients(outer_objective, optimizer_dict.state)

        # Lagrangian multipliers (alphas), initialized from d(outer objective)/d(state)
        alphas = self._create_lagrangian_multipliers(optimizer_dict, doo_ds)

        alpha_vec = utils.vectorize_all(alphas)
        dyn_vec = utils.vectorize_all(optimizer_dict.dynamics)
        # per-iteration Lagrangian term: alpha^T Phi_t
        lag_phi_t = utils.dot(alpha_vec, dyn_vec, name='iter_wise_lagrangian_part1')
        # TODO outer_objective might be a list... handle this case

        # iterative computation of hypergradients
        doo_dypers = tf.gradients(outer_objective, hyper_list)  # (direct) derivative of outer objective w.r.t. hyp.
        alpha_dot_B = tf.gradients(lag_phi_t, hyper_list)
        # check that optimizer_dict has initial ops (phi_0)
        if optimizer_dict.init_dynamics is not None:
            # contribution of the initial dynamics: alpha^T Phi_0
            lag_phi0 = utils.dot(alpha_vec, utils.vectorize_all([d for (s, d) in optimizer_dict.init_dynamics]))
            alpha_dot_B0 = tf.gradients(lag_phi0, hyper_list)
        else:
            alpha_dot_B0 = [None] * len(hyper_list)

        # here, if some of this is None it may mean that the hyperparameter appears only inside phi_0:
        # check that and if it is not the case raise error...
        hyper_grad_vars, hyper_grad_step = [], tf.no_op()
        for dl_dh, doo_dh, a_d_b0, hyper in zip(alpha_dot_B, doo_dypers, alpha_dot_B0, hyper_list):
            assert dl_dh is not None or a_d_b0 is not None, HyperGradient._ERROR_HYPER_DETACHED.format(hyper)
            hgv = None
            if dl_dh is not None:  # "normal hyperparameter"
                hgv = self._create_hypergradient(hyper, doo_dh)

                # accumulate alpha^T B into the hypergradient variable at every backward step
                hyper_grad_step = tf.group(hyper_grad_step, hgv.assign_add(dl_dh))
            if a_d_b0 is not None:
                hgv = hgv + a_d_b0 if hgv is not None else a_d_b0
                # here hyper_grad_step has nothing to do...
            hyper_grad_vars.append(hgv)  # save these...

        with tf.control_dependencies([hyper_grad_step]):  # first update hypergradient then alphas.
            # backward recursion for the multipliers: alpha <- d(alpha^T Phi_t)/d(state)
            _alpha_iter = tf.group(*[alpha.assign(dl_ds) for alpha, dl_ds
                                     in zip(alphas, tf.gradients(lag_phi_t, optimizer_dict.state))])
        self._alpha_iter = tf.group(self._alpha_iter, _alpha_iter)  # put all the backward iterations together

        [self._hypergrad_dictionary[h].append(hg) for h, hg in zip(hyper_list, hyper_grad_vars)]

        self._reverse_initializer = tf.group(self._reverse_initializer,
                                             tf.variables_initializer(alphas),
                                             tf.variables_initializer([h for h in hyper_grad_vars
                                                                       if hasattr(h, 'initializer')]))  # some
        # hypergradients (those coming from initial dynamics) might be just tensors and not variables...

        return hyper_list