def compute_gradients(
    self, outer_objective, inner_grad, meta_param=None, param_dict=OrderedDict()
):
    """
    Add to the graph the operations used to obtain outer gradients through
    an implicit linear system.

    The method builds the residual ``|| d(<grad_inner, q>)/d(state) - g1 ||``
    where ``g1`` is the gradient of ``outer_objective`` w.r.t. the inner
    state and ``q`` is created by ``self._create_q``.  A solver closure for
    that residual is appended to ``self._lin_sys``.  For every outer
    parameter the stored gradient is ``g2 - cd``: the direct gradient of the
    outer objective minus the cross derivative ``d(<grad_inner, q>)/d(param)``.

    :param outer_objective: scalar tensor, loss for the outer parameters
    :param inner_grad: object exposing ``state`` and ``objective`` of the
        inner problem
    :param meta_param: optional list of outer parameters; resolved by the
        parent class ``compute_gradients``
    :param param_dict: not read by this method; kept for signature
        compatibility with the sibling implementations
    :return: the outer parameters as returned by the parent class
    """
    meta_param = super(BOMLOuterGradImplicit, self).compute_gradients(
        outer_objective, inner_grad, meta_param
    )
    state = list(inner_grad.state)

    with tf.variable_scope(outer_objective.op.name):
        # Gradient of the upper-level objective w.r.t. the task parameters.
        g1 = utils.vectorize_all(tf.gradients(outer_objective, state))
        # Gradient of the lower-level objective w.r.t. the task parameters.
        grads_inner_obj_vec = utils.vectorize_all(
            tf.gradients(inner_grad.objective, state)
        )

        q = self._create_q(g1)
        # Residual of the linear system in q.  The plain norm is used on
        # purpose: per the original author it gives better results than the
        # squared norm, even though it is more costly.
        obj = tf.norm(
            utils.vectorize_all(
                tf.gradients(utils.dot(grads_inner_obj_vec, q), state)
            )
            - g1
        )

        self._lin_sys.append(
            lambda _tolerance: self.linear_system_solver(obj, [q], _tolerance)
        )

        g2s = tf.gradients(outer_objective, meta_param)
        cross_ders = tf.gradients(utils.dot(grads_inner_obj_vec, q), meta_param)
        for g2, cd, hyper in zip(g2s, cross_ders, meta_param):
            assert (
                g2 is not None or cd is not None
            ), BOMLOuterGrad._ERROR_HYPER_DETACHED.format(hyper)
            if cd is None:
                # tf.gradients returns None for an unconnected input, so the
                # cross derivative must be checked BEFORE negating it
                # (``-None`` raises TypeError).  Only the direct term is left.
                print(
                    (
                        "WARNING, outer objective is only directly dependent on hyperparameter {}. "
                        "Direct optimization would be better!"
                    ).format(hyper)
                )
                hg = g2
            else:
                # presumably maybe_add returns the non-None operand when the
                # other one is None -- verify against utils
                hg = utils.maybe_add(-cd, g2)
            self._hypergrad_dictionary[hyper].append(hg)
    return meta_param
def compute_gradients(self, outer_objective, inner_grad, meta_param=None,
                      param_dict=OrderedDict()):
    """
    Add to the graph the operations for a finite-difference (DARTS-style)
    second-order correction of the outer gradients.

    Steps performed by the code below:

    1. One outer-gradient variable per outer parameter is created with
       ``self._create_outergradient``.
    2. The gradients of ``outer_objective`` w.r.t. the inner state are scaled
       by ``self.epsilon = 0.01 / ||grads||_2`` and handed to
       ``self._create_darts_derivatives``.
    3. ``model.task_parameter`` is shifted by plus/minus that result.  The
       loss is re-evaluated on both shifted parameter sets and
       differentiated w.r.t. ``meta_param``.
    4. When ``self._inner_method == 'Aggr'`` each of those gradients is
       blended with the gradient of a second loss (labels ``ex.y_``, input
       produced by ``param_dict['meta_learner']``) using
       ``param_dict['alpha']``.
    5. ``learning_rate * (right - left) / (2 * epsilon)`` is subtracted from
       every outer-gradient variable; the assign ops are grouped into
       ``self._darts_initializer``.

    :param outer_objective: scalar tensor, loss for the outer parameters
    :param inner_grad: object exposing the inner ``state``
    :param meta_param: optional list of outer parameters; resolved by the
        parent class ``compute_gradients``
    :param param_dict: dictionary read for ``'meta_learner'`` and ``'alpha'``
        in the ``'Aggr'`` branch
    :return: the outer parameters as returned by the parent class
    """
    meta_param = super(BOMLOuterGradDarts, self).compute_gradients(
        outer_objective, inner_grad, meta_param)

    with tf.variable_scope(outer_objective.op.name):
        # NOTE(review): these settings come from ``self.param_dict`` while
        # the 'Aggr' branch reads the ``param_dict`` argument -- confirm the
        # two are meant to be different objects.
        ex = self.param_dict["experiment"]
        model = self.param_dict["experiment"].model
        loss_func = self.param_dict["loss_func"]

        def _shifted_task_params(shift_op):
            # Task parameters moved by the finite-difference step.
            return dict(
                zip(
                    model.task_parameter.keys(),
                    [
                        shift_op(state, fin_diff)
                        for state, fin_diff in zip(
                            model.task_parameter.values(), fin_diff_part)
                    ],
                ))

        def _inner_loss_grads(task_params):
            # Gradient w.r.t. meta_param of the loss on (model input, ex.y).
            return tf.gradients(
                loss_func(
                    pred=model.re_forward(task_parameter=task_params).out,
                    label=ex.y),
                xs=meta_param,
            )

        def _outer_loss_grads(task_params):
            # Gradient w.r.t. meta_param of the loss on (ex.x_, ex.y_).
            return tf.gradients(
                loss_func(
                    pred=model.re_forward(
                        new_input=param_dict['meta_learner'].re_forward(
                            ex.x_).out,
                        task_parameter=task_params).out,
                    label=ex.y_),
                xs=meta_param,
            )

        def _blend_in_place(inner_grads, outer_grads):
            # Convex combination of both gradients; entries whose outer
            # gradient is None are left untouched.
            for idx, (g_in, g_out) in enumerate(zip(inner_grads, outer_grads)):
                if g_out is None:
                    continue
                if g_in is not None:
                    inner_grads[idx] = (
                        1 - param_dict['alpha']
                    ) * g_in + param_dict['alpha'] * g_out
                else:
                    inner_grads[idx] = param_dict['alpha'] * g_out

        # First-order gradient holders for the outer parameters.
        grads_outer = [
            self._create_outergradient(outer_objective, hyper)
            for hyper in meta_param
        ]

        # First-order gradient of the outer objective w.r.t. the inner state.
        darts_derivatives = list(
            tf.gradients(outer_objective, list(inner_grad.state)))

        # Finite-difference direction: the gradients above scaled by epsilon.
        darts_vector = tf.concat(
            axis=0, values=utils.vectorize_all(darts_derivatives))
        self.epsilon = 0.01 / tf.norm(tensor=darts_vector, ord=2)
        # Must be the same attribute that is assigned above and used as the
        # divisor below (the original read ``self.Epsilon`` here).
        darts_derivatives = [
            self.epsilon * darts_derivative
            for darts_derivative in darts_derivatives
        ]
        fin_diff_part = self._create_darts_derivatives(
            var_list=inner_grad.state, darts_derivatives=darts_derivatives)
        self._diff_initializer = tf.group(
            self._diff_initializer,
            tf.variables_initializer(fin_diff_part),
            tf.variables_initializer(grads_outer),
        )

        right_diff_0 = _shifted_task_params(tf.add)
        left_diff_0 = _shifted_task_params(tf.subtract)

        left_diff = _inner_loss_grads(left_diff_0)
        right_diff = _inner_loss_grads(right_diff_0)

        if self._inner_method == 'Aggr':
            _blend_in_place(left_diff, _outer_loss_grads(left_diff_0))
            _blend_in_place(right_diff, _outer_loss_grads(right_diff_0))

        # Central difference gives the second-order part, which is removed
        # from the first-order term stored in grad_outer.
        for grad_outer, left_dif, right_dif in zip(grads_outer, left_diff,
                                                   right_diff):
            if right_dif is not None and left_dif is not None:
                grad_param = tf.divide(tf.subtract(right_dif, left_dif),
                                       2 * self.epsilon)
                meta_grad = self.param_dict["learning_rate"] * grad_param
                self._darts_initializer = tf.group(
                    self._darts_initializer,
                    grad_outer.assign_sub(meta_grad))

        for h, doo_dh in zip(meta_param, grads_outer):
            # Report the offending parameter, not the (None) gradient.
            assert doo_dh is not None, BOMLOuterGrad._ERROR_HYPER_DETACHED.format(
                h)
            self._outer_grads_dict[h].append(doo_dh)
        return meta_param
def compute_gradients(self, outer_objective, inner_grad, meta_param=None,
                      param_dict=OrderedDict()):
    """
    Add to the graph the operations of one backward (reverse) iteration used
    to accumulate hypergradients.

    The method creates multipliers from the gradient of ``outer_objective``
    w.r.t. the inner state, forms their dot product with the inner dynamics,
    and differentiates that scalar w.r.t. the outer parameters and w.r.t.
    the state.  The first result is accumulated into per-parameter
    hypergradient variables; the second replaces the multipliers, and is
    ordered after the accumulation through a control dependency.

    :param outer_objective: scalar tensor, loss for the outer parameters
    :param inner_grad: object exposing ``state`` and ``dynamics`` of the
        inner optimization
    :param meta_param: optional list of outer parameters; resolved by the
        parent class ``compute_gradients``
    :param param_dict: not read by this method
    :return: the outer parameters as returned by the parent class
    """
    meta_param = super(BOMLOuterGradReverse, self).compute_gradients(
        outer_objective, inner_grad, meta_param)

    with tf.variable_scope(outer_objective.op.name):
        state_grads_of_outer = tf.gradients(outer_objective,
                                            list(inner_grad.state))
        alphas = self._create_lagrangian_multipliers(inner_grad,
                                                     state_grads_of_outer)

        flat_alphas = utils.vectorize_all(alphas)
        flat_dynamics = utils.vectorize_all(list(inner_grad.dynamics))
        lagrangian = utils.dot(flat_alphas, flat_dynamics,
                               name="iter_wise_lagrangian_part1")

        param_grads = tf.gradients(lagrangian, meta_param)

        accumulators = []
        accumulate_op = tf.no_op()
        for param_grad, param in zip(param_grads, meta_param):
            assert param_grad is not None, BOMLOuterGrad._ERROR_HYPER_DETACHED.format(
                param)
            if param_grad is None:
                # Only reachable when asserts are disabled: keep a
                # placeholder so the list stays aligned with meta_param.
                accumulators.append(None)
                continue
            accumulator = self._create_outergradient(outer_objective, param)
            accumulate_op = tf.group(accumulate_op,
                                     accumulator.assign_add(param_grad))
            accumulators.append(accumulator)

        # The hypergradient accumulation must run before the multipliers
        # are overwritten.
        with tf.control_dependencies([accumulate_op]):
            state_grads = tf.gradients(lagrangian, list(inner_grad.state))
            alpha_assigns = [
                alpha.assign(state_grad)
                for alpha, state_grad in zip(alphas, state_grads)
            ]
            alpha_update = tf.group(*alpha_assigns)
        # Chain this backward iteration with the ones registered earlier.
        self._alpha_iter = tf.group(self._alpha_iter, alpha_update)

        for param, accumulator in zip(meta_param, accumulators):
            self._hypergrad_dictionary[param].append(accumulator)

        init_alphas = tf.variables_initializer(alphas)
        initializable = [
            accumulator for accumulator in accumulators
            if hasattr(accumulator, "initializer")
        ]
        init_accumulators = tf.variables_initializer(initializable)
        self._reverse_initializer = tf.group(
            self._reverse_initializer,
            init_alphas,
            init_accumulators,
        )
        return meta_param
def compute_gradients(self, outer_objective, inner_grad, meta_param=None,
                      param_dict=OrderedDict()):
    """
    Add to the graph the operations that compute the outer gradients by
    stepping backwards through the inner dynamical system.

    Built here: multipliers seeded from the gradient of ``outer_objective``
    w.r.t. the inner state, the scalar ``<multipliers, dynamics>``, one
    outer-gradient variable per outer parameter that accumulates the
    gradient of that scalar, and an op that overwrites the multipliers with
    the gradient of the same scalar w.r.t. the state.

    :param outer_objective: scalar tensor, loss for the outer parameters
    :param inner_grad: object exposing ``state`` and ``dynamics`` of the
        inner optimization
    :param meta_param: optional list of outer parameters; resolved by the
        parent class ``compute_gradients``
    :param param_dict: not read by this method
    :return: the outer parameters as returned by the parent class
    """
    meta_param = super(BOMLOuterGradReverse, self).compute_gradients(
        outer_objective, inner_grad, meta_param)

    with tf.variable_scope(outer_objective.op.name):
        inner_state = list(inner_grad.state)
        multipliers = self._create_lagrangian_multipliers(
            inner_grad, tf.gradients(outer_objective, inner_state))

        multipliers_vec = utils.vectorize_all(multipliers)
        dynamics_vec = utils.vectorize_all(list(inner_grad.dynamics))
        step_lagrangian = utils.dot(multipliers_vec, dynamics_vec,
                                    name="iter_wise_lagrangian_part1")

        grads_wrt_params = tf.gradients(step_lagrangian, meta_param)

        grad_holders = []
        accumulate_all = tf.no_op()
        for grad, param in zip(grads_wrt_params, meta_param):
            assert grad is not None, BOMLOuterGrad._ERROR_HYPER_DETACHED.format(
                param)
            holder = None
            if grad is not None:
                holder = self._create_outergradient(outer_objective, param)
                add_to_holder = holder.assign_add(grad)
                accumulate_all = tf.group(accumulate_all, add_to_holder)
            grad_holders.append(holder)

        # Outer gradients are accumulated first, multipliers refreshed after.
        with tf.control_dependencies([accumulate_all]):
            grads_wrt_state = tf.gradients(step_lagrangian,
                                           list(inner_grad.state))
            refresh_ops = []
            for multiplier, new_value in zip(multipliers, grads_wrt_state):
                refresh_ops.append(multiplier.assign(new_value))
            refresh_multipliers = tf.group(*refresh_ops)
        # Collect every backward iteration into a single op.
        self._alpha_iter = tf.group(self._alpha_iter, refresh_multipliers)

        for param, holder in zip(meta_param, grad_holders):
            self._outer_grads_dict[param].append(holder)

        multipliers_init = tf.variables_initializer(multipliers)
        # None placeholders and plain tensors have no ``initializer``.
        holders_init = tf.variables_initializer(
            [holder for holder in grad_holders
             if hasattr(holder, "initializer")])
        self._reverse_initializer = tf.group(
            self._reverse_initializer, multipliers_init, holders_init)
        return meta_param