def create_optim_op(self):
  if not self.optimizer:
    self.create_optimizer()
  assert self.loss is not None
  with tf.variable_scope("optimize"):
    # AccumulateN might not be deterministic but should be faster and should require less memory.
    # We might want to make this configurable.
    aggregation_method = tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
    grad_noise = self.config.float("gradient_noise", 0.0)
    grad_clip = self.config.float("gradient_clip", 0.0)
    grad_clip_global_norm = self.config.float("gradient_clip_global_norm", 0.0)
    # Extended self.optimizer.minimize() to optionally modify gradients.
    grads_and_vars = self.optimizer.compute_gradients(
      self.loss, var_list=self.trainable_vars,
      aggregation_method=aggregation_method)
    if not [v for g, v in grads_and_vars if g is not None]:
      raise Exception("no single variable to train")
    # Also see tf.contrib.layers.optimizers.optimize_loss() for reference.
    if grad_noise:
      assert grad_noise > 0
      from TFUtil import add_scaled_noise_to_gradients
      grads_and_vars = add_scaled_noise_to_gradients(grads_and_vars, grad_noise)
    if grad_clip:
      assert grad_clip > 0
      grads_and_vars = [(tf.clip_by_value(grad, -grad_clip, grad_clip), var) for grad, var in grads_and_vars]
    if grad_clip_global_norm:
      assert grad_clip_global_norm > 0
      grads_clipped, _ = tf.clip_by_global_norm([grad for (grad, _) in grads_and_vars], grad_clip_global_norm)
      grads_and_vars = zip(grads_clipped, [var for (_, var) in grads_and_vars])
    apply_grads = self.optimizer.apply_gradients(grads_and_vars)
    incr_step_op = tf.assign_add(self.network.global_train_step, 1, name="global_train_step_increment")
    self.optim_op = tf.group(apply_grads, incr_step_op, name="optim_and_step_incr")

  print("Initialize optimizer with slots %s." % self.optimizer.get_slot_names(), file=log.v3)
  slot_vars = []
  for slot_name in self.optimizer.get_slot_names():
    for v in self.trainable_vars:
      slot_var = self.optimizer.get_slot(var=v, name=slot_name)
      assert slot_var is not None
      slot_vars.append(slot_var)
  self.tf_session.run(tf.variables_initializer(slot_vars, name="init_optim_slot_vars"))
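# Illustrative sketch (not part of the original module): "gradient_clip" above bounds each
# gradient element to [-c, c] via tf.clip_by_value, while "gradient_clip_global_norm" rescales
# all gradients jointly so their combined L2 norm stays below the threshold, preserving their
# direction. A minimal NumPy analogue with hypothetical example values:
import numpy as np

def clip_by_value(grads, c):
  # Element-wise clipping, analogous to tf.clip_by_value(grad, -c, c) applied per gradient.
  return [np.clip(g, -c, c) for g in grads]

def clip_by_global_norm(grads, c):
  # Joint rescaling, analogous to tf.clip_by_global_norm(grads, c).
  global_norm = np.sqrt(sum(np.sum(g ** 2) for g in grads))
  scale = min(1.0, c / max(global_norm, 1e-12))
  return [g * scale for g in grads]

example_grads = [np.array([3.0, -4.0]), np.array([0.5])]
print(clip_by_value(example_grads, 1.0))        # -> [array([ 1., -1.]), array([0.5])]
print(clip_by_global_norm(example_grads, 1.0))  # all gradients scaled by ~0.2 (norm was ~5.02)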
def _get_apply_grads_op(self, loss, trainable_vars_for_gradients):
  """
  :param tf.Tensor loss:
  :param list[tf.Variable] trainable_vars_for_gradients:
  :return: op with all variable updates combined, using the optimizer
  :rtype: tf.Operation
  """
  if not trainable_vars_for_gradients:
    return tf.no_op(name="no_grad_vars_no_op")
  # AccumulateN might not be deterministic but should be faster and should require less memory.
  # We might want to make this configurable.
  aggregation_method = tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
  grad_noise = self.config.float("gradient_noise", 0.0)
  grad_clip = self.config.float("gradient_clip", 0.0)
  grad_clip_global_norm = self.config.float("gradient_clip_global_norm", 0.0)
  # E.g. https://github.com/openai/baselines/blob/master/baselines/deepq/simple.py:
  #   grad_norm_clipping=10 -> tf.clip_by_norm
  # Extended self.optimizer.minimize() to optionally modify gradients.
  grads_and_vars = self.optimizer.compute_gradients(
    loss, var_list=trainable_vars_for_gradients,
    aggregation_method=aggregation_method)
  if not [v for g, v in grads_and_vars if g is not None]:
    raise Exception("no single variable to train")
  if self.config.bool("debug_grad_summaries", False):
    from TFUtil import variable_summaries, get_base_name, reuse_name_scope_of_tensor
    for grad, var in grads_and_vars:
      with reuse_name_scope_of_tensor(grad, prefix="grads/"):
        variable_summaries(grad, name="grad_of_%s" % get_base_name(var))
      with reuse_name_scope_of_tensor(var, prefix="vars/"):
        variable_summaries(var, name=get_base_name(var))
  # Also see tf.contrib.layers.optimizers.optimize_loss() for reference.
  if self.config.bool("gradient_nan_inf_filter", False):
    from TFUtil import nan_to_num
    grads_and_vars = [(nan_to_num(grad, nan_num=0.0, inf_num=0.0), var) for (grad, var) in grads_and_vars]
  if grad_noise:
    assert grad_noise > 0
    from TFUtil import add_scaled_noise_to_gradients
    grads_and_vars = add_scaled_noise_to_gradients(grads_and_vars, grad_noise)
  if grad_clip:
    assert grad_clip > 0
    grads_and_vars = [(tf.clip_by_value(grad, -grad_clip, grad_clip), var) for grad, var in grads_and_vars]
  if grad_clip_global_norm:
    assert grad_clip_global_norm > 0
    grads_clipped, _ = tf.clip_by_global_norm([grad for (grad, _) in grads_and_vars], grad_clip_global_norm)
    grads_and_vars = zip(grads_clipped, [var for (_, var) in grads_and_vars])
  apply_grads = self.optimizer.apply_gradients(grads_and_vars)
  return apply_grads
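# Illustrative sketch (not part of the original module): the "gradient_nan_inf_filter" option
# above replaces non-finite gradient entries before the update via TFUtil.nan_to_num. A NumPy
# analogue of that behaviour (not the exact TFUtil implementation), with hypothetical values:
import numpy as np

def nan_to_num(grad, nan_num=0.0, inf_num=0.0):
  # Replace NaN with nan_num and +/-Inf with +/-inf_num, mirroring the call signature used above.
  grad = np.where(np.isnan(grad), nan_num, grad)
  grad = np.where(np.isinf(grad), np.sign(grad) * inf_num, grad)
  return grad

g = np.array([0.1, np.nan, np.inf, -np.inf, -0.3])
print(nan_to_num(g))  # -> [0.1, 0.0, 0.0, -0.0, -0.3]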
def _post_process_grad(self, grad, var, global_info):
  """
  :param tf.Tensor grad:
  :param tf.Variable var:
  :param WrapOptimizer._GetGlobalInfo global_info:
  :return: new grad, apply grad opts
  :rtype: tf.Tensor, dict[str]
  """
  updater_opts = self._get_updater_opts_from_var(var)
  accum_grad_multiple_num_steps = updater_opts.get(
    "accum_grad_multiple_step", self.config.int("accum_grad_multiple_step", 0))
  grad_noise = updater_opts.get("gradient_noise", self.config.float("gradient_noise", 0.0))
  grad_clip = updater_opts.get("gradient_clip", self.config.float("gradient_clip", 0.0))
  # E.g. https://github.com/openai/baselines/blob/master/baselines/deepq/simple.py:
  #   grad_norm_clipping=10 -> tf.clip_by_norm
  grad_clip_norm = updater_opts.get("gradient_clip_norm", self.config.float("gradient_clip_norm", 0.0))
  grad_clip_avg_norm = updater_opts.get("gradient_clip_avg_norm", self.config.float("gradient_clip_avg_norm", 0.0))
  grad_clip_global_norm = updater_opts.get(
    "gradient_clip_global_norm", self.config.float("gradient_clip_global_norm", 0.0))
  global_norm_tag = updater_opts.get(
    "global_norm_tag", self.config.value("global_norm_tag", None))
  grad_clip_global_norm_tag = updater_opts.get(
    "gradient_clip_global_norm_tag", self.config.value("gradient_clip_global_norm_tag", global_norm_tag))
  grad_norm_to_clip_to_zero = updater_opts.get(
    "grad_norm_to_clip_to_zero", self.config.float("grad_norm_to_clip_to_zero", 0.0))
  maximize_grad_norm = updater_opts.get("maximize_grad_norm", self.config.float("maximize_grad_norm", 0))

  if maximize_grad_norm:
    grad_ext = global_info.get_maximize_grad_norm_grad(maximize_grad_norm, var)
    if grad_ext is not None:
      grad += grad_ext
  if accum_grad_multiple_num_steps >= 1:
    grad = accum_grad_multiple_step(
      grad, var, train_step=self.global_train_step, num_accum_steps=accum_grad_multiple_num_steps)
  if updater_opts.get("debug_grad_summaries", self.config.bool_or_other("debug_grad_summaries", False)):
    from TFUtil import variable_summaries, get_base_name, reuse_name_scope_of_tensor
    with reuse_name_scope_of_tensor(grad, prefix="grads/"):
      variable_summaries(grad, name="grad_of_%s" % get_base_name(var))
    with reuse_name_scope_of_tensor(var, prefix="vars/"):
      variable_summaries(var, name=get_base_name(var))
  # Also see tf.contrib.layers.optimizers.optimize_loss() for reference.
  if grad_noise:
    assert grad_noise > 0
    from TFUtil import add_scaled_noise_to_gradients
    with tf.name_scope("grad_noise"):
      (grad, var), = add_scaled_noise_to_gradients([(grad, var)], grad_noise)
  if grad_clip:
    assert grad_clip > 0
    with tf.name_scope("grad_clip"):
      grad = tf.clip_by_value(grad, -grad_clip, grad_clip)
  if grad_clip_norm:
    assert grad_clip_norm > 0
    with tf.name_scope("grad_clip_norm"):
      grad = tf.clip_by_norm(grad, grad_clip_norm)
  if grad_clip_avg_norm:
    assert grad_clip_avg_norm > 0
    with tf.name_scope("grad_clip_avg_norm"):
      grad = tf.clip_by_average_norm(grad, grad_clip_avg_norm)
  if grad_clip_global_norm:
    assert grad_clip_global_norm > 0
    with tf.name_scope("grad_clip_global_norm"):
      grad = global_info.clip_by_global_norm(
        grad, clip_norm=grad_clip_global_norm, global_norm_tag=grad_clip_global_norm_tag)
  if updater_opts.get("gradient_nan_inf_filter", self.config.bool("gradient_nan_inf_filter", False)):
    from TFUtil import nan_to_num
    grad = nan_to_num(grad, nan_num=0.0, inf_num=0.0)
  if grad_norm_to_clip_to_zero:
    with tf.name_scope("grad_norm_to_clip_to_zero"):
      grad = global_info.set_zero_on_high_global_norm(
        grad, grad_norm_threshold=grad_norm_to_clip_to_zero, global_norm_tag=global_norm_tag)
  updater_opts.assert_all_read()
  opt_key, _ = self._get_optimizer_item_for_variable(var)
  apply_grad_opts = {
    "opt_key": opt_key, "accum_grad_multiple_num_steps": accum_grad_multiple_num_steps}
  return grad, apply_grad_opts
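# Illustrative sketch (not part of the original module): "accum_grad_multiple_step" above
# accumulates gradients over N consecutive steps and lets the optimizer apply an update only
# on every N-th step (see the tf.cond in _get_apply_grads_op below). A simplified pure-Python
# analogue with a hypothetical SGD update, showing only the control flow; the exact semantics
# (e.g. summing vs. averaging the accumulated gradient) are defined by accum_grad_multiple_step:
import numpy as np

def train_loop(grads_per_step, num_accum_steps, lr=0.1):
  var = np.zeros_like(grads_per_step[0])
  accum = np.zeros_like(var)
  for step, grad in enumerate(grads_per_step):
    accum += grad  # accumulate on every step
    if step % num_accum_steps == num_accum_steps - 1:
      var -= lr * accum  # apply only on every N-th step, like the true_fn branch of the tf.cond
      accum[:] = 0.0
  return var

example_grads = [np.array([1.0, 2.0])] * 4
print(train_loop(example_grads, num_accum_steps=2))  # -> [-0.4 -0.8], i.e. two applied updates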
def _get_apply_grads_op(self, loss, trainable_vars_for_gradients):
  """
  :param tf.Tensor loss:
  :param list[tf.Variable] trainable_vars_for_gradients:
  :return: op with all variable updates combined, using the optimizer
  :rtype: tf.Operation
  """
  if not trainable_vars_for_gradients:
    return tf.no_op(name="no_grad_vars_no_op")
  # AccumulateN might not be deterministic but should be faster and should require less memory.
  # We might want to make this configurable.
  if self.config.is_true("deterministic_train"):
    aggregation_method = tf.AggregationMethod.ADD_N
  else:
    aggregation_method = tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
  accum_grad_multiple_num_steps = self.config.int("accum_grad_multiple_step", 0)
  grad_noise = self.config.float("gradient_noise", 0.0)
  grad_clip = self.config.float("gradient_clip", 0.0)
  grad_clip_norm = self.config.float("gradient_clip_norm", 0.0)
  grad_clip_avg_norm = self.config.float("gradient_clip_avg_norm", 0.0)
  grad_clip_global_norm = self.config.float("gradient_clip_global_norm", 0.0)
  # E.g. https://github.com/openai/baselines/blob/master/baselines/deepq/simple.py:
  #   grad_norm_clipping=10 -> tf.clip_by_norm
  # Extended self.optimizer.minimize() to optionally modify gradients.
  grads_and_vars = self.optimizer.compute_gradients(
    loss, var_list=trainable_vars_for_gradients,
    aggregation_method=aggregation_method)
  if self.config.is_true("use_horovod") and self.config.value("horovod_reduce_type", "") == "grad":
    import horovod.tensorflow as hvd
    grads_and_vars = [
      (hvd.allreduce(grad, average=self.config.is_true("horovod_avg_grad")) if grad is not None else None, var)
      for (grad, var) in grads_and_vars]
  var_grads = {var: grad for (grad, var) in grads_and_vars if grad is not None}
  if not var_grads:
    raise Exception("no single variable to train")
  if self.config.float("maximize_grad_norm", 0):
    f = self.config.float("maximize_grad_norm", 0)
    grad_norm = tf.add_n([tf.nn.l2_loss(g) for g in var_grads.values()], name="grad_norm_half") * 2.0
    loss_ext = grad_norm * (-f)
    grads_and_vars_ext = self.optimizer.compute_gradients(
      loss_ext, var_list=list(var_grads.keys()),
      aggregation_method=aggregation_method)
    var_grads_ext = {var: grad for (grad, var) in grads_and_vars_ext if grad is not None}
    grads_and_vars = [(grad + var_grads_ext.get(var, 0.0), var) for (grad, var) in grads_and_vars]
  if accum_grad_multiple_num_steps >= 1:
    grads_and_vars = [
      (accum_grad_multiple_step(
        grad, var,
        train_step=self.network.global_train_step, num_accum_steps=accum_grad_multiple_num_steps),
       var)
      for (grad, var) in grads_and_vars]
  if self.config.bool("debug_grad_summaries", False):
    from TFUtil import variable_summaries, get_base_name, reuse_name_scope_of_tensor
    for grad, var in grads_and_vars:
      with reuse_name_scope_of_tensor(grad, prefix="grads/"):
        variable_summaries(grad, name="grad_of_%s" % get_base_name(var))
      with reuse_name_scope_of_tensor(var, prefix="vars/"):
        variable_summaries(var, name=get_base_name(var))
  # Also see tf.contrib.layers.optimizers.optimize_loss() for reference.
  if self.config.bool("gradient_nan_inf_filter", False):
    from TFUtil import nan_to_num
    grads_and_vars = [(nan_to_num(grad, nan_num=0.0, inf_num=0.0), var) for (grad, var) in grads_and_vars]
  if grad_noise:
    assert grad_noise > 0
    from TFUtil import add_scaled_noise_to_gradients
    with tf.name_scope("grad_noise"):
      grads_and_vars = add_scaled_noise_to_gradients(grads_and_vars, grad_noise)
  if grad_clip:
    assert grad_clip > 0
    with tf.name_scope("grad_clip"):
      grads_and_vars = [(tf.clip_by_value(grad, -grad_clip, grad_clip), var) for grad, var in grads_and_vars]
  if grad_clip_norm:
    assert grad_clip_norm > 0
    with tf.name_scope("grad_clip_norm"):
      grads_and_vars = [(tf.clip_by_norm(grad, grad_clip_norm), var) for grad, var in grads_and_vars]
  if grad_clip_avg_norm:
    assert grad_clip_avg_norm > 0
    with tf.name_scope("grad_clip_avg_norm"):
      grads_and_vars = [(tf.clip_by_average_norm(grad, grad_clip_avg_norm), var) for grad, var in grads_and_vars]
  if grad_clip_global_norm:
    assert grad_clip_global_norm > 0
    with tf.name_scope("grad_clip_global_norm"):
      grads_clipped, _ = tf.clip_by_global_norm([grad for (grad, _) in grads_and_vars], grad_clip_global_norm)
      grads_and_vars = zip(grads_clipped, [var for (_, var) in grads_and_vars])
  if accum_grad_multiple_num_steps >= 1:
    apply_grads = tf.cond(
      tf.equal(
        tf.mod(self.network.global_train_step, accum_grad_multiple_num_steps),
        accum_grad_multiple_num_steps - 1),
      true_fn=lambda: self.optimizer.apply_gradients(grads_and_vars),
      false_fn=lambda: tf.no_op(),
      name="apply_grads/accum_grad_multiple_step")
  else:
    apply_grads = self.optimizer.apply_gradients(grads_and_vars)
  return apply_grads
def create_optim_op(self):
  # Keep track of all current available vars.
  # The optimizer could add some, even some which are not so-called "slot-vars",
  # and we want to keep track about them.
  all_vars = tf.global_variables()  # type: list[tf.Variable]

  if not self.optimizer:
    self.create_optimizer()
  assert self.loss is not None
  with tf.variable_scope("optimize"):
    # AccumulateN might not be deterministic but should be faster and should require less memory.
    # We might want to make this configurable.
    aggregation_method = tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
    grad_noise = self.config.float("gradient_noise", 0.0)
    grad_clip = self.config.float("gradient_clip", 0.0)
    grad_clip_global_norm = self.config.float("gradient_clip_global_norm", 0.0)
    # Extended self.optimizer.minimize() to optionally modify gradients.
    grads_and_vars = self.optimizer.compute_gradients(
      self.loss, var_list=self.trainable_vars,
      aggregation_method=aggregation_method)
    if not [v for g, v in grads_and_vars if g is not None]:
      raise Exception("no single variable to train")
    # Also see tf.contrib.layers.optimizers.optimize_loss() for reference.
    if self.config.bool("gradient_nan_inf_filter", False):
      from TFUtil import nan_to_num
      grads_and_vars = [(nan_to_num(grad), var) for (grad, var) in grads_and_vars]
    if grad_noise:
      assert grad_noise > 0
      from TFUtil import add_scaled_noise_to_gradients
      grads_and_vars = add_scaled_noise_to_gradients(grads_and_vars, grad_noise)
    if grad_clip:
      assert grad_clip > 0
      grads_and_vars = [(tf.clip_by_value(grad, -grad_clip, grad_clip), var) for grad, var in grads_and_vars]
    if grad_clip_global_norm:
      assert grad_clip_global_norm > 0
      grads_clipped, _ = tf.clip_by_global_norm([grad for (grad, _) in grads_and_vars], grad_clip_global_norm)
      grads_and_vars = zip(grads_clipped, [var for (_, var) in grads_and_vars])
    apply_grads = self.optimizer.apply_gradients(grads_and_vars)
    incr_step_op = tf.assign_add(self.network.global_train_step, 1, name="global_train_step_increment")
    self.optim_op = tf.group(apply_grads, incr_step_op, name="optim_and_step_incr")

  print("Initialize optimizer with slots %s." % self.optimizer.get_slot_names(), file=log.v3)
  slot_vars = []
  for slot_name in self.optimizer.get_slot_names():
    for v in self.trainable_vars:
      slot_var = self.optimizer.get_slot(var=v, name=slot_name)
      assert slot_var is not None
      assert isinstance(slot_var, tf.Variable)
      slot_vars.append(slot_var)
  self.optimizer_vars = slot_vars

  # Check if there were any other variables added.
  # E.g. currently (TF 1.0) the `AdamOptimizer` creates these additional vars
  # `[<tf.Variable 'optimize/beta1_power:0' shape=() dtype=float32_ref>,
  #   <tf.Variable 'optimize/beta2_power:0' shape=() dtype=float32_ref>]`
  # which do not correspond to trainable vars, thus we did not get them as slot vars above.
  other_new_vars = []
  for v in tf.global_variables():
    if v in all_vars:
      continue
    if v in self.optimizer_vars:
      continue
    other_new_vars.append(v)
  if other_new_vars:
    print("These additional variables were created by the optimizer: %s." % other_new_vars, file=log.v3)
    self.optimizer_vars += other_new_vars

  self.optimizer_init_vars_op = tf.variables_initializer(self.optimizer_vars, name="init_optim_slot_vars")
  self.init_optimizer_vars()
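# Illustrative sketch (not part of the original module): the bookkeeping at the end of
# create_optim_op snapshots the global variables before the optimizer builds its state and
# diffs afterwards, so that non-slot variables (e.g. Adam's beta1_power/beta2_power) also get
# initialized. The same diff pattern in plain Python, with hypothetical variable names:
before = ["layer1/W", "layer1/b"]              # snapshot, like all_vars above
after = before + [
  "optimize/layer1/W/Adam",                    # slot var, found via optimizer.get_slot()
  "optimize/beta1_power"]                      # extra var, only found via the before/after diff
slot_vars = ["optimize/layer1/W/Adam"]
other_new_vars = [v for v in after if v not in before and v not in slot_vars]
print(other_new_vars)  # -> ['optimize/beta1_power']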