def _get_apply_grads_op(self, loss, trainable_vars_for_gradients):
  """
  :param tf.Tensor loss:
  :param list[tf.Variable] trainable_vars_for_gradients:
  :return: op with all variable updates combined, using the optimizer
  :rtype: tf.Operation
  """
  if not trainable_vars_for_gradients:
    return tf.no_op(name="no_grad_vars_no_op")
  # AccumulateN might not be deterministic but should be faster and should require less memory.
  # We might want to make this configurable.
  aggregation_method = tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
  grad_noise = self.config.float("gradient_noise", 0.0)
  grad_clip = self.config.float("gradient_clip", 0.0)
  grad_clip_global_norm = self.config.float("gradient_clip_global_norm", 0.0)
  # E.g. https://github.com/openai/baselines/blob/master/baselines/deepq/simple.py:
  #   grad_norm_clipping=10 -> tf.clip_by_norm
  # Extended self.optimizer.minimize() to optionally modify gradients.
  grads_and_vars = self.optimizer.compute_gradients(
    loss, var_list=trainable_vars_for_gradients, aggregation_method=aggregation_method)
  if not [v for g, v in grads_and_vars if g is not None]:
    raise Exception("no single variable to train")
  if self.config.bool("debug_grad_summaries", False):
    from TFUtil import variable_summaries, get_base_name, reuse_name_scope_of_tensor
    for grad, var in grads_and_vars:
      with reuse_name_scope_of_tensor(grad, prefix="grads/"):
        variable_summaries(grad, name="grad_of_%s" % get_base_name(var))
      with reuse_name_scope_of_tensor(var, prefix="vars/"):
        variable_summaries(var, name=get_base_name(var))
  # Also see tf.contrib.layers.optimizers.optimize_loss() for reference.
  if self.config.bool("gradient_nan_inf_filter", False):
    from TFUtil import nan_to_num
    grads_and_vars = [(nan_to_num(grad, nan_num=0.0, inf_num=0.0), var) for (grad, var) in grads_and_vars]
  if grad_noise:
    assert grad_noise > 0
    from TFUtil import add_scaled_noise_to_gradients
    grads_and_vars = add_scaled_noise_to_gradients(grads_and_vars, grad_noise)
  if grad_clip:
    assert grad_clip > 0
    grads_and_vars = [(tf.clip_by_value(grad, -grad_clip, grad_clip), var) for grad, var in grads_and_vars]
  if grad_clip_global_norm:
    assert grad_clip_global_norm > 0
    grads_clipped, _ = tf.clip_by_global_norm([grad for (grad, _) in grads_and_vars], grad_clip_global_norm)
    # Wrap in list() so grads_and_vars stays a reusable list (zip is a one-shot iterator in Python 3).
    grads_and_vars = list(zip(grads_clipped, [var for (_, var) in grads_and_vars]))
  apply_grads = self.optimizer.apply_gradients(grads_and_vars)
  return apply_grads

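# Illustration (not part of the original code): tf.clip_by_global_norm, as used above,
# rescales all gradients jointly by clip_norm / max(global_norm, clip_norm), where
# global_norm = sqrt(sum_i ||g_i||^2). A minimal standalone sketch, assuming TF 1.x
# graph mode; the tensor values and the function name are made up for demonstration only.
def _demo_clip_by_global_norm():
  """Sketch: joint rescaling of two gradients with tf.clip_by_global_norm."""
  grads = [tf.constant([3.0, 4.0]), tf.constant([0.0, 12.0])]  # global norm = sqrt(25 + 144) = 13
  clipped, global_norm = tf.clip_by_global_norm(grads, clip_norm=6.5)
  # Each gradient is scaled by 6.5 / 13 = 0.5, so the clipped values are
  # [1.5, 2.0] and [0.0, 6.0]; the returned global_norm is the pre-clipping norm (13).
  with tf.Session() as session:
    return session.run([clipped, global_norm])
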
def accum_grad_multiple_step(grad, var, train_step, num_accum_steps):
  """
  :param tf.Tensor|tf.IndexedSlices grad:
  :param tf.Variable var:
  :param tf.Tensor train_step: int, scalar
  :param int num_accum_steps:
  :return: modified grad
  :rtype: tf.Tensor
  """
  from TFUtil import reuse_name_scope_of_tensor, get_base_name
  with reuse_name_scope_of_tensor(grad, postfix="/%s_accum_grad" % get_base_name(grad)):
    shape = var.get_shape().as_list()
    v = tf.get_variable(
      name="var_accum_grad", shape=shape, dtype=grad.dtype,
      initializer=tf.zeros_initializer(), trainable=False)
    return tf.cond(
      tf.less_equal(tf.mod(train_step, num_accum_steps), 0),
      lambda: tf.assign(v, grad),
      lambda: tf.assign_add(v, grad))

def _post_process_grad(self, grad, var, global_info):
  """
  :param tf.Tensor grad:
  :param tf.Variable var:
  :param WrapOptimizer._GetGlobalInfo global_info:
  :return: new grad, apply grad opts
  :rtype: (tf.Tensor, dict[str])
  """
  updater_opts = self._get_updater_opts_from_var(var)
  accum_grad_multiple_num_steps = updater_opts.get(
    "accum_grad_multiple_step", self.config.int("accum_grad_multiple_step", 0))
  grad_noise = updater_opts.get("gradient_noise", self.config.float("gradient_noise", 0.0))
  grad_clip = updater_opts.get("gradient_clip", self.config.float("gradient_clip", 0.0))
  # E.g. https://github.com/openai/baselines/blob/master/baselines/deepq/simple.py:
  #   grad_norm_clipping=10 -> tf.clip_by_norm
  grad_clip_norm = updater_opts.get("gradient_clip_norm", self.config.float("gradient_clip_norm", 0.0))
  grad_clip_avg_norm = updater_opts.get("gradient_clip_avg_norm", self.config.float("gradient_clip_avg_norm", 0.0))
  grad_clip_global_norm = updater_opts.get(
    "gradient_clip_global_norm", self.config.float("gradient_clip_global_norm", 0.0))
  global_norm_tag = updater_opts.get(
    "global_norm_tag", self.config.value("global_norm_tag", None))
  grad_clip_global_norm_tag = updater_opts.get(
    "gradient_clip_global_norm_tag", self.config.value("gradient_clip_global_norm_tag", global_norm_tag))
  grad_norm_to_clip_to_zero = updater_opts.get(
    "grad_norm_to_clip_to_zero", self.config.float("grad_norm_to_clip_to_zero", 0.0))
  maximize_grad_norm = updater_opts.get("maximize_grad_norm", self.config.float("maximize_grad_norm", 0))

  if maximize_grad_norm:
    grad_ext = global_info.get_maximize_grad_norm_grad(maximize_grad_norm, var)
    if grad_ext is not None:
      grad += grad_ext

  if accum_grad_multiple_num_steps >= 1:
    grad = accum_grad_multiple_step(
      grad, var, train_step=self.global_train_step, num_accum_steps=accum_grad_multiple_num_steps)

  if updater_opts.get("debug_grad_summaries", self.config.bool_or_other("debug_grad_summaries", False)):
    from TFUtil import variable_summaries, get_base_name, reuse_name_scope_of_tensor
    with reuse_name_scope_of_tensor(grad, prefix="grads/"):
      variable_summaries(grad, name="grad_of_%s" % get_base_name(var))
    with reuse_name_scope_of_tensor(var, prefix="vars/"):
      variable_summaries(var, name=get_base_name(var))

  # Also see tf.contrib.layers.optimizers.optimize_loss() for reference.
  if grad_noise:
    assert grad_noise > 0
    from TFUtil import add_scaled_noise_to_gradients
    with tf.name_scope("grad_noise"):
      (grad, var), = add_scaled_noise_to_gradients([(grad, var)], grad_noise)
  if grad_clip:
    assert grad_clip > 0
    with tf.name_scope("grad_clip"):
      grad = tf.clip_by_value(grad, -grad_clip, grad_clip)
  if grad_clip_norm:
    assert grad_clip_norm > 0
    with tf.name_scope("grad_clip_norm"):
      grad = tf.clip_by_norm(grad, grad_clip_norm)
  if grad_clip_avg_norm:
    assert grad_clip_avg_norm > 0
    with tf.name_scope("grad_clip_avg_norm"):
      grad = tf.clip_by_average_norm(grad, grad_clip_avg_norm)
  if grad_clip_global_norm:
    assert grad_clip_global_norm > 0
    with tf.name_scope("grad_clip_global_norm"):
      grad = global_info.clip_by_global_norm(
        grad, clip_norm=grad_clip_global_norm, global_norm_tag=grad_clip_global_norm_tag)
  if updater_opts.get("gradient_nan_inf_filter", self.config.bool("gradient_nan_inf_filter", False)):
    from TFUtil import nan_to_num
    grad = nan_to_num(grad, nan_num=0.0, inf_num=0.0)
  if grad_norm_to_clip_to_zero:
    with tf.name_scope("grad_norm_to_clip_to_zero"):
      grad = global_info.set_zero_on_high_global_norm(
        grad, grad_norm_threshold=grad_norm_to_clip_to_zero, global_norm_tag=global_norm_tag)

  updater_opts.assert_all_read()
  opt_key, _ = self._get_optimizer_item_for_variable(var)
  apply_grad_opts = {
    "opt_key": opt_key, "accum_grad_multiple_num_steps": accum_grad_multiple_num_steps}
  return grad, apply_grad_opts

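# Illustration (not part of the original code): the options above are read per variable from
# updater_opts and fall back to the corresponding global config keys. A hypothetical example
# of those global settings; the dict name and the numeric values are arbitrary and for
# demonstration only, not recommendations.
_example_updater_config = {
  "gradient_clip": 5.0,                # element-wise clipping via tf.clip_by_value
  "gradient_clip_global_norm": 1.0,    # joint rescaling via clip_by_global_norm
  "gradient_noise": 0.3,               # scaled noise added to the gradient
  "accum_grad_multiple_step": 4,       # accumulate over 4 steps before applying
  "grad_norm_to_clip_to_zero": 100.0,  # zero the update when the global norm exceeds this
  "gradient_nan_inf_filter": True,     # replace NaN/Inf entries in the gradient with 0
}
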
def _get_apply_grads_op(self, loss, trainable_vars_for_gradients):
  """
  :param tf.Tensor loss:
  :param list[tf.Variable] trainable_vars_for_gradients:
  :return: op with all variable updates combined, using the optimizer
  :rtype: tf.Operation
  """
  if not trainable_vars_for_gradients:
    return tf.no_op(name="no_grad_vars_no_op")
  # AccumulateN might not be deterministic but should be faster and should require less memory.
  # We might want to make this configurable.
  if self.config.is_true("deterministic_train"):
    aggregation_method = tf.AggregationMethod.ADD_N
  else:
    aggregation_method = tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
  accum_grad_multiple_num_steps = self.config.int("accum_grad_multiple_step", 0)
  grad_noise = self.config.float("gradient_noise", 0.0)
  grad_clip = self.config.float("gradient_clip", 0.0)
  grad_clip_norm = self.config.float("gradient_clip_norm", 0.0)
  grad_clip_avg_norm = self.config.float("gradient_clip_avg_norm", 0.0)
  grad_clip_global_norm = self.config.float("gradient_clip_global_norm", 0.0)
  # E.g. https://github.com/openai/baselines/blob/master/baselines/deepq/simple.py:
  #   grad_norm_clipping=10 -> tf.clip_by_norm
  # Extended self.optimizer.minimize() to optionally modify gradients.
  grads_and_vars = self.optimizer.compute_gradients(
    loss, var_list=trainable_vars_for_gradients, aggregation_method=aggregation_method)
  if self.config.is_true("use_horovod") and self.config.value("horovod_reduce_type", "") == "grad":
    import horovod.tensorflow as hvd
    grads_and_vars = [
      (hvd.allreduce(grad, average=self.config.is_true("horovod_avg_grad")) if grad is not None else None, var)
      for (grad, var) in grads_and_vars]
  var_grads = {var: grad for (grad, var) in grads_and_vars if grad is not None}
  if not var_grads:
    raise Exception("no single variable to train")
  if self.config.float("maximize_grad_norm", 0):
    f = self.config.float("maximize_grad_norm", 0)
    grad_norm = tf.add_n([tf.nn.l2_loss(g) for g in var_grads.values()], name="grad_norm_half") * 2.0
    loss_ext = grad_norm * (-f)
    grads_and_vars_ext = self.optimizer.compute_gradients(
      loss_ext, var_list=list(var_grads.keys()), aggregation_method=aggregation_method)
    var_grads_ext = {var: grad for (grad, var) in grads_and_vars_ext if grad is not None}
    grads_and_vars = [(grad + var_grads_ext.get(var, 0.0), var) for (grad, var) in grads_and_vars]
  if accum_grad_multiple_num_steps >= 1:
    grads_and_vars = [
      (accum_grad_multiple_step(
        grad, var, train_step=self.network.global_train_step, num_accum_steps=accum_grad_multiple_num_steps),
       var)
      for (grad, var) in grads_and_vars]
  if self.config.bool("debug_grad_summaries", False):
    from TFUtil import variable_summaries, get_base_name, reuse_name_scope_of_tensor
    for grad, var in grads_and_vars:
      with reuse_name_scope_of_tensor(grad, prefix="grads/"):
        variable_summaries(grad, name="grad_of_%s" % get_base_name(var))
      with reuse_name_scope_of_tensor(var, prefix="vars/"):
        variable_summaries(var, name=get_base_name(var))
  # Also see tf.contrib.layers.optimizers.optimize_loss() for reference.
  if self.config.bool("gradient_nan_inf_filter", False):
    from TFUtil import nan_to_num
    grads_and_vars = [(nan_to_num(grad, nan_num=0.0, inf_num=0.0), var) for (grad, var) in grads_and_vars]
  if grad_noise:
    assert grad_noise > 0
    from TFUtil import add_scaled_noise_to_gradients
    with tf.name_scope("grad_noise"):
      grads_and_vars = add_scaled_noise_to_gradients(grads_and_vars, grad_noise)
  if grad_clip:
    assert grad_clip > 0
    with tf.name_scope("grad_clip"):
      grads_and_vars = [(tf.clip_by_value(grad, -grad_clip, grad_clip), var) for grad, var in grads_and_vars]
  if grad_clip_norm:
    assert grad_clip_norm > 0
    with tf.name_scope("grad_clip_norm"):
      grads_and_vars = [(tf.clip_by_norm(grad, grad_clip_norm), var) for grad, var in grads_and_vars]
  if grad_clip_avg_norm:
    assert grad_clip_avg_norm > 0
    with tf.name_scope("grad_clip_avg_norm"):
      grads_and_vars = [(tf.clip_by_average_norm(grad, grad_clip_avg_norm), var) for grad, var in grads_and_vars]
  if grad_clip_global_norm:
    assert grad_clip_global_norm > 0
    with tf.name_scope("grad_clip_global_norm"):
      grads_clipped, _ = tf.clip_by_global_norm([grad for (grad, _) in grads_and_vars], grad_clip_global_norm)
      # Wrap in list() so grads_and_vars stays a reusable list (zip is a one-shot iterator in Python 3).
      grads_and_vars = list(zip(grads_clipped, [var for (_, var) in grads_and_vars]))
  if accum_grad_multiple_num_steps >= 1:
    apply_grads = tf.cond(
      tf.equal(
        tf.mod(self.network.global_train_step, accum_grad_multiple_num_steps),
        accum_grad_multiple_num_steps - 1),
      true_fn=lambda: self.optimizer.apply_gradients(grads_and_vars),
      false_fn=lambda: tf.no_op(),
      name="apply_grads/accum_grad_multiple_step")
  else:
    apply_grads = self.optimizer.apply_gradients(grads_and_vars)
  return apply_grads

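# Illustration (not part of the original code): how gradient accumulation and the gated
# apply step above fit together. Gradients go into a non-trainable buffer that is reset on
# the first step of each accumulation window and added to on the others, and the optimizer
# update only fires on the last step of the window. A minimal standalone sketch, assuming
# TF 1.x graph mode; all demo_* names are hypothetical.
def _demo_grad_accumulation(optimizer, grad, var, global_train_step, num_accum_steps=4):
  """
  :param tf.train.Optimizer optimizer:
  :param tf.Tensor grad:
  :param tf.Variable var:
  :param tf.Tensor global_train_step: int, scalar
  :param int num_accum_steps:
  :rtype: tf.Operation
  """
  buf = tf.get_variable(
    name="demo_accum_buf", shape=var.get_shape(), dtype=grad.dtype,
    initializer=tf.zeros_initializer(), trainable=False)
  step_in_window = tf.mod(global_train_step, num_accum_steps)
  # Reset the buffer at the start of each accumulation window, otherwise add to it.
  accum = tf.cond(
    tf.equal(step_in_window, 0),
    lambda: tf.assign(buf, grad),
    lambda: tf.assign_add(buf, grad))
  # Apply the accumulated gradient only on the last step of the window.
  return tf.cond(
    tf.equal(step_in_window, num_accum_steps - 1),
    lambda: optimizer.apply_gradients([(accum, var)]),
    lambda: tf.no_op(),
    name="demo_apply_grads_accum")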