def map_layer_inputs_to_op(self, X, W, b, i, initial_state=None):
  """
  Just like NativeOp.LstmGenericBase.map_layer_inputs_to_op().

  :param tf.Tensor X: inputs: shape (time,batch,n_input_dim)
  :param tf.Tensor W: shape (n_input_dim+n_hidden,n_hidden*4)
  :param tf.Tensor b: shape (n_hidden*4,)
  :param tf.Tensor i: index: shape (time,batch)
  :param tf.Tensor|rnn_cell.LSTMStateTuple|None initial_state: shape (batch,n_hidden),
    or an LSTMStateTuple providing both c and h (consistent with the sibling
    map_layer_inputs_to_op variants)
  :rtype: tuple[tf.Tensor]
  """
  from tensorflow.python.ops.nn import rnn_cell
  X.set_shape(tf.TensorShape([None, None, self.n_input_dim]))
  W.set_shape(tf.TensorShape([self.n_input_dim + self.n_hidden, self.n_hidden * 4]))
  # Also assert the bias shape, consistent with X/W/i above.
  b.set_shape(tf.TensorShape([self.n_hidden * 4]))
  i.set_shape(tf.TensorShape([None, None]))
  if i.dtype != tf.float32:
    # Cache the cast op on the tensor so repeated calls reuse it.
    if not hasattr(i, "cast_float32"):
      from TFUtil import reuse_name_scope_of_tensor
      with reuse_name_scope_of_tensor(i):
        i_cast_float32 = tf.cast(i, dtype=tf.float32, name="index_cast_float32")
      i.cast_float32 = i_cast_float32
    i = i.cast_float32
  n_batch = tf.shape(X)[1]
  if initial_state is None:
    c0 = tf.zeros((n_batch, self.n_hidden), dtype=tf.float32, name="initial_c")
    # We could make `h` a variable exactly if `c` is a trainable variable.
    y0 = tf.zeros((n_batch, self.n_hidden), dtype=tf.float32, name="initial_h")
  elif isinstance(initial_state, rnn_cell.LSTMStateTuple):
    # Accept an LSTMStateTuple like the other map_layer_inputs_to_op variants do.
    c0 = initial_state.c
    y0 = initial_state.h
  else:
    c0 = initial_state
    y0 = tf.zeros((n_batch, self.n_hidden), dtype=tf.float32, name="initial_h")
  start = tf.constant(0, name="start")
  step = tf.constant(self.step or 1, name="step")
  return X, W, b, y0, c0, i, start, step
def map_layer_inputs_to_op(cls, Z, V_h, i, initial_state=None):
  """
  Just like NativeOp.LstmGenericBase.map_layer_inputs_to_op().

  :param tf.Tensor Z: inputs: shape (time,batch,n_hidden*4)
  :param tf.Tensor V_h: W_re: shape (n_hidden,n_hidden*4)
  :param tf.Tensor i: index: shape (time,batch)
  :param tf.Tensor|None initial_state: shape (batch,n_hidden)
  :rtype: (tf.Tensor,tf.Tensor,tf.Tensor,tf.Tensor)
  """
  assert Z.get_shape().ndims == 3
  assert V_h.get_shape().ndims == 2
  assert i.get_shape().ndims == 2
  if i.dtype != tf.float32:
    # Cache the casted index on the tensor itself, so repeated calls share one cast op.
    if not hasattr(i, "cast_float32"):
      from TFUtil import reuse_name_scope_of_tensor
      with reuse_name_scope_of_tensor(i):
        i.cast_float32 = tf.cast(i, dtype=tf.float32, name="index_cast_float32")
    i = i.cast_float32
  n_batch = tf.shape(Z)[1]
  n_out = tf.shape(V_h)[0]
  c = initial_state
  if c is not None:
    from tensorflow.python.ops.nn import rnn_cell
    if isinstance(c, rnn_cell.LSTMStateTuple):
      # Only the cell state is used here.
      c = c.c
  else:
    c = tf.zeros((n_batch, n_out), dtype=tf.float32)
  return Z, V_h, c, i
def _get_apply_grads_op(self, loss, trainable_vars_for_gradients):
  """
  :param tf.Tensor loss:
  :param list[tf.Variable] trainable_vars_for_gradients:
  :return: op with all variable updates combined, using the optimizer
  :rtype: tf.Operation
  """
  if not trainable_vars_for_gradients:
    return tf.no_op(name="no_grad_vars_no_op")
  # AccumulateN might not be deterministic but should be faster and should require less memory.
  # We might want to make this configurable.
  aggregation_method = tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
  grad_noise = self.config.float("gradient_noise", 0.0)
  grad_clip = self.config.float("gradient_clip", 0.0)
  grad_clip_global_norm = self.config.float("gradient_clip_global_norm", 0.0)
  # E.g. https://github.com/openai/baselines/blob/master/baselines/deepq/simple.py:
  #   grad_norm_clipping=10 -> tf.clip_by_norm
  # Extended self.optimizer.minimize() to optionally modify gradients.
  grads_and_vars = self.optimizer.compute_gradients(
    loss, var_list=trainable_vars_for_gradients, aggregation_method=aggregation_method)
  # compute_gradients() yields grad=None for vars without a path to the loss.
  # Drop those pairs right away: all the transforms below (summaries, nan_to_num,
  # clip_by_value, clip_by_global_norm) would fail on a None gradient, and
  # apply_gradients() would skip such vars anyway.
  grads_and_vars = [(g, v) for (g, v) in grads_and_vars if g is not None]
  if not grads_and_vars:
    raise Exception("no single variable to train")
  if self.config.bool("debug_grad_summaries", False):
    from TFUtil import variable_summaries, get_base_name, reuse_name_scope_of_tensor
    for grad, var in grads_and_vars:
      with reuse_name_scope_of_tensor(grad, prefix="grads/"):
        variable_summaries(grad, name="grad_of_%s" % get_base_name(var))
      with reuse_name_scope_of_tensor(var, prefix="vars/"):
        variable_summaries(var, name=get_base_name(var))
  # Also see tf.contrib.layers.optimizers.optimize_loss() for reference.
  if self.config.bool("gradient_nan_inf_filter", False):
    from TFUtil import nan_to_num
    grads_and_vars = [(nan_to_num(grad, nan_num=0.0, inf_num=0.0), var) for (grad, var) in grads_and_vars]
  if grad_noise:
    assert grad_noise > 0
    from TFUtil import add_scaled_noise_to_gradients
    grads_and_vars = add_scaled_noise_to_gradients(grads_and_vars, grad_noise)
  if grad_clip:
    assert grad_clip > 0
    grads_and_vars = [(tf.clip_by_value(grad, -grad_clip, grad_clip), var) for grad, var in grads_and_vars]
  if grad_clip_global_norm:
    assert grad_clip_global_norm > 0
    grads_clipped, _ = tf.clip_by_global_norm([grad for (grad, _) in grads_and_vars], grad_clip_global_norm)
    # Materialize as a list: a lazy zip iterator would be exhausted after a single pass.
    grads_and_vars = list(zip(grads_clipped, [var for (_, var) in grads_and_vars]))
  apply_grads = self.optimizer.apply_gradients(grads_and_vars)
  return apply_grads
def accum_grad_multiple_step(grad, var, train_step, num_accum_steps):
  """
  :param tf.Tensor|tf.IndexedSlices grad:
  :param tf.Variable var:
  :param tf.Tensor train_step: int, scalar
  :param int num_accum_steps:
  :return: modified grad
  :rtype: tf.Tensor
  """
  from TFUtil import reuse_name_scope_of_tensor, get_base_name
  with reuse_name_scope_of_tensor(grad, postfix="/%s_accum_grad" % get_base_name(grad)):
    # Non-trainable accumulator variable, same shape/dtype as the gradient.
    accum_var = tf.get_variable(
      name="var_accum_grad",
      shape=var.get_shape().as_list(),
      dtype=grad.dtype,
      initializer=tf.zeros_initializer(),
      trainable=False)
    # At the start of each accumulation cycle, overwrite with the fresh grad;
    # otherwise add on top of the accumulated value.
    at_cycle_start = tf.less_equal(tf.mod(train_step, num_accum_steps), 0)
    return tf.cond(
      at_cycle_start,
      lambda: tf.assign(accum_var, grad),
      lambda: tf.assign_add(accum_var, grad))
def map_layer_inputs_to_op(self, X, W, i, initial_state=None):
  """
  Just like NativeOp.LstmGenericBase.map_layer_inputs_to_op().

  :param tf.Tensor X: inputs: shape (time,batch,n_input_dim)
  :param tf.Tensor W: shape (n_input_dim+n_hidden,n_hidden*4)
  :param tf.Tensor i: index: shape (time,batch)
  :param tf.Tensor|None initial_state: shape (batch,n_hidden)
  :rtype: tuple[tf.Tensor]
  """
  from tensorflow.python.ops.nn import rnn_cell
  X.set_shape(tf.TensorShape([None, None, self.n_hidden * 4]))
  W.set_shape(tf.TensorShape([self.n_hidden, self.n_hidden * 4]))
  i.set_shape(tf.TensorShape([None, None]))
  if i.dtype != tf.float32:
    # Cache the casted index on the tensor so repeated calls reuse one cast op.
    if not hasattr(i, "cast_float32"):
      from TFUtil import reuse_name_scope_of_tensor
      with reuse_name_scope_of_tensor(i):
        i.cast_float32 = tf.cast(i, dtype=tf.float32, name="index_cast_float32")
    i = i.cast_float32
  n_batch = tf.shape(X)[1]

  def _zeros(name):
    # Fresh all-zero state of shape (batch, n_hidden).
    return tf.zeros((n_batch, self.n_hidden), dtype=tf.float32, name=name)

  if initial_state is None:
    c0 = _zeros("initial_c")
    y0 = _zeros("initial_h")
  elif isinstance(initial_state, rnn_cell.LSTMStateTuple):
    c0, y0 = initial_state.c, initial_state.h
  else:
    c0, y0 = initial_state, _zeros("initial_h")
  start = tf.constant(0, name="start")
  step = tf.constant(self.step or 1, name="step")
  return X, W, y0, c0, i, start, step
def accum_grad_multiple_step(grad, var, train_step, num_accum_steps):
  """
  :param tf.Tensor|tf.IndexedSlices grad:
  :param tf.Variable var:
  :param tf.Tensor train_step: int, scalar
  :param int num_accum_steps:
  :return: modified grad
  :rtype: tf.Tensor
  """
  from TFUtil import reuse_name_scope_of_tensor, get_base_name
  scope_postfix = "/%s_accum_grad" % get_base_name(grad)
  with reuse_name_scope_of_tensor(grad, postfix=scope_postfix):
    v = tf.get_variable(
      name="var_accum_grad", shape=var.get_shape().as_list(), dtype=grad.dtype,
      initializer=tf.zeros_initializer(), trainable=False)

    def _reset():
      # First step of an accumulation cycle: start over from the current grad.
      return tf.assign(v, grad)

    def _accumulate():
      # Any other step: add the current grad on top.
      return tf.assign_add(v, grad)

    return tf.cond(tf.less_equal(tf.mod(train_step, num_accum_steps), 0), _reset, _accumulate)
def _post_process_grad(self, grad, var, global_info):
  """
  Post-processes a single gradient before it is handed to the optimizer:
  optional maximize-grad-norm extension, multi-step accumulation, debug
  summaries, gradient noise, the various clipping variants, nan/inf
  filtering, and zero-on-high-global-norm. The order of these transforms
  is fixed and significant (e.g. noise is added before any clipping).

  :param tf.Tensor grad:
  :param tf.Variable var:
  :param WrapOptimizer._GetGlobalInfo global_info:
  :return: new grad, apply grad opts
  :rtype: tf.Tensor, dict[str]
  """
  # Per-variable updater opts override the corresponding global config values.
  updater_opts = self._get_updater_opts_from_var(var)
  accum_grad_multiple_num_steps = updater_opts.get(
    "accum_grad_multiple_step", self.config.int("accum_grad_multiple_step", 0))
  grad_noise = updater_opts.get("gradient_noise", self.config.float("gradient_noise", 0.0))
  grad_clip = updater_opts.get("gradient_clip", self.config.float("gradient_clip", 0.0))
  # E.g. https://github.com/openai/baselines/blob/master/baselines/deepq/simple.py:
  # grad_norm_clipping=10 -> tf.clip_by_norm
  grad_clip_norm = updater_opts.get("gradient_clip_norm", self.config.float("gradient_clip_norm", 0.0))
  grad_clip_avg_norm = updater_opts.get("gradient_clip_avg_norm", self.config.float("gradient_clip_avg_norm", 0.0))
  grad_clip_global_norm = updater_opts.get(
    "gradient_clip_global_norm", self.config.float("gradient_clip_global_norm", 0.0))
  global_norm_tag = updater_opts.get(
    "global_norm_tag", self.config.value("global_norm_tag", None))
  # The clip-specific tag defaults to the generic global_norm_tag.
  grad_clip_global_norm_tag = updater_opts.get(
    "gradient_clip_global_norm_tag", self.config.value("gradient_clip_global_norm_tag", global_norm_tag))
  grad_norm_to_clip_to_zero = updater_opts.get(
    "grad_norm_to_clip_to_zero", self.config.float("grad_norm_to_clip_to_zero", 0.0))
  maximize_grad_norm = updater_opts.get("maximize_grad_norm", self.config.float("maximize_grad_norm", 0))
  if maximize_grad_norm:
    # Extra gradient term from the maximize-grad-norm meta objective, if one
    # exists for this var (see _GetGlobalInfo).
    grad_ext = global_info.get_maximize_grad_norm_grad(maximize_grad_norm, var)
    if grad_ext is not None:
      grad += grad_ext
  if accum_grad_multiple_num_steps >= 1:
    # Accumulate the grad over multiple train steps into an extra variable.
    # The step count is also passed on via apply_grad_opts below — presumably
    # the caller applies the update only every Nth step; confirm at the call site.
    grad = accum_grad_multiple_step(
      grad, var, train_step=self.global_train_step, num_accum_steps=accum_grad_multiple_num_steps)
  if updater_opts.get("debug_grad_summaries", self.config.bool_or_other("debug_grad_summaries", False)):
    from TFUtil import variable_summaries, get_base_name, reuse_name_scope_of_tensor
    with reuse_name_scope_of_tensor(grad, prefix="grads/"):
      variable_summaries(grad, name="grad_of_%s" % get_base_name(var))
    with reuse_name_scope_of_tensor(var, prefix="vars/"):
      variable_summaries(var, name=get_base_name(var))
  # Also see tf.contrib.layers.optimizers.optimize_loss() for reference.
  if grad_noise:
    assert grad_noise > 0
    from TFUtil import add_scaled_noise_to_gradients
    with tf.name_scope("grad_noise"):
      (grad, var), = add_scaled_noise_to_gradients([(grad, var)], grad_noise)
  if grad_clip:
    assert grad_clip > 0
    with tf.name_scope("grad_clip"):
      grad = tf.clip_by_value(grad, -grad_clip, grad_clip)
  if grad_clip_norm:
    assert grad_clip_norm > 0
    with tf.name_scope("grad_clip_norm"):
      grad = tf.clip_by_norm(grad, grad_clip_norm)
  if grad_clip_avg_norm:
    assert grad_clip_avg_norm > 0
    with tf.name_scope("grad_clip_avg_norm"):
      grad = tf.clip_by_average_norm(grad, grad_clip_avg_norm)
  if grad_clip_global_norm:
    assert grad_clip_global_norm > 0
    with tf.name_scope("grad_clip_global_norm"):
      # The norm is computed via global_info; the tag selects which group of
      # grads shares one global norm.
      grad = global_info.clip_by_global_norm(
        grad, clip_norm=grad_clip_global_norm, global_norm_tag=grad_clip_global_norm_tag)
  if updater_opts.get("gradient_nan_inf_filter", self.config.bool("gradient_nan_inf_filter", False)):
    from TFUtil import nan_to_num
    # Replace nan/inf entries by 0 (after clipping, before zero-on-high-norm).
    grad = nan_to_num(grad, nan_num=0.0, inf_num=0.0)
  if grad_norm_to_clip_to_zero:
    with tf.name_scope("grad_norm_to_clip_to_zero"):
      grad = global_info.set_zero_on_high_global_norm(
        grad, grad_norm_threshold=grad_norm_to_clip_to_zero, global_norm_tag=global_norm_tag)
  # Fail loudly on typos in per-variable updater opts.
  updater_opts.assert_all_read()
  opt_key, _ = self._get_optimizer_item_for_variable(var)
  apply_grad_opts = {
    "opt_key": opt_key, "accum_grad_multiple_num_steps": accum_grad_multiple_num_steps}
  return grad, apply_grad_opts
def create_optim_op(self):
  """
  Creates the optimize TF op.

  :return: nothing, will just set self.optim_op
  """
  assert self.loss is not None
  assert self.trainable_vars, "no variables to update/optimize"
  from TFUtil import MetaLosses
  # Keep track of all current available vars.
  # The optimizer could add some, even some which are not so-called "slot-vars",
  # and we want to keep track about them.
  all_prev_existing_vars = tf.global_variables()  # type: typing.List[tf.Variable]
  trainable_vars_for_gradients = list(self.trainable_vars)
  trainable_vars_custom_update = []  # type: typing.List[tf.Variable]
  # Vars marked with a returnn_custom_update attribute bypass gradient-based
  # optimization; they get their own update op below.
  for v in self.trainable_vars:
    if hasattr(v, "returnn_custom_update"):
      trainable_vars_custom_update.append(v)
      trainable_vars_for_gradients.remove(v)
  if not self.optimizer:
    self.optimizer = WrapOptimizer(
      config=self.config,
      learning_rate=self.get_current_step_learning_rate(),
      global_train_step=self.network.global_train_step,
      use_locking=self.use_locking)
    self.optimizer.create_all_needed_optimizers(trainable_vars_for_gradients)
  with tf.variable_scope("optimize"):
    # Collect meta losses (e.g. from synthetic gradients) produced while
    # building the main gradients.
    meta_losses_scope = MetaLosses.enter_gradient_scope()
    apply_grads = self.optimizer.get_apply_grads_op(self.loss, trainable_vars_for_gradients)
    meta_losses_scope.exit()
    self.optim_meta_losses_dict = meta_losses_scope.losses_as_fetch_dict()
    if meta_losses_scope.losses:
      # Apply a second optimization step for the summed meta loss.
      with tf.name_scope("meta_loss"):
        meta_loss = meta_losses_scope.summed_loss_for_optimization()
        meta_apply_grads = self.optimizer.get_apply_grads_op(meta_loss, trainable_vars_for_gradients)
      apply_grads = tf.group(apply_grads, meta_apply_grads)
    self.optim_op = apply_grads
  if trainable_vars_custom_update:
    with tf.variable_scope("custom_update"):
      updates = [self.optim_op]
      for param in trainable_vars_custom_update:
        custom_update = getattr(param, "returnn_custom_update")
        assert isinstance(custom_update, CustomUpdate)
        updates.append(custom_update.update_var(param))
      self.optim_op = tf.group(*updates)
  if self.constraints is not None:
    # Decoupled constraint optimization: plain SGD on the constraints,
    # scheduled after the main update via control_dependencies.
    with tf.variable_scope("optimize_constraints"):
      with tf.variable_scope("factor"):
        # Scale with the relative learning rate so the constraint step follows
        # the LR schedule.
        factor = (self.get_current_step_learning_rate() / float(self.initial_learning_rate))
        factor *= self.config.float("decouple_constraints_factor", 0.025)
      sgd_optimizer = tf.train.GradientDescentOptimizer(
        learning_rate=factor, use_locking=self.use_locking)
      with tf.control_dependencies([self.optim_op]):
        self.optim_op = sgd_optimizer.minimize(self.constraints, var_list=self.trainable_vars)
  if self.config.opt_typed_value("extra_updates"):
    # User-defined extra update ops per variable, run after the main update.
    extra_updates = self.config.typed_dict["extra_updates"]
    assert isinstance(extra_updates, dict)  # dict var_name -> function(var)
    # Strip the ":0" output suffix from the var names for lookup.
    vars_by_name = {v.name[:-2]: v for v in all_prev_existing_vars}
    extra_updates_op_list = []
    from Util import getargspec
    from TFUtil import get_var_update_ops, get_variable_grad_from_update_ops
    for var_name, func in extra_updates.items():
      # Inspect the user function signature to decide which kwargs to pass.
      func_arg_names = getargspec(func).args
      assert var_name in vars_by_name, "var with name %r not found. vars:\n%s" % (
        var_name, "\n".join(sorted(vars_by_name.keys())))
      var = vars_by_name[var_name]
      assert isinstance(var, tf.Variable)
      ops = get_var_update_ops(var, fetches=self.optim_op)
      with tf.control_dependencies(ops):
        func_kwargs = {"var": var}
        if "network" in func_arg_names:
          func_kwargs["network"] = self.network
        if "update_ops" in func_arg_names:
          func_kwargs["update_ops"] = ops
        if "grad" in func_arg_names:
          func_kwargs["grad"] = get_variable_grad_from_update_ops(var, ops)
        op = func(**func_kwargs)
        assert isinstance(op, (tf.Operation, tf.Tensor))
        extra_updates_op_list.append(op)
    self.optim_op = tf.group(self.optim_op, *extra_updates_op_list)
  # Collect the optimizer slot vars (e.g. Adam moments) for later init/saving.
  slot_names_per_optimizer = self.optimizer.get_slot_names_per_optimizer()
  slot_vars = []
  for opt_key, slot_names in slot_names_per_optimizer.items():
    print("Initialize optimizer (%s) with slots %s." % (opt_key or "default", slot_names), file=log.v3)
    for slot_name in slot_names:
      for v in self.optimizer.filter_var_list_per_optimizer_key(trainable_vars_for_gradients, opt_key=opt_key):
        slot_var = self.optimizer.get_slot(var=v, name=slot_name)
        if slot_var is None:
          print("Warning: No slot_var found for variable %r, slot_name %r. Maybe no gradient for this var?" % (
            v, slot_name), file=log.v3)
        else:
          assert isinstance(slot_var, tf.Variable)
          slot_vars.append(slot_var)
  self.optimizer_vars = slot_vars
  # Check if there were any other variables added.
  # E.g. currently (TF 1.0) the `AdamOptimizer` creates these additional vars
  # `[<tf.Variable 'optimize/beta1_power:0' shape=() dtype=float32_ref>,
  #   <tf.Variable 'optimize/beta2_power:0' shape=() dtype=float32_ref>]`
  # which do not correspond to trainable vars, thus we did not get them as slot vars above.
  other_new_vars = []
  for v in tf.global_variables():
    if v in all_prev_existing_vars:
      continue
    if v in self.optimizer_vars:
      continue
    other_new_vars.append(v)
  if other_new_vars:
    print("These additional variable were created by the optimizer: %s." % other_new_vars, file=log.v3)
    self.optimizer_vars += other_new_vars
  with tf.name_scope("optimizer_init_vars"):
    self.optimizer_init_vars_op = tf.variables_initializer(self.optimizer_vars, name="init_optim_slot_vars")
  if self.config.bool_or_other("debug_grad_summaries", False):
    from TFUtil import variable_summaries, get_base_name, reuse_name_scope_of_tensor
    for key in self.network.used_data_keys:
      data = self.network.extern_data.data[key]
      if data.sparse:
        continue
      with reuse_name_scope_of_tensor(data.placeholder):
        variable_summaries(data.placeholder)
  if self.config.bool("debug_add_check_numerics_ops", False):  # also see debug_add_check_numerics_on_output
    print("Adding checks for inf/nan.", file=log.v3)
    self.optim_op = tf.group(self.optim_op, add_check_numerics_ops([self.optim_op]))
  # Do this at the very end.
  incr_step_op = tf.assign_add(self.network.global_train_step, 1, name="global_train_step_increment")
  self.optim_op = tf.group(self.optim_op, incr_step_op, name="optim_and_step_incr")
  if self.config.bool("debug_save_updater_vars", False):
    print("Save updater/optimizer vars:", file=log.v3)
    print(self.optimizer_vars)
    for v in self.optimizer_vars:
      if v not in self.network.extra_vars_to_save:
        self.network.extra_vars_to_save.append(v)
    self.network.reset_saver()
def create_optim_op(self):
  """
  Creates the optimize TF op (variable updates plus global-train-step increment)
  and stores it in self.optim_op. Also collects the optimizer slot vars into
  self.optimizer_vars and creates self.optimizer_init_vars_op.
  """
  assert self.loss is not None
  assert self.trainable_vars, "no variables to update/optimize"
  from TFUtil import SyntheticGradient
  # Keep track of all current available vars.
  # The optimizer could add some, even some which are not so-called "slot-vars",
  # and we want to keep track about them.
  all_prev_existing_vars = tf.global_variables()  # type: list[tf.Variable]
  if not self.optimizer:
    self.create_optimizer()
  trainable_vars_for_gradients = list(self.trainable_vars)
  trainable_vars_custom_update = []  # type: list[tf.Variable]
  # Vars marked with a returnn_custom_update attribute bypass gradient-based
  # optimization; they get their own update op below.
  for v in self.trainable_vars:
    if hasattr(v, "returnn_custom_update"):
      trainable_vars_custom_update.append(v)
      trainable_vars_for_gradients.remove(v)
  with tf.variable_scope("optimize"):
    # Collect synthetic-gradient losses produced while building the main grads.
    synthetic_gradient_scope = SyntheticGradient.enter_gradient_scope()
    apply_grads = self._get_apply_grads_op(self.loss, trainable_vars_for_gradients)
    synthetic_gradient_scope.exit()
    self.optim_meta_losses = synthetic_gradient_scope.as_fetch_dict()
    if synthetic_gradient_scope.losses:
      # Apply a second optimization step for the summed meta loss.
      with tf.name_scope("meta_loss"):
        meta_loss = tf.add_n(synthetic_gradient_scope.losses)
        meta_apply_grads = self._get_apply_grads_op(meta_loss, trainable_vars_for_gradients)
      apply_grads = tf.group(apply_grads, meta_apply_grads)
    incr_step_op = tf.assign_add(self.network.global_train_step, 1, name="global_train_step_increment")
    self.optim_op = tf.group(apply_grads, incr_step_op, name="optim_and_step_incr")
  if trainable_vars_custom_update:
    with tf.variable_scope("custom_update"):
      updates = [self.optim_op]
      for param in trainable_vars_custom_update:
        custom_update = getattr(param, "returnn_custom_update")
        assert isinstance(custom_update, CustomUpdate)
        updates.append(custom_update.update_var(param))
      self.optim_op = tf.group(*updates)
  if self.constraints is not None:
    # Decoupled constraint optimization: plain SGD on the constraints,
    # scheduled after the main update via control_dependencies.
    with tf.variable_scope("optimize_constraints"):
      with tf.variable_scope("factor"):
        # Scale with the relative learning rate so the constraint step follows
        # the LR schedule.
        factor = (self.get_current_step_learning_rate() / float(self.initial_learning_rate))
        factor *= self.config.float("decouple_constraints_factor", 0.025)
      sgd_optimizer = tf.train.GradientDescentOptimizer(
        learning_rate=factor, use_locking=self.use_locking)
      with tf.control_dependencies([self.optim_op]):
        self.optim_op = sgd_optimizer.minimize(self.constraints, var_list=self.trainable_vars)
  if self.config.opt_typed_value("extra_updates"):
    # User-defined extra update ops per variable, run after the main update.
    extra_updates = self.config.typed_dict["extra_updates"]
    assert isinstance(extra_updates, dict)  # dict var_name -> function(var)
    # Strip the ":0" output suffix from the var names for lookup.
    vars_by_name = {v.name[:-2]: v for v in all_prev_existing_vars}
    extra_updates_op_list = []
    from Util import getargspec
    from TFUtil import get_var_update_ops, get_variable_grad_from_update_ops
    for var_name, func in extra_updates.items():
      # Inspect the user function signature to decide which kwargs to pass.
      func_arg_names = getargspec(func).args
      assert var_name in vars_by_name, "var with name %r not found. vars:\n%s" % (
        var_name, "\n".join(sorted(vars_by_name.keys())))
      var = vars_by_name[var_name]
      assert isinstance(var, tf.Variable)
      ops = get_var_update_ops(var, fetches=self.optim_op)
      with tf.control_dependencies(ops):
        func_kwargs = {"var": var}
        if "network" in func_arg_names:
          func_kwargs["network"] = self.network
        if "update_ops" in func_arg_names:
          func_kwargs["update_ops"] = ops
        if "grad" in func_arg_names:
          func_kwargs["grad"] = get_variable_grad_from_update_ops(var, ops)
        op = func(**func_kwargs)
        assert isinstance(op, (tf.Operation, tf.Tensor))
        extra_updates_op_list.append(op)
    self.optim_op = tf.group(self.optim_op, *extra_updates_op_list)
  # Collect the optimizer slot vars (e.g. Adam moments) for later init/saving.
  print("Initialize optimizer with slots %s." % self.optimizer.get_slot_names(), file=log.v3)
  slot_vars = []
  for slot_name in self.optimizer.get_slot_names():
    for v in trainable_vars_for_gradients:
      slot_var = self.optimizer.get_slot(var=v, name=slot_name)
      if slot_var is None:
        print("Warning: No slot_var found for variable %r, slot_name %r. Maybe no gradient for this var?" % (
          v, slot_name), file=log.v3)
      else:
        assert isinstance(slot_var, tf.Variable)
        slot_vars.append(slot_var)
  self.optimizer_vars = slot_vars
  # Check if there were any other variables added.
  # E.g. currently (TF 1.0) the `AdamOptimizer` creates these additional vars
  # `[<tf.Variable 'optimize/beta1_power:0' shape=() dtype=float32_ref>,
  #   <tf.Variable 'optimize/beta2_power:0' shape=() dtype=float32_ref>]`
  # which do not correspond to trainable vars, thus we did not get them as slot vars above.
  other_new_vars = []
  for v in tf.global_variables():
    if v in all_prev_existing_vars:
      continue
    if v in self.optimizer_vars:
      continue
    other_new_vars.append(v)
  if other_new_vars:
    print("These additional variable were created by the optimizer: %s." % other_new_vars, file=log.v3)
    self.optimizer_vars += other_new_vars
  with tf.name_scope("optimizer_init_vars"):
    self.optimizer_init_vars_op = tf.variables_initializer(self.optimizer_vars, name="init_optim_slot_vars")
  if self.config.bool("debug_grad_summaries", False):
    from TFUtil import variable_summaries, get_base_name, reuse_name_scope_of_tensor
    for key in self.network.used_data_keys:
      data = self.network.extern_data.data[key]
      if data.sparse:
        continue
      with reuse_name_scope_of_tensor(data.placeholder):
        variable_summaries(data.placeholder)
  if self.config.bool("debug_add_check_numerics_ops", False):  # also see debug_add_check_numerics_on_output
    print("Adding checks for inf/nan.", file=log.v3)
    self.optim_op = tf.group(self.optim_op, add_check_numerics_ops([self.optim_op]))
  if self.config.bool("debug_save_updater_vars", False):
    print("Save updater/optimizer vars:", file=log.v3)
    print(self.optimizer_vars)
    for v in self.optimizer_vars:
      if v not in self.network.extra_vars_to_save:
        self.network.extra_vars_to_save.append(v)
    self.network.reset_saver()
def _get_apply_grads_op(self, loss, trainable_vars_for_gradients):
  """
  :param tf.Tensor loss:
  :param list[tf.Variable] trainable_vars_for_gradients:
  :return: op with all variable updates combined, using the optimizer
  :rtype: tf.Operation
  """
  if not trainable_vars_for_gradients:
    return tf.no_op(name="no_grad_vars_no_op")
  # AccumulateN might not be deterministic but should be faster and should require less memory.
  # We might want to make this configurable.
  if self.config.is_true("deterministic_train"):
    aggregation_method = tf.AggregationMethod.ADD_N
  else:
    aggregation_method = tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
  accum_grad_multiple_num_steps = self.config.int("accum_grad_multiple_step", 0)
  grad_noise = self.config.float("gradient_noise", 0.0)
  grad_clip = self.config.float("gradient_clip", 0.0)
  grad_clip_norm = self.config.float("gradient_clip_norm", 0.0)
  grad_clip_avg_norm = self.config.float("gradient_clip_avg_norm", 0.0)
  grad_clip_global_norm = self.config.float("gradient_clip_global_norm", 0.0)
  # E.g. https://github.com/openai/baselines/blob/master/baselines/deepq/simple.py:
  # grad_norm_clipping=10 -> tf.clip_by_norm
  # Extended self.optimizer.minimize() to optionally modify gradients.
  grads_and_vars = self.optimizer.compute_gradients(
    loss, var_list=trainable_vars_for_gradients, aggregation_method=aggregation_method)
  if self.config.is_true("use_horovod") and self.config.value("horovod_reduce_type", "") == "grad":
    # Average/sum the gradients across all horovod workers; None grads stay None.
    import horovod.tensorflow as hvd
    grads_and_vars = [
      (hvd.allreduce(grad, average=self.config.is_true("horovod_avg_grad")) if grad is not None else None, var)
      for (grad, var) in grads_and_vars]
  # Vars without a path to the loss get grad=None from compute_gradients().
  var_grads = {var: grad for (grad, var) in grads_and_vars if grad is not None}
  if not var_grads:
    raise Exception("no single variable to train")
  # NOTE(review): grads_and_vars below may still contain (None, var) pairs;
  # the accum, summary, nan_to_num and clip transforms would fail on a None
  # gradient — confirm whether None grads can occur with the configs that
  # enable those options.
  if self.config.float("maximize_grad_norm", 0):
    # Meta objective: additionally maximize the gradient norm, scaled by f.
    f = self.config.float("maximize_grad_norm", 0)
    grad_norm = tf.add_n(
      [tf.nn.l2_loss(g) for g in var_grads.values()], name="grad_norm_half") * 2.0
    loss_ext = grad_norm * (-f)
    grads_and_vars_ext = self.optimizer.compute_gradients(
      loss_ext, var_list=list(var_grads.keys()), aggregation_method=aggregation_method)
    var_grads_ext = {var: grad for (grad, var) in grads_and_vars_ext if grad is not None}
    grads_and_vars = [(grad + var_grads_ext.get(var, 0.0), var) for (grad, var) in grads_and_vars]
  if accum_grad_multiple_num_steps >= 1:
    # Accumulate over N steps; actually applied only every Nth step (see the
    # matching tf.cond at the bottom).
    grads_and_vars = [
      (accum_grad_multiple_step(
        grad, var, train_step=self.network.global_train_step, num_accum_steps=accum_grad_multiple_num_steps),
       var)
      for (grad, var) in grads_and_vars]
  if self.config.bool("debug_grad_summaries", False):
    from TFUtil import variable_summaries, get_base_name, reuse_name_scope_of_tensor
    for grad, var in grads_and_vars:
      with reuse_name_scope_of_tensor(grad, prefix="grads/"):
        variable_summaries(grad, name="grad_of_%s" % get_base_name(var))
      with reuse_name_scope_of_tensor(var, prefix="vars/"):
        variable_summaries(var, name=get_base_name(var))
  # Also see tf.contrib.layers.optimizers.optimize_loss() for reference.
  if self.config.bool("gradient_nan_inf_filter", False):
    from TFUtil import nan_to_num
    grads_and_vars = [(nan_to_num(grad, nan_num=0.0, inf_num=0.0), var)
                      for (grad, var) in grads_and_vars]
  if grad_noise:
    assert grad_noise > 0
    from TFUtil import add_scaled_noise_to_gradients
    with tf.name_scope("grad_noise"):
      grads_and_vars = add_scaled_noise_to_gradients(grads_and_vars, grad_noise)
  if grad_clip:
    assert grad_clip > 0
    with tf.name_scope("grad_clip"):
      grads_and_vars = [(tf.clip_by_value(grad, -grad_clip, grad_clip), var)
                        for grad, var in grads_and_vars]
  if grad_clip_norm:
    assert grad_clip_norm > 0
    with tf.name_scope("grad_clip_norm"):
      grads_and_vars = [(tf.clip_by_norm(grad, grad_clip_norm), var)
                        for grad, var in grads_and_vars]
  if grad_clip_avg_norm:
    assert grad_clip_avg_norm > 0
    with tf.name_scope("grad_clip_avg_norm"):
      grads_and_vars = [
        (tf.clip_by_average_norm(grad, grad_clip_avg_norm), var)
        for grad, var in grads_and_vars]
  if grad_clip_global_norm:
    assert grad_clip_global_norm > 0
    with tf.name_scope("grad_clip_global_norm"):
      grads_clipped, _ = tf.clip_by_global_norm(
        [grad for (grad, _) in grads_and_vars], grad_clip_global_norm)
      # NOTE(review): on Python 3 this zip is a single-use iterator; it is
      # consumed exactly once by apply_gradients() below, but would silently
      # yield nothing on a second pass.
      grads_and_vars = zip(grads_clipped, [var for (_, var) in grads_and_vars])
  if accum_grad_multiple_num_steps >= 1:
    # Apply the accumulated grads only on the last step of each accumulation cycle.
    apply_grads = tf.cond(
      tf.equal(
        tf.mod(self.network.global_train_step, accum_grad_multiple_num_steps),
        accum_grad_multiple_num_steps - 1),
      true_fn=lambda: self.optimizer.apply_gradients(grads_and_vars),
      false_fn=lambda: tf.no_op(),
      name="apply_grads/accum_grad_multiple_step")
  else:
    apply_grads = self.optimizer.apply_gradients(grads_and_vars)
  return apply_grads
def create_optim_op(self):
  """
  Creates the optimize TF op (variable updates plus global-train-step increment)
  and stores it in self.optim_op. Also collects the optimizer slot vars into
  self.optimizer_vars, creates self.optimizer_init_vars_op and runs
  self.init_optimizer_vars().
  """
  assert self.loss is not None
  assert self.trainable_vars, "no variables to update/optimize"
  from TFUtil import SyntheticGradient
  # Keep track of all current available vars.
  # The optimizer could add some, even some which are not so-called "slot-vars",
  # and we want to keep track about them.
  all_vars = tf.global_variables()  # type: list[tf.Variable]
  if not self.optimizer:
    self.create_optimizer()
  trainable_vars_for_gradients = list(self.trainable_vars)
  trainable_vars_custom_update = []  # type: list[tf.Variable]
  # Vars with a custom_update attribute bypass gradient-based optimization;
  # they get their own update op below.
  for v in self.trainable_vars:
    if hasattr(v, "custom_update"):
      trainable_vars_custom_update.append(v)
      trainable_vars_for_gradients.remove(v)
  with tf.variable_scope("optimize"):
    # Collect synthetic-gradient losses produced while building the main grads.
    synthetic_gradient_scope = SyntheticGradient.enter_gradient_scope()
    apply_grads = self._get_apply_grads_op(self.loss, trainable_vars_for_gradients)
    synthetic_gradient_scope.exit()
    self.optim_meta_losses = synthetic_gradient_scope.as_fetch_dict()
    if synthetic_gradient_scope.losses:
      # Apply a second optimization step for the summed meta loss.
      with tf.name_scope("meta_loss"):
        meta_loss = tf.add_n(synthetic_gradient_scope.losses)
        meta_apply_grads = self._get_apply_grads_op(meta_loss, trainable_vars_for_gradients)
      apply_grads = tf.group(apply_grads, meta_apply_grads)
    incr_step_op = tf.assign_add(self.network.global_train_step, 1, name="global_train_step_increment")
    self.optim_op = tf.group(apply_grads, incr_step_op, name="optim_and_step_incr")
  if trainable_vars_custom_update:
    with tf.variable_scope("custom_update"):
      updates = [self.optim_op]
      for param in trainable_vars_custom_update:
        custom_update = getattr(param, "custom_update")
        assert isinstance(custom_update, CustomUpdate)
        updates.append(custom_update.update_var(param))
      self.optim_op = tf.group(*updates)
  # Collect the optimizer slot vars (e.g. Adam moments) for later init/saving.
  print("Initialize optimizer with slots %s." % self.optimizer.get_slot_names(), file=log.v3)
  slot_vars = []
  for slot_name in self.optimizer.get_slot_names():
    for v in trainable_vars_for_gradients:
      slot_var = self.optimizer.get_slot(var=v, name=slot_name)
      # NOTE(review): this variant asserts every var has a slot, unlike the
      # other create_optim_op variant which only warns — this fails if some
      # var got no gradient; confirm that is intended here.
      assert slot_var is not None
      assert isinstance(slot_var, tf.Variable)
      slot_vars.append(slot_var)
  self.optimizer_vars = slot_vars
  # Check if there were any other variables added.
  # E.g. currently (TF 1.0) the `AdamOptimizer` creates these additional vars
  # `[<tf.Variable 'optimize/beta1_power:0' shape=() dtype=float32_ref>,
  #   <tf.Variable 'optimize/beta2_power:0' shape=() dtype=float32_ref>]`
  # which do not correspond to trainable vars, thus we did not get them as slot vars above.
  other_new_vars = []
  for v in tf.global_variables():
    if v in all_vars:
      continue
    if v in self.optimizer_vars:
      continue
    other_new_vars.append(v)
  if other_new_vars:
    print("These additional variable were created by the optimizer: %s." % other_new_vars, file=log.v3)
    self.optimizer_vars += other_new_vars
  with tf.name_scope("optimizer_init_vars"):
    self.optimizer_init_vars_op = tf.variables_initializer(self.optimizer_vars, name="init_optim_slot_vars")
  self.init_optimizer_vars()
  if self.config.bool("debug_grad_summaries", False):
    from TFUtil import variable_summaries, get_base_name, reuse_name_scope_of_tensor
    for key in self.network.used_data_keys:
      data = self.network.extern_data.data[key]
      if data.sparse:
        continue
      with reuse_name_scope_of_tensor(data.placeholder):
        variable_summaries(data.placeholder)
  if self.config.bool("debug_add_check_numerics_ops", False):
    print("Adding checks for inf/nan.", file=log.v3)
    self.optim_op = tf.group(self.optim_op, add_check_numerics_ops([self.optim_op]))
  if self.config.bool("debug_save_updater_vars", False):
    print("Save updater/optimizer vars:", file=log.v3)
    print(self.optimizer_vars)
    for v in self.optimizer_vars:
      if v not in self.network.extra_vars_to_save:
        self.network.extra_vars_to_save.append(v)
    self.network.reset_saver()