def map_layer_inputs_to_op(self, X, W, b, i, initial_state=None):
  """
  Just like NativeOp.LstmGenericBase.map_layer_inputs_to_op().

  :param tf.Tensor X: inputs: shape (time,batch,n_input_dim)
  :param tf.Tensor W: shape (n_input_dim+n_hidden,n_hidden*4)
  :param tf.Tensor b: shape (n_hidden*4,)
  :param tf.Tensor i: index: shape (time,batch)
  :param tf.Tensor|rnn_cell.LSTMStateTuple|None initial_state:
    either the cell state c of shape (batch,n_hidden), or a full (c,h) state tuple
  :rtype: tuple[tf.Tensor]
  """
  from tensorflow.python.ops.nn import rnn_cell
  X.set_shape(tf.TensorShape([None, None, self.n_input_dim]))
  W.set_shape(tf.TensorShape([self.n_input_dim + self.n_hidden, self.n_hidden * 4]))
  i.set_shape(tf.TensorShape([None, None]))
  if i.dtype != tf.float32:
    if not hasattr(i, "cast_float32"):
      from TFUtil import reuse_name_scope_of_tensor
      with reuse_name_scope_of_tensor(i):
        i_cast_float32 = tf.cast(i, dtype=tf.float32, name="index_cast_float32")
      # Cache the cast op on the tensor object so repeated calls reuse it.
      i.cast_float32 = i_cast_float32
    i = i.cast_float32
  n_batch = tf.shape(X)[1]
  if initial_state is None:
    c0 = tf.zeros((n_batch, self.n_hidden), dtype=tf.float32, name="initial_c")
    y0 = tf.zeros((n_batch, self.n_hidden), dtype=tf.float32, name="initial_h")
  elif isinstance(initial_state, rnn_cell.LSTMStateTuple):
    # Generalization: also accept a full LSTM state tuple,
    # consistent with the other map_layer_inputs_to_op variants in this file.
    c0 = initial_state.c
    y0 = initial_state.h
  else:
    c0 = initial_state
    # We could make `h` a variable exactly if `c` is a trainable variable.
    y0 = tf.zeros((n_batch, self.n_hidden), dtype=tf.float32, name="initial_h")
  start = tf.constant(0, name="start")
  step = tf.constant(self.step or 1, name="step")
  return X, W, b, y0, c0, i, start, step
Example #2
0
 def map_layer_inputs_to_op(cls, Z, V_h, i, initial_state=None):
     """
     Just like NativeOp.LstmGenericBase.map_layer_inputs_to_op().

     :param tf.Tensor Z: inputs: shape (time,batch,n_hidden*4)
     :param tf.Tensor V_h: W_re: shape (n_hidden,n_hidden*4)
     :param tf.Tensor i: index: shape (time,batch)
     :param tf.Tensor|None initial_state: shape (batch,n_hidden)
     :rtype: (tf.Tensor,tf.Tensor,tf.Tensor,tf.Tensor)
     """
     # Sanity-check the ranks of all inputs.
     assert Z.get_shape().ndims == 3
     assert V_h.get_shape().ndims == 2
     assert i.get_shape().ndims == 2
     if i.dtype != tf.float32:
         # The op wants a float32 index. Cache the cast on the tensor object
         # so that repeated calls reuse the same cast op.
         if not hasattr(i, "cast_float32"):
             from TFUtil import reuse_name_scope_of_tensor
             with reuse_name_scope_of_tensor(i):
                 idx_f32 = tf.cast(i, dtype=tf.float32, name="index_cast_float32")
             i.cast_float32 = idx_f32
         i = i.cast_float32
     batch_dim = tf.shape(Z)[1]
     hidden_dim = tf.shape(V_h)[0]
     if initial_state is None:
         c = tf.zeros((batch_dim, hidden_dim), dtype=tf.float32)
     else:
         from tensorflow.python.ops.nn import rnn_cell
         state = initial_state
         if isinstance(state, rnn_cell.LSTMStateTuple):
             # Only the cell state is used here.
             state = state.c
         c = state
     return Z, V_h, c, i
Example #3
0
  def _get_apply_grads_op(self, loss, trainable_vars_for_gradients):
    """
    :param tf.Tensor loss:
    :param list[tf.Variable] trainable_vars_for_gradients:
    :return: op with all variable updates combined, using the optimizer
    :rtype: tf.Operation
    """
    if not trainable_vars_for_gradients:
      return tf.no_op(name="no_grad_vars_no_op")
    # AccumulateN might not be deterministic but should be faster and should require less memory.
    # We might want to make this configurable.
    aggregation_method = tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
    grad_noise = self.config.float("gradient_noise", 0.0)
    grad_clip = self.config.float("gradient_clip", 0.0)
    grad_clip_global_norm = self.config.float("gradient_clip_global_norm", 0.0)
    # E.g. https://github.com/openai/baselines/blob/master/baselines/deepq/simple.py: grad_norm_clipping=10 -> tf.clip_by_norm

    # Extended self.optimizer.minimize() to optionally modify gradients.
    grads_and_vars = self.optimizer.compute_gradients(
      loss, var_list=trainable_vars_for_gradients,
      aggregation_method=aggregation_method)
    # Fix: drop vars without gradient right away. The grad-transform passes below
    # (nan_to_num, clip_by_value, clip_by_global_norm, grad summaries) would crash
    # on grad=None, and tf.train.Optimizer.apply_gradients skips None grads anyway.
    grads_and_vars = [(g, v) for (g, v) in grads_and_vars if g is not None]
    if not grads_and_vars:
      raise Exception("no single variable to train")
    if self.config.bool("debug_grad_summaries", False):
      from TFUtil import variable_summaries, get_base_name, reuse_name_scope_of_tensor
      for grad, var in grads_and_vars:
        with reuse_name_scope_of_tensor(grad, prefix="grads/"):
          variable_summaries(grad, name="grad_of_%s" % get_base_name(var))
        with reuse_name_scope_of_tensor(var, prefix="vars/"):
          variable_summaries(var, name=get_base_name(var))
    # Also see tf.contrib.layers.optimizers.optimize_loss() for reference.
    if self.config.bool("gradient_nan_inf_filter", False):
      from TFUtil import nan_to_num
      grads_and_vars = [(nan_to_num(grad, nan_num=0.0, inf_num=0.0), var) for (grad, var) in grads_and_vars]
    if grad_noise:
      assert grad_noise > 0
      from TFUtil import add_scaled_noise_to_gradients
      grads_and_vars = add_scaled_noise_to_gradients(grads_and_vars, grad_noise)
    if grad_clip:
      assert grad_clip > 0
      grads_and_vars = [(tf.clip_by_value(grad, -grad_clip, grad_clip), var) for grad, var in grads_and_vars]
    if grad_clip_global_norm:
      assert grad_clip_global_norm > 0
      grads_clipped, _ = tf.clip_by_global_norm([grad for (grad, _) in grads_and_vars], grad_clip_global_norm)
      # Fix: materialize the zip. In Python 3, a lazy zip iterator would be
      # empty on any second traversal.
      grads_and_vars = list(zip(grads_clipped, [var for (_, var) in grads_and_vars]))
    apply_grads = self.optimizer.apply_gradients(grads_and_vars)
    return apply_grads
Example #4
0
def accum_grad_multiple_step(grad, var, train_step, num_accum_steps):
  """
  Accumulates the gradient over multiple train steps in a non-trainable buffer variable.

  :param tf.Tensor|tf.IndexedSlices grad:
  :param tf.Variable var:
  :param tf.Tensor train_step: int, scalar
  :param int num_accum_steps:
  :return: modified grad
  :rtype: tf.Tensor
  """
  from TFUtil import reuse_name_scope_of_tensor, get_base_name
  scope_postfix = "/%s_accum_grad" % get_base_name(grad)
  with reuse_name_scope_of_tensor(grad, postfix=scope_postfix):
    accum_var = tf.get_variable(
      name="var_accum_grad",
      shape=var.get_shape().as_list(),
      dtype=grad.dtype,
      initializer=tf.zeros_initializer(),
      trainable=False)

    def _restart_round():
      # First step of an accumulation round: overwrite with the fresh grad.
      return tf.assign(accum_var, grad)

    def _keep_summing():
      return tf.assign_add(accum_var, grad)

    at_round_start = tf.less_equal(tf.mod(train_step, num_accum_steps), 0)
    return tf.cond(at_round_start, _restart_round, _keep_summing)
Example #5
0
 def map_layer_inputs_to_op(self, X, W, i, initial_state=None):
     """
     Just like NativeOp.LstmGenericBase.map_layer_inputs_to_op().

     :param tf.Tensor X: inputs: shape (time,batch,n_input_dim)
     :param tf.Tensor W: shape (n_input_dim+n_hidden,n_hidden*4)
     :param tf.Tensor i: index: shape (time,batch)
     :param tf.Tensor|None initial_state: shape (batch,n_hidden)
     :rtype: tuple[tf.Tensor]
     """
     from tensorflow.python.ops.nn import rnn_cell
     X.set_shape(tf.TensorShape([None, None, self.n_hidden * 4]))
     W.set_shape(tf.TensorShape([self.n_hidden, self.n_hidden * 4]))
     i.set_shape(tf.TensorShape([None, None]))
     if i.dtype != tf.float32:
         # Cache the float32 cast on the tensor object so repeated calls
         # reuse the same cast op.
         if not hasattr(i, "cast_float32"):
             from TFUtil import reuse_name_scope_of_tensor
             with reuse_name_scope_of_tensor(i):
                 idx_f32 = tf.cast(i, dtype=tf.float32, name="index_cast_float32")
             i.cast_float32 = idx_f32
         i = i.cast_float32
     n_batch = tf.shape(X)[1]

     def _zero_state(name):
         # Default zero state of shape (batch, n_hidden).
         return tf.zeros((n_batch, self.n_hidden), dtype=tf.float32, name=name)

     if initial_state is None:
         c0 = _zero_state("initial_c")
         y0 = _zero_state("initial_h")
     elif isinstance(initial_state, rnn_cell.LSTMStateTuple):
         c0, y0 = initial_state.c, initial_state.h
     else:
         c0 = initial_state
         y0 = _zero_state("initial_h")
     start = tf.constant(0, name="start")
     step = tf.constant(self.step or 1, name="step")
     return X, W, y0, c0, i, start, step
Example #6
0
def accum_grad_multiple_step(grad, var, train_step, num_accum_steps):
    """
    :param tf.Tensor|tf.IndexedSlices grad:
    :param tf.Variable var:
    :param tf.Tensor train_step: int, scalar
    :param int num_accum_steps:
    :return: modified grad
    :rtype: tf.Tensor
    """
    from TFUtil import reuse_name_scope_of_tensor, get_base_name
    with reuse_name_scope_of_tensor(grad, postfix="/%s_accum_grad" % get_base_name(grad)):
        # Non-trainable buffer which carries the gradient sum across steps.
        acc = tf.get_variable(
            name="var_accum_grad", shape=var.get_shape().as_list(), dtype=grad.dtype,
            initializer=tf.zeros_initializer(), trainable=False)
        # Either start a new accumulation round (overwrite) or keep summing up.
        round_start = tf.less_equal(tf.mod(train_step, num_accum_steps), 0)
        return tf.cond(
            round_start,
            lambda: tf.assign(acc, grad),
            lambda: tf.assign_add(acc, grad))
Example #7
0
  def _post_process_grad(self, grad, var, global_info):
    """
    Post-processes the gradient of a single variable: optional accumulation,
    noise, several clipping variants, and nan/inf filtering.

    :param tf.Tensor grad:
    :param tf.Variable var:
    :param WrapOptimizer._GetGlobalInfo global_info:
    :return: new grad, apply grad opts
    :rtype: tf.Tensor, dict[str]
    """
    updater_opts = self._get_updater_opts_from_var(var)

    # Each option can be set per-variable (updater_opts) and falls back
    # to the corresponding global config value.
    accum_grad_multiple_num_steps = updater_opts.get(
      "accum_grad_multiple_step", self.config.int("accum_grad_multiple_step", 0))
    grad_noise = updater_opts.get("gradient_noise", self.config.float("gradient_noise", 0.0))
    grad_clip = updater_opts.get("gradient_clip", self.config.float("gradient_clip", 0.0))
    # E.g. https://github.com/openai/baselines/blob/master/baselines/deepq/simple.py:
    #   grad_norm_clipping=10 -> tf.clip_by_norm
    grad_clip_norm = updater_opts.get("gradient_clip_norm", self.config.float("gradient_clip_norm", 0.0))
    grad_clip_avg_norm = updater_opts.get("gradient_clip_avg_norm", self.config.float("gradient_clip_avg_norm", 0.0))
    grad_clip_global_norm = updater_opts.get(
      "gradient_clip_global_norm", self.config.float("gradient_clip_global_norm", 0.0))
    global_norm_tag = updater_opts.get(
      "global_norm_tag", self.config.value("global_norm_tag", None))
    grad_clip_global_norm_tag = updater_opts.get(
      "gradient_clip_global_norm_tag", self.config.value("gradient_clip_global_norm_tag", global_norm_tag))
    grad_norm_to_clip_to_zero = updater_opts.get(
      "grad_norm_to_clip_to_zero", self.config.float("grad_norm_to_clip_to_zero", 0.0))
    maximize_grad_norm = updater_opts.get("maximize_grad_norm", self.config.float("maximize_grad_norm", 0))

    if maximize_grad_norm:
      # Extra gradient term from the maximize-grad-norm objective (helper defined elsewhere).
      grad_ext = global_info.get_maximize_grad_norm_grad(maximize_grad_norm, var)
      if grad_ext is not None:
        grad += grad_ext

    if accum_grad_multiple_num_steps >= 1:
      # Replace the grad by an accumulating variable over multiple train steps.
      grad = accum_grad_multiple_step(
        grad, var, train_step=self.global_train_step, num_accum_steps=accum_grad_multiple_num_steps)

    if updater_opts.get("debug_grad_summaries", self.config.bool_or_other("debug_grad_summaries", False)):
      from TFUtil import variable_summaries, get_base_name, reuse_name_scope_of_tensor
      with reuse_name_scope_of_tensor(grad, prefix="grads/"):
        variable_summaries(grad, name="grad_of_%s" % get_base_name(var))
      with reuse_name_scope_of_tensor(var, prefix="vars/"):
        variable_summaries(var, name=get_base_name(var))

    # Also see tf.contrib.layers.optimizers.optimize_loss() for reference.
    if grad_noise:
      assert grad_noise > 0
      from TFUtil import add_scaled_noise_to_gradients
      with tf.name_scope("grad_noise"):
        (grad, var), = add_scaled_noise_to_gradients([(grad, var)], grad_noise)
    if grad_clip:
      assert grad_clip > 0
      with tf.name_scope("grad_clip"):
        grad = tf.clip_by_value(grad, -grad_clip, grad_clip)
    if grad_clip_norm:
      assert grad_clip_norm > 0
      with tf.name_scope("grad_clip_norm"):
        grad = tf.clip_by_norm(grad, grad_clip_norm)
    if grad_clip_avg_norm:
      assert grad_clip_avg_norm > 0
      with tf.name_scope("grad_clip_avg_norm"):
        grad = tf.clip_by_average_norm(grad, grad_clip_avg_norm)
    if grad_clip_global_norm:
      assert grad_clip_global_norm > 0
      with tf.name_scope("grad_clip_global_norm"):
        grad = global_info.clip_by_global_norm(
          grad, clip_norm=grad_clip_global_norm, global_norm_tag=grad_clip_global_norm_tag)
    if updater_opts.get("gradient_nan_inf_filter", self.config.bool("gradient_nan_inf_filter", False)):
      from TFUtil import nan_to_num
      grad = nan_to_num(grad, nan_num=0.0, inf_num=0.0)
    if grad_norm_to_clip_to_zero:
      with tf.name_scope("grad_norm_to_clip_to_zero"):
        grad = global_info.set_zero_on_high_global_norm(
          grad, grad_norm_threshold=grad_norm_to_clip_to_zero, global_norm_tag=global_norm_tag)

    # NOTE(review): presumably raises if some per-var option was set but never
    # read above (catches typos); defined elsewhere — TODO confirm.
    updater_opts.assert_all_read()

    opt_key, _ = self._get_optimizer_item_for_variable(var)
    apply_grad_opts = {
      "opt_key": opt_key, "accum_grad_multiple_num_steps": accum_grad_multiple_num_steps}
    return grad, apply_grad_opts
Example #8
0
  def create_optim_op(self):
    """
    Creates the optimize TF op.

    :return: nothing, will just set self.optim_op
    """
    assert self.loss is not None
    assert self.trainable_vars, "no variables to update/optimize"
    from TFUtil import MetaLosses

    # Keep track of all current available vars.
    # The optimizer could add some, even some which are not so-called "slot-vars",
    # and we want to keep track about them.
    all_prev_existing_vars = tf.global_variables()  # type: typing.List[tf.Variable]

    # Vars with a custom update rule are handled separately below, not via gradients.
    trainable_vars_for_gradients = list(self.trainable_vars)
    trainable_vars_custom_update = []  # type: typing.List[tf.Variable]
    for v in self.trainable_vars:
      if hasattr(v, "returnn_custom_update"):
        trainable_vars_custom_update.append(v)
        trainable_vars_for_gradients.remove(v)

    if not self.optimizer:
      self.optimizer = WrapOptimizer(
        config=self.config,
        learning_rate=self.get_current_step_learning_rate(),
        global_train_step=self.network.global_train_step,
        use_locking=self.use_locking)
      self.optimizer.create_all_needed_optimizers(trainable_vars_for_gradients)

    with tf.variable_scope("optimize"):
      # Collect meta losses (e.g. synthetic gradients) which occur while
      # computing the main gradients, and optimize them as well.
      meta_losses_scope = MetaLosses.enter_gradient_scope()
      apply_grads = self.optimizer.get_apply_grads_op(self.loss, trainable_vars_for_gradients)
      meta_losses_scope.exit()
      self.optim_meta_losses_dict = meta_losses_scope.losses_as_fetch_dict()
      if meta_losses_scope.losses:
        with tf.name_scope("meta_loss"):
          meta_loss = meta_losses_scope.summed_loss_for_optimization()
          meta_apply_grads = self.optimizer.get_apply_grads_op(meta_loss, trainable_vars_for_gradients)
        apply_grads = tf.group(apply_grads, meta_apply_grads)
      self.optim_op = apply_grads

    if trainable_vars_custom_update:
      with tf.variable_scope("custom_update"):
        updates = [self.optim_op]
        for param in trainable_vars_custom_update:
          custom_update = getattr(param, "returnn_custom_update")
          assert isinstance(custom_update, CustomUpdate)
          updates.append(custom_update.update_var(param))
        self.optim_op = tf.group(*updates)

    if self.constraints is not None:
      # Decoupled constraint optimization with plain SGD, scaled relative to the current LR.
      with tf.variable_scope("optimize_constraints"):
        with tf.variable_scope("factor"):
          factor = (self.get_current_step_learning_rate() / float(self.initial_learning_rate))
          factor *= self.config.float("decouple_constraints_factor", 0.025)
        sgd_optimizer = tf.train.GradientDescentOptimizer(
          learning_rate=factor, use_locking=self.use_locking)
        with tf.control_dependencies([self.optim_op]):
          self.optim_op = sgd_optimizer.minimize(self.constraints, var_list=self.trainable_vars)

    if self.config.opt_typed_value("extra_updates"):
      # User-provided extra update functions, run after the var update ops.
      extra_updates = self.config.typed_dict["extra_updates"]
      assert isinstance(extra_updates, dict)  # dict var_name -> function(var)
      vars_by_name = {v.name[:-2]: v for v in all_prev_existing_vars}
      extra_updates_op_list = []
      from Util import getargspec
      from TFUtil import get_var_update_ops, get_variable_grad_from_update_ops
      for var_name, func in extra_updates.items():
        func_arg_names = getargspec(func).args
        assert var_name in vars_by_name, "var with name %r not found. vars:\n%s" % (
          var_name, "\n".join(sorted(vars_by_name.keys())))
        var = vars_by_name[var_name]
        assert isinstance(var, tf.Variable)
        ops = get_var_update_ops(var, fetches=self.optim_op)
        with tf.control_dependencies(ops):
          # Only pass the kwargs which the user function accepts.
          func_kwargs = {"var": var}
          if "network" in func_arg_names:
            func_kwargs["network"] = self.network
          if "update_ops" in func_arg_names:
            func_kwargs["update_ops"] = ops
          if "grad" in func_arg_names:
            func_kwargs["grad"] = get_variable_grad_from_update_ops(var, ops)
          op = func(**func_kwargs)
          assert isinstance(op, (tf.Operation, tf.Tensor))
          extra_updates_op_list.append(op)
        # NOTE(review): this re-groups the growing op list on every loop
        # iteration; looks redundant but harmless — TODO confirm intended.
        self.optim_op = tf.group(self.optim_op, *extra_updates_op_list)

    slot_names_per_optimizer = self.optimizer.get_slot_names_per_optimizer()
    slot_vars = []
    for opt_key, slot_names in slot_names_per_optimizer.items():
      print("Initialize optimizer (%s) with slots %s." % (opt_key or "default", slot_names), file=log.v3)
      for slot_name in slot_names:
        for v in self.optimizer.filter_var_list_per_optimizer_key(trainable_vars_for_gradients, opt_key=opt_key):
          slot_var = self.optimizer.get_slot(var=v, name=slot_name)
          if slot_var is None:
            print("Warning: No slot_var found for variable %r, slot_name %r. Maybe no gradient for this var?" % (
              v, slot_name), file=log.v3)
          else:
            assert isinstance(slot_var, tf.Variable)
            slot_vars.append(slot_var)
    self.optimizer_vars = slot_vars

    # Check if there were any other variables added.
    # E.g. currently (TF 1.0) the `AdamOptimizer` creates these additional vars
    # `[<tf.Variable 'optimize/beta1_power:0' shape=() dtype=float32_ref>,
    #   <tf.Variable 'optimize/beta2_power:0' shape=() dtype=float32_ref>]`
    # which do not correspond to trainable vars, thus we did not get them as slot vars above.
    other_new_vars = []
    for v in tf.global_variables():
      if v in all_prev_existing_vars:
        continue
      if v in self.optimizer_vars:
        continue
      other_new_vars.append(v)
    if other_new_vars:
      print("These additional variable were created by the optimizer: %s." % other_new_vars, file=log.v3)
      self.optimizer_vars += other_new_vars
    with tf.name_scope("optimizer_init_vars"):
      self.optimizer_init_vars_op = tf.variables_initializer(self.optimizer_vars, name="init_optim_slot_vars")

    if self.config.bool_or_other("debug_grad_summaries", False):
      from TFUtil import variable_summaries, get_base_name, reuse_name_scope_of_tensor
      for key in self.network.used_data_keys:
        data = self.network.extern_data.data[key]
        if data.sparse:
          continue
        with reuse_name_scope_of_tensor(data.placeholder):
          variable_summaries(data.placeholder)

    if self.config.bool("debug_add_check_numerics_ops", False):  # also see debug_add_check_numerics_on_output
      print("Adding checks for inf/nan.", file=log.v3)
      self.optim_op = tf.group(self.optim_op, add_check_numerics_ops([self.optim_op]))

    # Do this at the very end.
    incr_step_op = tf.assign_add(self.network.global_train_step, 1, name="global_train_step_increment")
    self.optim_op = tf.group(self.optim_op, incr_step_op, name="optim_and_step_incr")

    if self.config.bool("debug_save_updater_vars", False):
      print("Save updater/optimizer vars:", file=log.v3)
      print(self.optimizer_vars)
      for v in self.optimizer_vars:
        if v not in self.network.extra_vars_to_save:
          self.network.extra_vars_to_save.append(v)
      self.network.reset_saver()
Example #9
0
    def create_optim_op(self):
        """
        Creates the optimize TF op; will just set self.optim_op.
        """
        assert self.loss is not None
        assert self.trainable_vars, "no variables to update/optimize"
        from TFUtil import SyntheticGradient

        # Keep track of all current available vars.
        # The optimizer could add some, even some which are not so-called "slot-vars",
        # and we want to keep track about them.
        all_prev_existing_vars = tf.global_variables(
        )  # type: list[tf.Variable]

        if not self.optimizer:
            self.create_optimizer()

        # Vars with a custom update rule are handled separately below, not via gradients.
        trainable_vars_for_gradients = list(self.trainable_vars)
        trainable_vars_custom_update = []  # type: list[tf.Variable]
        for v in self.trainable_vars:
            if hasattr(v, "returnn_custom_update"):
                trainable_vars_custom_update.append(v)
                trainable_vars_for_gradients.remove(v)

        with tf.variable_scope("optimize"):
            # Collect synthetic-gradient losses which occur while computing
            # the main gradients, and optimize them as well.
            synthetic_gradient_scope = SyntheticGradient.enter_gradient_scope()
            apply_grads = self._get_apply_grads_op(
                self.loss, trainable_vars_for_gradients)
            synthetic_gradient_scope.exit()
            self.optim_meta_losses = synthetic_gradient_scope.as_fetch_dict()
            if synthetic_gradient_scope.losses:
                with tf.name_scope("meta_loss"):
                    meta_loss = tf.add_n(synthetic_gradient_scope.losses)
                    meta_apply_grads = self._get_apply_grads_op(
                        meta_loss, trainable_vars_for_gradients)
                apply_grads = tf.group(apply_grads, meta_apply_grads)
            incr_step_op = tf.assign_add(self.network.global_train_step,
                                         1,
                                         name="global_train_step_increment")
            self.optim_op = tf.group(apply_grads,
                                     incr_step_op,
                                     name="optim_and_step_incr")

        if trainable_vars_custom_update:
            with tf.variable_scope("custom_update"):
                updates = [self.optim_op]
                for param in trainable_vars_custom_update:
                    custom_update = getattr(param, "returnn_custom_update")
                    assert isinstance(custom_update, CustomUpdate)
                    updates.append(custom_update.update_var(param))
                self.optim_op = tf.group(*updates)

        if self.constraints is not None:
            # Decoupled constraint optimization with plain SGD, scaled relative to the current LR.
            with tf.variable_scope("optimize_constraints"):
                with tf.variable_scope("factor"):
                    factor = (self.get_current_step_learning_rate() /
                              float(self.initial_learning_rate))
                    factor *= self.config.float("decouple_constraints_factor",
                                                0.025)
                sgd_optimizer = tf.train.GradientDescentOptimizer(
                    learning_rate=factor, use_locking=self.use_locking)
                with tf.control_dependencies([self.optim_op]):
                    self.optim_op = sgd_optimizer.minimize(
                        self.constraints, var_list=self.trainable_vars)

        if self.config.opt_typed_value("extra_updates"):
            # User-provided extra update functions, run after the var update ops.
            extra_updates = self.config.typed_dict["extra_updates"]
            assert isinstance(extra_updates,
                              dict)  # dict var_name -> function(var)
            vars_by_name = {v.name[:-2]: v for v in all_prev_existing_vars}
            extra_updates_op_list = []
            from Util import getargspec
            from TFUtil import get_var_update_ops, get_variable_grad_from_update_ops
            for var_name, func in extra_updates.items():
                func_arg_names = getargspec(func).args
                assert var_name in vars_by_name, "var with name %r not found. vars:\n%s" % (
                    var_name, "\n".join(sorted(vars_by_name.keys())))
                var = vars_by_name[var_name]
                assert isinstance(var, tf.Variable)
                ops = get_var_update_ops(var, fetches=self.optim_op)
                with tf.control_dependencies(ops):
                    # Only pass the kwargs which the user function accepts.
                    func_kwargs = {"var": var}
                    if "network" in func_arg_names:
                        func_kwargs["network"] = self.network
                    if "update_ops" in func_arg_names:
                        func_kwargs["update_ops"] = ops
                    if "grad" in func_arg_names:
                        func_kwargs[
                            "grad"] = get_variable_grad_from_update_ops(
                                var, ops)
                    op = func(**func_kwargs)
                    assert isinstance(op, (tf.Operation, tf.Tensor))
                    extra_updates_op_list.append(op)
                # NOTE(review): this re-groups the growing op list on every loop
                # iteration; looks redundant but harmless — TODO confirm intended.
                self.optim_op = tf.group(self.optim_op, *extra_updates_op_list)

        print("Initialize optimizer with slots %s." %
              self.optimizer.get_slot_names(),
              file=log.v3)
        slot_vars = []
        for slot_name in self.optimizer.get_slot_names():
            for v in trainable_vars_for_gradients:
                slot_var = self.optimizer.get_slot(var=v, name=slot_name)
                if slot_var is None:
                    print(
                        "Warning: No slot_var found for variable %r, slot_name %r. Maybe no gradient for this var?"
                        % (v, slot_name),
                        file=log.v3)
                else:
                    assert isinstance(slot_var, tf.Variable)
                    slot_vars.append(slot_var)
        self.optimizer_vars = slot_vars

        # Check if there were any other variables added.
        # E.g. currently (TF 1.0) the `AdamOptimizer` creates these additional vars
        # `[<tf.Variable 'optimize/beta1_power:0' shape=() dtype=float32_ref>,
        #   <tf.Variable 'optimize/beta2_power:0' shape=() dtype=float32_ref>]`
        # which do not correspond to trainable vars, thus we did not get them as slot vars above.
        other_new_vars = []
        for v in tf.global_variables():
            if v in all_prev_existing_vars:
                continue
            if v in self.optimizer_vars:
                continue
            other_new_vars.append(v)
        if other_new_vars:
            print(
                "These additional variable were created by the optimizer: %s."
                % other_new_vars,
                file=log.v3)
            self.optimizer_vars += other_new_vars
        with tf.name_scope("optimizer_init_vars"):
            self.optimizer_init_vars_op = tf.variables_initializer(
                self.optimizer_vars, name="init_optim_slot_vars")

        if self.config.bool("debug_grad_summaries", False):
            from TFUtil import variable_summaries, get_base_name, reuse_name_scope_of_tensor
            for key in self.network.used_data_keys:
                data = self.network.extern_data.data[key]
                if data.sparse:
                    continue
                with reuse_name_scope_of_tensor(data.placeholder):
                    variable_summaries(data.placeholder)

        if self.config.bool(
                "debug_add_check_numerics_ops",
                False):  # also see debug_add_check_numerics_on_output
            print("Adding checks for inf/nan.", file=log.v3)
            self.optim_op = tf.group(self.optim_op,
                                     add_check_numerics_ops([self.optim_op]))

        if self.config.bool("debug_save_updater_vars", False):
            print("Save updater/optimizer vars:", file=log.v3)
            print(self.optimizer_vars)
            for v in self.optimizer_vars:
                if v not in self.network.extra_vars_to_save:
                    self.network.extra_vars_to_save.append(v)
            self.network.reset_saver()
Example #10
0
    def _get_apply_grads_op(self, loss, trainable_vars_for_gradients):
        """
    :param tf.Tensor loss:
    :param list[tf.Variable] trainable_vars_for_gradients:
    :return: op with all variable updates combined, using the optimizer
    :rtype: tf.Operation
    """
        if not trainable_vars_for_gradients:
            return tf.no_op(name="no_grad_vars_no_op")
        # AccumulateN might not be deterministic but should be faster and should require less memory.
        # We might want to make this configurable.
        if self.config.is_true("deterministic_train"):
            aggregation_method = tf.AggregationMethod.ADD_N
        else:
            aggregation_method = tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
        accum_grad_multiple_num_steps = self.config.int(
            "accum_grad_multiple_step", 0)
        grad_noise = self.config.float("gradient_noise", 0.0)
        grad_clip = self.config.float("gradient_clip", 0.0)
        grad_clip_norm = self.config.float("gradient_clip_norm", 0.0)
        grad_clip_avg_norm = self.config.float("gradient_clip_avg_norm", 0.0)
        grad_clip_global_norm = self.config.float("gradient_clip_global_norm",
                                                  0.0)
        # E.g. https://github.com/openai/baselines/blob/master/baselines/deepq/simple.py: grad_norm_clipping=10 -> tf.clip_by_norm

        # Extended self.optimizer.minimize() to optionally modify gradients.
        grads_and_vars = self.optimizer.compute_gradients(
            loss,
            var_list=trainable_vars_for_gradients,
            aggregation_method=aggregation_method)
        if self.config.is_true("use_horovod") and self.config.value(
                "horovod_reduce_type", "") == "grad":
            import horovod.tensorflow as hvd
            grads_and_vars = [(hvd.allreduce(
                grad, average=self.config.is_true("horovod_avg_grad"))
                               if grad is not None else None, var)
                              for (grad, var) in grads_and_vars]
        var_grads = {
            var: grad
            for (grad, var) in grads_and_vars if grad is not None
        }
        if not var_grads:
            raise Exception("no single variable to train")
        if self.config.float("maximize_grad_norm", 0):
            f = self.config.float("maximize_grad_norm", 0)
            grad_norm = tf.add_n(
                [tf.nn.l2_loss(g) for g in var_grads.values()],
                name="grad_norm_half") * 2.0
            loss_ext = grad_norm * (-f)
            grads_and_vars_ext = self.optimizer.compute_gradients(
                loss_ext,
                var_list=list(var_grads.keys()),
                aggregation_method=aggregation_method)
            var_grads_ext = {
                var: grad
                for (grad, var) in grads_and_vars_ext if grad is not None
            }
            grads_and_vars = [(grad + var_grads_ext.get(var, 0.0), var)
                              for (grad, var) in grads_and_vars]
        if accum_grad_multiple_num_steps >= 1:
            grads_and_vars = [(accum_grad_multiple_step(
                grad,
                var,
                train_step=self.network.global_train_step,
                num_accum_steps=accum_grad_multiple_num_steps), var)
                              for (grad, var) in grads_and_vars]
        if self.config.bool("debug_grad_summaries", False):
            from TFUtil import variable_summaries, get_base_name, reuse_name_scope_of_tensor
            for grad, var in grads_and_vars:
                with reuse_name_scope_of_tensor(grad, prefix="grads/"):
                    variable_summaries(grad,
                                       name="grad_of_%s" % get_base_name(var))
                with reuse_name_scope_of_tensor(var, prefix="vars/"):
                    variable_summaries(var, name=get_base_name(var))
        # Also see tf.contrib.layers.optimizers.optimize_loss() for reference.
        if self.config.bool("gradient_nan_inf_filter", False):
            from TFUtil import nan_to_num
            grads_and_vars = [(nan_to_num(grad, nan_num=0.0, inf_num=0.0), var)
                              for (grad, var) in grads_and_vars]
        if grad_noise:
            assert grad_noise > 0
            from TFUtil import add_scaled_noise_to_gradients
            with tf.name_scope("grad_noise"):
                grads_and_vars = add_scaled_noise_to_gradients(
                    grads_and_vars, grad_noise)
        if grad_clip:
            assert grad_clip > 0
            with tf.name_scope("grad_clip"):
                grads_and_vars = [(tf.clip_by_value(grad, -grad_clip,
                                                    grad_clip), var)
                                  for grad, var in grads_and_vars]
        if grad_clip_norm:
            assert grad_clip_norm > 0
            with tf.name_scope("grad_clip_norm"):
                grads_and_vars = [(tf.clip_by_norm(grad, grad_clip_norm), var)
                                  for grad, var in grads_and_vars]
        if grad_clip_avg_norm:
            assert grad_clip_avg_norm > 0
            with tf.name_scope("grad_clip_avg_norm"):
                grads_and_vars = [
                    (tf.clip_by_average_norm(grad, grad_clip_avg_norm), var)
                    for grad, var in grads_and_vars
                ]
        if grad_clip_global_norm:
            assert grad_clip_global_norm > 0
            with tf.name_scope("grad_clip_global_norm"):
                grads_clipped, _ = tf.clip_by_global_norm(
                    [grad for (grad, _) in grads_and_vars],
                    grad_clip_global_norm)
                grads_and_vars = zip(grads_clipped,
                                     [var for (_, var) in grads_and_vars])
        if accum_grad_multiple_num_steps >= 1:
            apply_grads = tf.cond(
                tf.equal(
                    tf.mod(self.network.global_train_step,
                           accum_grad_multiple_num_steps),
                    accum_grad_multiple_num_steps - 1),
                true_fn=lambda: self.optimizer.apply_gradients(grads_and_vars),
                false_fn=lambda: tf.no_op(),
                name="apply_grads/accum_grad_multiple_step")
        else:
            apply_grads = self.optimizer.apply_gradients(grads_and_vars)
        return apply_grads
Exemple #11
0
  def create_optim_op(self):
    """
    Creates the optimize op (``self.optim_op``): applies the gradients for the current loss
    to all trainable vars, increments the global train step, and handles any vars with a
    custom update. Also collects all variables created by the optimizer itself
    (slot vars and extras) into ``self.optimizer_vars`` and initializes them.

    Side effects (attributes set on self):
      - ``self.optim_op``: the op to run for one update step
      - ``self.optim_meta_losses``: fetch dict from the synthetic-gradient scope
      - ``self.optimizer_vars``: all optimizer-internal variables
      - ``self.optimizer_init_vars_op``: initializer op for those variables
    """
    assert self.loss is not None
    assert self.trainable_vars, "no variables to update/optimize"
    from TFUtil import SyntheticGradient

    # Keep track of all current available vars.
    # The optimizer could add some, even some which are not so-called "slot-vars",
    # and we want to keep track about them.
    all_vars = tf.global_variables()  # type: list[tf.Variable]

    # Lazily create the optimizer if it was not created yet.
    if not self.optimizer:
      self.create_optimizer()

    # Vars marked with a `custom_update` attribute are updated via their CustomUpdate object
    # below, and are excluded from normal gradient-based optimization.
    trainable_vars_for_gradients = list(self.trainable_vars)
    trainable_vars_custom_update = []  # type: list[tf.Variable]
    for v in self.trainable_vars:
      if hasattr(v, "custom_update"):
        trainable_vars_custom_update.append(v)
        trainable_vars_for_gradients.remove(v)

    with tf.variable_scope("optimize"):
      # Collect any synthetic-gradient losses which get registered while building the grads.
      synthetic_gradient_scope = SyntheticGradient.enter_gradient_scope()
      apply_grads = self._get_apply_grads_op(self.loss, trainable_vars_for_gradients)
      synthetic_gradient_scope.exit()
      self.optim_meta_losses = synthetic_gradient_scope.as_fetch_dict()
      if synthetic_gradient_scope.losses:
        # Optimize the summed synthetic-gradient (meta) losses as well,
        # grouped together with the main apply-grads op.
        with tf.name_scope("meta_loss"):
          meta_loss = tf.add_n(synthetic_gradient_scope.losses)
          meta_apply_grads = self._get_apply_grads_op(meta_loss, trainable_vars_for_gradients)
        apply_grads = tf.group(apply_grads, meta_apply_grads)
      # One optim step = apply grads + increment the global train step counter.
      incr_step_op = tf.assign_add(self.network.global_train_step, 1, name="global_train_step_increment")
      self.optim_op = tf.group(apply_grads, incr_step_op, name="optim_and_step_incr")

    # Append the custom per-var update ops (e.g. running averages) to the optim op.
    if trainable_vars_custom_update:
      with tf.variable_scope("custom_update"):
        updates = [self.optim_op]
        for param in trainable_vars_custom_update:
          custom_update = getattr(param, "custom_update")
          assert isinstance(custom_update, CustomUpdate)
          updates.append(custom_update.update_var(param))
        self.optim_op = tf.group(*updates)

    # Collect the optimizer slot vars (e.g. Adam moments), one per (slot_name, var) pair.
    print("Initialize optimizer with slots %s." % self.optimizer.get_slot_names(), file=log.v3)
    slot_vars = []
    for slot_name in self.optimizer.get_slot_names():
      for v in trainable_vars_for_gradients:
        slot_var = self.optimizer.get_slot(var=v, name=slot_name)
        assert slot_var is not None
        assert isinstance(slot_var, tf.Variable)
        slot_vars.append(slot_var)
    self.optimizer_vars = slot_vars  # type: list[tf.Variable]

    # Check if there were any other variables added.
    # E.g. currently (TF 1.0) the `AdamOptimizer` creates these additional vars
    # `[<tf.Variable 'optimize/beta1_power:0' shape=() dtype=float32_ref>,
    #   <tf.Variable 'optimize/beta2_power:0' shape=() dtype=float32_ref>]`
    # which do not correspond to trainable vars, thus we did not get them as slot vars above.
    # We detect them by diffing against the `all_vars` snapshot taken at the top.
    other_new_vars = []
    for v in tf.global_variables():
      if v in all_vars:
        continue
      if v in self.optimizer_vars:
        continue
      other_new_vars.append(v)
    if other_new_vars:
      print("These additional variable were created by the optimizer: %s." % other_new_vars, file=log.v3)
      self.optimizer_vars += other_new_vars
    # Build an init op covering all optimizer vars, and run the initialization now.
    with tf.name_scope("optimizer_init_vars"):
      self.optimizer_init_vars_op = tf.variables_initializer(self.optimizer_vars, name="init_optim_slot_vars")
    self.init_optimizer_vars()

    # Optionally add summaries for all (dense) input data placeholders, for debugging.
    if self.config.bool("debug_grad_summaries", False):
      from TFUtil import variable_summaries, get_base_name, reuse_name_scope_of_tensor
      for key in self.network.used_data_keys:
        data = self.network.extern_data.data[key]
        if data.sparse:
          continue
        with reuse_name_scope_of_tensor(data.placeholder):
          variable_summaries(data.placeholder)

    # Optionally wrap the optim op with inf/nan checks on the whole graph it depends on.
    if self.config.bool("debug_add_check_numerics_ops", False):
      print("Adding checks for inf/nan.", file=log.v3)
      self.optim_op = tf.group(self.optim_op, add_check_numerics_ops([self.optim_op]))

    # Optionally include the optimizer vars in the model checkpoints.
    if self.config.bool("debug_save_updater_vars", False):
      print("Save updater/optimizer vars:", file=log.v3)
      print(self.optimizer_vars)
      for v in self.optimizer_vars:
        if v not in self.network.extra_vars_to_save:
          self.network.extra_vars_to_save.append(v)
      # The saver must be recreated so it picks up the extra vars.
      self.network.reset_saver()