Code example #1
File: TFUpdater.py  Project: tazdriver/returnn
    def create_optim_op(self):
        if not self.optimizer:
            self.create_optimizer()

        assert self.loss is not None
        with tf.variable_scope("optimize"):
            # AccumulateN might not be deterministic but should be faster and should require less memory.
            # We might want to make this configurable.
            aggregation_method = tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
            grad_noise = self.config.float("gradient_noise", 0.0)
            grad_clip = self.config.float("gradient_clip", 0.0)
            grad_clip_global_norm = self.config.float(
                "gradient_clip_global_norm", 0.0)

            # Extended self.optimizer.minimize() to optionally modify gradients.
            grads_and_vars = self.optimizer.compute_gradients(
                self.loss,
                var_list=self.trainable_vars,
                aggregation_method=aggregation_method)
            if not [v for g, v in grads_and_vars if g is not None]:
                raise Exception("no single variable to train")
            # Also see tf.contrib.layers.optimizers.optimize_loss() for reference.
            if grad_noise:
                assert grad_noise > 0
                from TFUtil import add_scaled_noise_to_gradients
                grads_and_vars = add_scaled_noise_to_gradients(
                    grads_and_vars, grad_noise)
            if grad_clip:
                assert grad_clip > 0
                grads_and_vars = [(tf.clip_by_value(grad, -grad_clip,
                                                    grad_clip), var)
                                  for grad, var in grads_and_vars]
            if grad_clip_global_norm:
                assert grad_clip_global_norm > 0
                grads_clipped, _ = tf.clip_by_global_norm(
                    [grad for (grad, _) in grads_and_vars],
                    grad_clip_global_norm)
                grads_and_vars = zip(grads_clipped,
                                     [var for (_, var) in grads_and_vars])
            apply_grads = self.optimizer.apply_gradients(grads_and_vars)
            incr_step_op = tf.assign_add(self.network.global_train_step,
                                         1,
                                         name="global_train_step_increment")
            self.optim_op = tf.group(apply_grads,
                                     incr_step_op,
                                     name="optim_and_step_incr")

        print("Initialize optimizer with slots %s." %
              self.optimizer.get_slot_names(),
              file=log.v3)
        slot_vars = []
        for slot_name in self.optimizer.get_slot_names():
            for v in self.trainable_vars:
                slot_var = self.optimizer.get_slot(var=v, name=slot_name)
                assert slot_var is not None
                slot_vars.append(slot_var)
        self.tf_session.run(
            tf.variables_initializer(slot_vars, name="init_optim_slot_vars"))
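
The options read above ("gradient_noise", "gradient_clip", "gradient_clip_global_norm") come from the RETURNN config. The following is a hypothetical config excerpt as a minimal sketch: only the key names are taken from the self.config.float(...) calls in create_optim_op(); the values are illustrative.

# Hypothetical RETURNN config excerpt (values are made up for illustration).
gradient_noise = 0.3             # stddev of Gaussian noise added to the gradients; 0 disables it
gradient_clip = 0.0              # element-wise clipping threshold; 0 disables it
gradient_clip_global_norm = 5.0  # rescale all gradients if their global L2 norm exceeds 5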
Code example #2
  def _get_apply_grads_op(self, loss, trainable_vars_for_gradients):
    """
    :param tf.Tensor loss:
    :param list[tf.Variable] trainable_vars_for_gradients:
    :return: op with all variable updates combined, using the optimizer
    :rtype: tf.Operation
    """
    if not trainable_vars_for_gradients:
      return tf.no_op(name="no_grad_vars_no_op")
    # AccumulateN might not be deterministic but should be faster and should require less memory.
    # We might want to make this configurable.
    aggregation_method = tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
    grad_noise = self.config.float("gradient_noise", 0.0)
    grad_clip = self.config.float("gradient_clip", 0.0)
    grad_clip_global_norm = self.config.float("gradient_clip_global_norm", 0.0)
    # E.g. https://github.com/openai/baselines/blob/master/baselines/deepq/simple.py: grad_norm_clipping=10 -> tf.clip_by_norm

    # Extended self.optimizer.minimize() to optionally modify gradients.
    grads_and_vars = self.optimizer.compute_gradients(
      loss, var_list=trainable_vars_for_gradients,
      aggregation_method=aggregation_method)
    if not [v for g, v in grads_and_vars if g is not None]:
      raise Exception("no single variable to train")
    if self.config.bool("debug_grad_summaries", False):
      from TFUtil import variable_summaries, get_base_name, reuse_name_scope_of_tensor
      for grad, var in grads_and_vars:
        with reuse_name_scope_of_tensor(grad, prefix="grads/"):
          variable_summaries(grad, name="grad_of_%s" % get_base_name(var))
        with reuse_name_scope_of_tensor(var, prefix="vars/"):
          variable_summaries(var, name=get_base_name(var))
    # Also see tf.contrib.layers.optimizers.optimize_loss() for reference.
    if self.config.bool("gradient_nan_inf_filter", False):
      from TFUtil import nan_to_num
      grads_and_vars = [(nan_to_num(grad, nan_num=0.0, inf_num=0.0), var) for (grad, var) in grads_and_vars]
    if grad_noise:
      assert grad_noise > 0
      from TFUtil import add_scaled_noise_to_gradients
      grads_and_vars = add_scaled_noise_to_gradients(grads_and_vars, grad_noise)
    if grad_clip:
      assert grad_clip > 0
      grads_and_vars = [(tf.clip_by_value(grad, -grad_clip, grad_clip), var) for grad, var in grads_and_vars]
    if grad_clip_global_norm:
      assert grad_clip_global_norm > 0
      grads_clipped, _ = tf.clip_by_global_norm([grad for (grad, _) in grads_and_vars], grad_clip_global_norm)
      grads_and_vars = zip(grads_clipped, [var for (_, var) in grads_and_vars])
    apply_grads = self.optimizer.apply_gradients(grads_and_vars)
    return apply_grads
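
The gradient_clip and gradient_clip_global_norm branches above behave quite differently: tf.clip_by_value caps each gradient element independently, while tf.clip_by_global_norm rescales all gradients jointly so that their combined L2 norm does not exceed the threshold, preserving their relative directions. A small standalone TF 1.x sketch (not RETURNN code) shows the rescaling:

import tensorflow as tf  # TF 1.x graph mode, as in the examples above

grads = [tf.constant([3.0, 4.0]), tf.constant([0.0, 12.0])]
clipped, global_norm = tf.clip_by_global_norm(grads, clip_norm=5.0)
# global_norm = sqrt(3^2 + 4^2 + 0^2 + 12^2) = 13.0
# both tensors are scaled by 5/13, so their relative magnitudes are preserved
with tf.Session() as sess:
  print(sess.run(global_norm))  # 13.0
  print(sess.run(clipped))      # approx. [1.15, 1.54] and [0.0, 4.62]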
Code example #3
File: TFUpdater.py  Project: rwth-i6/returnn
  def _post_process_grad(self, grad, var, global_info):
    """
    :param tf.Tensor grad:
    :param tf.Variable var:
    :param WrapOptimizer._GetGlobalInfo global_info:
    :return: new grad, apply grad opts
    :rtype: tf.Tensor, dict[str]
    """
    updater_opts = self._get_updater_opts_from_var(var)

    accum_grad_multiple_num_steps = updater_opts.get(
      "accum_grad_multiple_step", self.config.int("accum_grad_multiple_step", 0))
    grad_noise = updater_opts.get("gradient_noise", self.config.float("gradient_noise", 0.0))
    grad_clip = updater_opts.get("gradient_clip", self.config.float("gradient_clip", 0.0))
    # E.g. https://github.com/openai/baselines/blob/master/baselines/deepq/simple.py:
    #   grad_norm_clipping=10 -> tf.clip_by_norm
    grad_clip_norm = updater_opts.get("gradient_clip_norm", self.config.float("gradient_clip_norm", 0.0))
    grad_clip_avg_norm = updater_opts.get("gradient_clip_avg_norm", self.config.float("gradient_clip_avg_norm", 0.0))
    grad_clip_global_norm = updater_opts.get(
      "gradient_clip_global_norm", self.config.float("gradient_clip_global_norm", 0.0))
    global_norm_tag = updater_opts.get(
      "global_norm_tag", self.config.value("global_norm_tag", None))
    grad_clip_global_norm_tag = updater_opts.get(
      "gradient_clip_global_norm_tag", self.config.value("gradient_clip_global_norm_tag", global_norm_tag))
    grad_norm_to_clip_to_zero = updater_opts.get(
      "grad_norm_to_clip_to_zero", self.config.float("grad_norm_to_clip_to_zero", 0.0))
    maximize_grad_norm = updater_opts.get("maximize_grad_norm", self.config.float("maximize_grad_norm", 0))

    if maximize_grad_norm:
      grad_ext = global_info.get_maximize_grad_norm_grad(maximize_grad_norm, var)
      if grad_ext is not None:
        grad += grad_ext

    if accum_grad_multiple_num_steps >= 1:
      grad = accum_grad_multiple_step(
        grad, var, train_step=self.global_train_step, num_accum_steps=accum_grad_multiple_num_steps)

    if updater_opts.get("debug_grad_summaries", self.config.bool_or_other("debug_grad_summaries", False)):
      from TFUtil import variable_summaries, get_base_name, reuse_name_scope_of_tensor
      with reuse_name_scope_of_tensor(grad, prefix="grads/"):
        variable_summaries(grad, name="grad_of_%s" % get_base_name(var))
      with reuse_name_scope_of_tensor(var, prefix="vars/"):
        variable_summaries(var, name=get_base_name(var))

    # Also see tf.contrib.layers.optimizers.optimize_loss() for reference.
    if grad_noise:
      assert grad_noise > 0
      from TFUtil import add_scaled_noise_to_gradients
      with tf.name_scope("grad_noise"):
        (grad, var), = add_scaled_noise_to_gradients([(grad, var)], grad_noise)
    if grad_clip:
      assert grad_clip > 0
      with tf.name_scope("grad_clip"):
        grad = tf.clip_by_value(grad, -grad_clip, grad_clip)
    if grad_clip_norm:
      assert grad_clip_norm > 0
      with tf.name_scope("grad_clip_norm"):
        grad = tf.clip_by_norm(grad, grad_clip_norm)
    if grad_clip_avg_norm:
      assert grad_clip_avg_norm > 0
      with tf.name_scope("grad_clip_avg_norm"):
        grad = tf.clip_by_average_norm(grad, grad_clip_avg_norm)
    if grad_clip_global_norm:
      assert grad_clip_global_norm > 0
      with tf.name_scope("grad_clip_global_norm"):
        grad = global_info.clip_by_global_norm(
          grad, clip_norm=grad_clip_global_norm, global_norm_tag=grad_clip_global_norm_tag)
    if updater_opts.get("gradient_nan_inf_filter", self.config.bool("gradient_nan_inf_filter", False)):
      from TFUtil import nan_to_num
      grad = nan_to_num(grad, nan_num=0.0, inf_num=0.0)
    if grad_norm_to_clip_to_zero:
      with tf.name_scope("grad_norm_to_clip_to_zero"):
        grad = global_info.set_zero_on_high_global_norm(
          grad, grad_norm_threshold=grad_norm_to_clip_to_zero, global_norm_tag=global_norm_tag)

    updater_opts.assert_all_read()

    opt_key, _ = self._get_optimizer_item_for_variable(var)
    apply_grad_opts = {
      "opt_key": opt_key, "accum_grad_multiple_num_steps": accum_grad_multiple_num_steps}
    return grad, apply_grad_opts
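
This variant adds two more per-tensor clipping modes (gradient_clip_norm, gradient_clip_avg_norm) on top of the element-wise and global-norm clipping seen in the previous examples. For a single gradient the three clip_by_* ops differ as follows; this is a standalone TF 1.x sketch, not RETURNN code:

import tensorflow as tf  # TF 1.x

g = tf.constant([3.0, 4.0])  # L2 norm 5.0, average L2 norm 5.0 / 2 = 2.5
by_value    = tf.clip_by_value(g, -2.0, 2.0)     # element-wise cap      -> [2.0, 2.0]
by_norm     = tf.clip_by_norm(g, 2.0)            # scale by 2.0 / 5.0    -> [1.2, 1.6]
by_avg_norm = tf.clip_by_average_norm(g, 2.0)    # scale by 2.0 / 2.5    -> [2.4, 3.2]
with tf.Session() as sess:
  print(sess.run([by_value, by_norm, by_avg_norm]))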
Code example #4
    def _get_apply_grads_op(self, loss, trainable_vars_for_gradients):
        """
    :param tf.Tensor loss:
    :param list[tf.Variable] trainable_vars_for_gradients:
    :return: op with all variable updates combined, using the optimizer
    :rtype: tf.Operation
    """
        if not trainable_vars_for_gradients:
            return tf.no_op(name="no_grad_vars_no_op")
        # AccumulateN might not be deterministic but should be faster and should require less memory.
        # We might want to make this configurable.
        if self.config.is_true("deterministic_train"):
            aggregation_method = tf.AggregationMethod.ADD_N
        else:
            aggregation_method = tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
        accum_grad_multiple_num_steps = self.config.int(
            "accum_grad_multiple_step", 0)
        grad_noise = self.config.float("gradient_noise", 0.0)
        grad_clip = self.config.float("gradient_clip", 0.0)
        grad_clip_norm = self.config.float("gradient_clip_norm", 0.0)
        grad_clip_avg_norm = self.config.float("gradient_clip_avg_norm", 0.0)
        grad_clip_global_norm = self.config.float("gradient_clip_global_norm",
                                                  0.0)
        # E.g. https://github.com/openai/baselines/blob/master/baselines/deepq/simple.py: grad_norm_clipping=10 -> tf.clip_by_norm

        # Extended self.optimizer.minimize() to optionally modify gradients.
        grads_and_vars = self.optimizer.compute_gradients(
            loss,
            var_list=trainable_vars_for_gradients,
            aggregation_method=aggregation_method)
        if self.config.is_true("use_horovod") and self.config.value(
                "horovod_reduce_type", "") == "grad":
            import horovod.tensorflow as hvd
            grads_and_vars = [(hvd.allreduce(
                grad, average=self.config.is_true("horovod_avg_grad"))
                               if grad is not None else None, var)
                              for (grad, var) in grads_and_vars]
        var_grads = {
            var: grad
            for (grad, var) in grads_and_vars if grad is not None
        }
        if not var_grads:
            raise Exception("no single variable to train")
        if self.config.float("maximize_grad_norm", 0):
            f = self.config.float("maximize_grad_norm", 0)
            grad_norm = tf.add_n(
                [tf.nn.l2_loss(g) for g in var_grads.values()],
                name="grad_norm_half") * 2.0
            loss_ext = grad_norm * (-f)
            grads_and_vars_ext = self.optimizer.compute_gradients(
                loss_ext,
                var_list=list(var_grads.keys()),
                aggregation_method=aggregation_method)
            var_grads_ext = {
                var: grad
                for (grad, var) in grads_and_vars_ext if grad is not None
            }
            grads_and_vars = [(grad + var_grads_ext.get(var, 0.0), var)
                              for (grad, var) in grads_and_vars]
        if accum_grad_multiple_num_steps >= 1:
            grads_and_vars = [(accum_grad_multiple_step(
                grad,
                var,
                train_step=self.network.global_train_step,
                num_accum_steps=accum_grad_multiple_num_steps), var)
                              for (grad, var) in grads_and_vars]
        if self.config.bool("debug_grad_summaries", False):
            from TFUtil import variable_summaries, get_base_name, reuse_name_scope_of_tensor
            for grad, var in grads_and_vars:
                with reuse_name_scope_of_tensor(grad, prefix="grads/"):
                    variable_summaries(grad,
                                       name="grad_of_%s" % get_base_name(var))
                with reuse_name_scope_of_tensor(var, prefix="vars/"):
                    variable_summaries(var, name=get_base_name(var))
        # Also see tf.contrib.layers.optimizers.optimize_loss() for reference.
        if self.config.bool("gradient_nan_inf_filter", False):
            from TFUtil import nan_to_num
            grads_and_vars = [(nan_to_num(grad, nan_num=0.0, inf_num=0.0), var)
                              for (grad, var) in grads_and_vars]
        if grad_noise:
            assert grad_noise > 0
            from TFUtil import add_scaled_noise_to_gradients
            with tf.name_scope("grad_noise"):
                grads_and_vars = add_scaled_noise_to_gradients(
                    grads_and_vars, grad_noise)
        if grad_clip:
            assert grad_clip > 0
            with tf.name_scope("grad_clip"):
                grads_and_vars = [(tf.clip_by_value(grad, -grad_clip,
                                                    grad_clip), var)
                                  for grad, var in grads_and_vars]
        if grad_clip_norm:
            assert grad_clip_norm > 0
            with tf.name_scope("grad_clip_norm"):
                grads_and_vars = [(tf.clip_by_norm(grad, grad_clip_norm), var)
                                  for grad, var in grads_and_vars]
        if grad_clip_avg_norm:
            assert grad_clip_avg_norm > 0
            with tf.name_scope("grad_clip_avg_norm"):
                grads_and_vars = [
                    (tf.clip_by_average_norm(grad, grad_clip_avg_norm), var)
                    for grad, var in grads_and_vars
                ]
        if grad_clip_global_norm:
            assert grad_clip_global_norm > 0
            with tf.name_scope("grad_clip_global_norm"):
                grads_clipped, _ = tf.clip_by_global_norm(
                    [grad for (grad, _) in grads_and_vars],
                    grad_clip_global_norm)
                grads_and_vars = zip(grads_clipped,
                                     [var for (_, var) in grads_and_vars])
        if accum_grad_multiple_num_steps >= 1:
            apply_grads = tf.cond(
                tf.equal(
                    tf.mod(self.network.global_train_step,
                           accum_grad_multiple_num_steps),
                    accum_grad_multiple_num_steps - 1),
                true_fn=lambda: self.optimizer.apply_gradients(grads_and_vars),
                false_fn=lambda: tf.no_op(),
                name="apply_grads/accum_grad_multiple_step")
        else:
            apply_grads = self.optimizer.apply_gradients(grads_and_vars)
        return apply_grads
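
This version additionally reads options for deterministic gradient aggregation, gradient accumulation over multiple steps, and Horovod gradient allreduce. Again a hypothetical config excerpt as a sketch: the key names are taken from the self.config calls in this _get_apply_grads_op(), the values are illustrative.

# Hypothetical RETURNN config excerpt (values are made up for illustration).
deterministic_train = True      # use AggregationMethod.ADD_N instead of EXPERIMENTAL_ACCUMULATE_N
accum_grad_multiple_step = 4    # accumulate gradients and apply them only every 4th step
gradient_clip_global_norm = 5.0
use_horovod = True
horovod_reduce_type = "grad"    # allreduce the gradients across workers
horovod_avg_grad = True         # average (rather than sum) the gradients in the allreduce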
Code example #5
File: TFUpdater.py  Project: sharmaannapurna/returnn
  def create_optim_op(self):
    # Keep track of all current available vars.
    # The optimizer could add some, even some which are not so-called "slot-vars",
    # and we want to keep track about them.
    all_vars = tf.global_variables()  # type: list[tf.Variable]

    if not self.optimizer:
      self.create_optimizer()

    assert self.loss is not None
    with tf.variable_scope("optimize"):
      # AccumulateN might not be deterministic but should be faster and should require less memory.
      # We might want to make this configurable.
      aggregation_method = tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
      grad_noise = self.config.float("gradient_noise", 0.0)
      grad_clip = self.config.float("gradient_clip", 0.0)
      grad_clip_global_norm = self.config.float("gradient_clip_global_norm", 0.0)

      # Extended self.optimizer.minimize() to optionally modify gradients.
      grads_and_vars = self.optimizer.compute_gradients(
        self.loss, var_list=self.trainable_vars,
        aggregation_method=aggregation_method)
      if not [v for g, v in grads_and_vars if g is not None]:
        raise Exception("no single variable to train")
      # Also see tf.contrib.layers.optimizers.optimize_loss() for reference.
      if self.config.bool("gradient_nan_inf_filter", False):
        from TFUtil import nan_to_num
        grads_and_vars = [(nan_to_num(grad), var) for (grad, var) in grads_and_vars]
      if grad_noise:
        assert grad_noise > 0
        from TFUtil import add_scaled_noise_to_gradients
        grads_and_vars = add_scaled_noise_to_gradients(grads_and_vars, grad_noise)
      if grad_clip:
        assert grad_clip > 0
        grads_and_vars = [(tf.clip_by_value(grad, -grad_clip, grad_clip), var) for grad, var in grads_and_vars]
      if grad_clip_global_norm:
        assert grad_clip_global_norm > 0
        grads_clipped, _ = tf.clip_by_global_norm([grad for (grad, _) in grads_and_vars], grad_clip_global_norm)
        grads_and_vars = zip(grads_clipped, [var for (_, var) in grads_and_vars])
      apply_grads = self.optimizer.apply_gradients(grads_and_vars)
      incr_step_op = tf.assign_add(self.network.global_train_step, 1, name="global_train_step_increment")
      self.optim_op = tf.group(apply_grads, incr_step_op, name="optim_and_step_incr")

    print("Initialize optimizer with slots %s." % self.optimizer.get_slot_names(), file=log.v3)
    slot_vars = []
    for slot_name in self.optimizer.get_slot_names():
      for v in self.trainable_vars:
        slot_var = self.optimizer.get_slot(var=v, name=slot_name)
        assert slot_var is not None
        assert isinstance(slot_var, tf.Variable)
        slot_vars.append(slot_var)
    self.optimizer_vars = slot_vars

    # Check if there were any other variables added.
    # E.g. currently (TF 1.0) the `AdamOptimizer` creates these additional vars
    # `[<tf.Variable 'optimize/beta1_power:0' shape=() dtype=float32_ref>,
    #   <tf.Variable 'optimize/beta2_power:0' shape=() dtype=float32_ref>]`
    # which do not correspond to trainable vars, thus we did not get them as slot vars above.
    other_new_vars = []
    for v in tf.global_variables():
      if v in all_vars:
        continue
      if v in self.optimizer_vars:
        continue
      other_new_vars.append(v)
    if other_new_vars:
      print("These additional variable were created by the optimizer: %s." % other_new_vars, file=log.v3)
      self.optimizer_vars += other_new_vars
    self.optimizer_init_vars_op = tf.variables_initializer(self.optimizer_vars, name="init_optim_slot_vars")
    self.init_optimizer_vars()
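
The extra bookkeeping in this variant exists because an optimizer can create variables beyond its named slots, e.g. AdamOptimizer's beta1_power / beta2_power mentioned in the comment above. A small standalone TF 1.x sketch (an assumed example, not RETURNN code) shows how such variables can be detected and initialized in the same way:

import tensorflow as tf  # TF 1.x

x = tf.get_variable("x", shape=[2], initializer=tf.zeros_initializer())
loss = tf.reduce_sum(tf.square(x - 1.0))
vars_before = set(tf.global_variables())
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
train_op = optimizer.minimize(loss)
print(optimizer.get_slot_names())  # ['m', 'v'] for Adam
# Everything the optimizer created: the 'm'/'v' slot vars plus the beta-power bookkeeping vars.
optimizer_vars = [v for v in tf.global_variables() if v not in vars_before]
init_optimizer_vars_op = tf.variables_initializer(optimizer_vars, name="init_optim_vars")
with tf.Session() as sess:
  sess.run(tf.variables_initializer([x]))
  sess.run(init_optimizer_vars_op)
  sess.run(train_op)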