Example no. 1
    def __init__(self, params, model, name="loss"):
        """Loss constructor.
    Note that loss constructors should not modify the TensorFlow graph; all
    graph construction should happen in the
    :meth:`self._compute_loss() <_compute_loss>` method.

    Args:
      params (dict): parameters describing the loss.
          All supported parameters are listed in :meth:`get_required_params`,
          :meth:`get_optional_params` functions.
      model (instance of a class derived from :class:`Model<models.model.Model>`):
          parent model that created this loss.
          Could be None if no model access is required for the use case.
      name (str): name for loss variable scope.

    Config parameters:

    * **dtype** --- data dtype. Could be either ``tf.float16`` or ``tf.float32``.
    """
        check_params(params, self.get_required_params(),
                     self.get_optional_params())
        self._params = copy.deepcopy(params)
        self._model = model

        if 'dtype' not in self._params:
            if self._model:
                self._params['dtype'] = self._model.get_tf_dtype()
            else:
                self._params['dtype'] = tf.float32

        self._name = name
Example no. 2
  def __init__(self, params, model, name="loss"):
    """Loss constructor.
    Note that loss constructors should not modify the TensorFlow graph; all
    graph construction should happen in the
    :meth:`self._compute_loss() <_compute_loss>` method.

    Args:
      params (dict): parameters describing the loss.
          All supported parameters are listed in :meth:`get_required_params`,
          :meth:`get_optional_params` functions.
      model (instance of a class derived from :class:`Model<models.model.Model>`):
          parent model that created this loss.
          Could be None if no model access is required for the use case.
      name (str): name for loss variable scope.

    Config parameters:

    * **dtype** --- data dtype. Could be either ``tf.float16`` or ``tf.float32``.
    """
    check_params(params, self.get_required_params(), self.get_optional_params())
    self._params = copy.deepcopy(params)
    self._model = model

    if 'dtype' not in self._params:
      if self._model:
        self._params['dtype'] = self._model.get_tf_dtype()
      else:
        self._params['dtype'] = tf.float32

    self._name = name
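
A minimal, illustrative stand-in for the check_params helper used by all of the constructors above (this is an assumption about its behavior, not the project's actual implementation): required parameters must be present, unknown parameters are rejected, and each value is checked against either an expected type or a list of allowed values.

def check_params(config, required_dict, optional_dict):
    """Illustrative sketch of parameter validation (not the project's code)."""
    for name in required_dict:
        if name not in config:
            raise ValueError("Required parameter '%s' is missing" % name)
    for name, value in config.items():
        if name in required_dict:
            expected = required_dict[name]
        elif name in optional_dict:
            expected = optional_dict[name]
        else:
            raise ValueError("Unknown parameter '%s'" % name)
        # a list means "one of these values", e.g. ['clip', 'scale']
        if isinstance(expected, list):
            if value not in expected:
                raise ValueError("'%s' has to be one of %s" % (name, expected))
        # a type means "value must be an instance of this type"
        elif isinstance(expected, type) and not isinstance(value, expected):
            raise ValueError("'%s' has to be of type %s" % (name, expected))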
Example no. 3
  def __init__(self, params, model, name="encoder", mode='train'):
    """Encoder constructor.
    Note that encoder constructors should not modify the TensorFlow graph; all
    graph construction should happen in the :meth:`self._encode() <_encode>`
    method.

    Args:
      params (dict): parameters describing the encoder.
          All supported parameters are listed in :meth:`get_required_params`,
          :meth:`get_optional_params` functions.
      model (instance of a class derived from :class:`Model<models.model.Model>`):
          parent model that created this encoder.
          Could be None if no model access is required for the use case.
      name (str): name for encoder variable scope.
      mode (str): mode encoder is going to be run in.
          Could be "train", "eval" or "infer".

    Config parameters:

    * **initializer** --- any valid TensorFlow initializer. If no initializer
      is provided, model initializer will be used.
    * **initializer_params** (dict) --- dictionary that will be passed to
      initializer ``__init__`` method.
    * **regularizer** --- any valid TensorFlow regularizer. If no regularizer
      is provided, model regularizer will be used.
    * **regularizer_params** (dict) --- dictionary that will be passed to
      regularizer ``__init__`` method.
    * **dtype** --- model dtype. Could be either ``tf.float16``, ``tf.float32``
      or "mixed". For details see
      :ref:`mixed precision training <mixed_precision>` section in docs. If no
      dtype is provided, model dtype will be used.
    """
    check_params(params, self.get_required_params(), self.get_optional_params())
    self._params = copy.deepcopy(params)
    self._model = model

    if 'dtype' not in self._params:
      if self._model:
        self._params['dtype'] = self._model.params['dtype']
      else:
        self._params['dtype'] = tf.float32

    if 'regularizer' not in self._params:
      if self._model and 'regularizer' in self._model.params:
        self._params['regularizer'] = self._model.params['regularizer']
        self._params['regularizer_params'] = self._model.params['regularizer_params']

    if 'regularizer' in self._params:
      init_dict = self._params.get('regularizer_params', {})
      self._params['regularizer'] = self._params['regularizer'](**init_dict)
      if self._params['dtype'] == 'mixed':
        self._params['regularizer'] = mp_regularizer_wrapper(
          self._params['regularizer'],
        )

    if self._params['dtype'] == 'mixed':
      self._params['dtype'] = tf.float16

    self._name = name
    self._mode = mode
Example no. 4
  def __init__(self, params):
    if params is None:
      params = {}
    check_params(
        config=params,
        required_dict={},
        optional_dict={
            'scale_min': float,
            'scale_max': float,
            'step_factor': float,
            'step_window': int
        },
    )
    self.scale_min = params.get('scale_min', 1.0)
    self.scale_max = params.get('scale_max', 2.**14)
    self.step_factor = params.get('step_factor', 2.0)
    self.step_window = params.get('step_window', 2000)

    self.iteration = tf.Variable(initial_value=0,
                                 trainable=False,
                                 dtype=tf.int64)
    self.last_overflow_iteration = tf.Variable(initial_value=-1,
                                               trainable=False,
                                               dtype=tf.int64)
    self.scale = tf.Variable(initial_value=self.scale_max,
                             trainable=False)
Example no. 5
  def __init__(self, params, model, name="decoder", mode='train'):
    """Decoder constructor.
    Note that decoder constructors should not modify the TensorFlow graph; all
    graph construction should happen in the :meth:`self._decode() <_decode>`
    method.

    Args:
      params (dict): parameters describing the decoder.
          All supported parameters are listed in :meth:`get_required_params`,
          :meth:`get_optional_params` functions.
      model (instance of a class derived from :class:`Model<models.model.Model>`):
          parent model that created this decoder.
          Could be None if no model access is required for the use case.
      name (str): name for decoder variable scope.
      mode (str): mode decoder is going to be run in.
          Could be "train", "eval" or "infer".

    Config parameters:

    * **initializer** --- any valid TensorFlow initializer. If no initializer
      is provided, model initializer will be used.
    * **initializer_params** (dict) --- dictionary that will be passed to
      initializer ``__init__`` method.
    * **regularizer** --- any valid TensorFlow regularizer. If no regularizer
      is provided, model regularizer will be used.
    * **regularizer_params** (dict) --- dictionary that will be passed to
      regularizer ``__init__`` method.
    * **dtype** --- model dtype. Could be either ``tf.float16``, ``tf.float32``
      or "mixed". For details see
      :ref:`mixed precision training <mixed_precision>` section in docs. If no
      dtype is provided, model dtype will be used.
    """
    check_params(params, self.get_required_params(), self.get_optional_params())
    self._params = copy.deepcopy(params)
    self._model = model

    if 'dtype' not in self._params:
      if self._model:
        self._params['dtype'] = self._model.params['dtype']
      else:
        self._params['dtype'] = tf.float32

    if 'regularizer' not in self._params:
      if self._model and 'regularizer' in self._model.params:
        self._params['regularizer'] = self._model.params['regularizer']
        self._params['regularizer_params'] = self._model.params['regularizer_params']

    if 'regularizer' in self._params:
      init_dict = self._params.get('regularizer_params', {})
      self._params['regularizer'] = self._params['regularizer'](**init_dict)
      if self._params['dtype'] == 'mixed':
        self._params['regularizer'] = mp_regularizer_wrapper(
          self._params['regularizer'],
        )

    if self._params['dtype'] == 'mixed':
      self._params['dtype'] = tf.float16

    self._name = name
    self._mode = mode
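
To make the dtype and regularizer handling above concrete, here is a hypothetical decoder configuration (the TF 1.x contrib regularizer factory and all values are examples, not taken from the project):

import tensorflow as tf

decoder_params = {
    'dtype': 'mixed',
    # a factory that will be instantiated with regularizer_params in __init__
    'regularizer': tf.contrib.layers.l2_regularizer,
    'regularizer_params': {'scale': 1e-4},
}
# After the constructor runs, params['regularizer'] holds the instantiated
# regularizer (wrapped by mp_regularizer_wrapper because dtype is 'mixed'),
# and params['dtype'] has been replaced by tf.float16.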
Example no. 6
    def __init__(self, params, model, num_workers=None, worker_id=None):
        """Data layer constructor.
    The TensorFlow graph should not be created here, but rather in the
    :meth:`self.build_graph() <build_graph>` method.

    Args:
      params (dict): parameters describing the data layer.
          All supported parameters are listed in :meth:`get_required_params`,
          :meth:`get_optional_params` functions.
      model (instance of a class derived from :class:`Model<models.model.Model>`):
          parent model that created this data layer.
          Could be None if no model access is required for the use case.
      num_workers (int): number of Horovod processes or None if Horovod is not used.
      worker_id (int): Horovod process id or None if Horovod is not used.

    Config parameters:

    * **shuffle** (bool) --- whether to shuffle dataset after an epoch.
      Typically will be True for train and False for inference and evaluation.
    * **dtype** --- data dtype. Could be either ``tf.float16`` or ``tf.float32``.
    """
        check_params(params, self.get_required_params(),
                     self.get_optional_params())
        self._params = copy.deepcopy(params)
        self._model = model

        if 'dtype' not in self._params:
            if self._model:
                self._params['dtype'] = self._model.get_tf_dtype()
            else:
                self._params['dtype'] = tf.float32

        if 'use_targets' not in params:
            self._params['use_targets'] = True

        if 'shuffle' not in params:
            if self._params['use_targets']:
                self._params['shuffle'] = True
            else:
                self._params['shuffle'] = False

        if self._params['use_targets'] is False and self._params['shuffle']:
            raise ValueError(
                "Shuffle should not be performed in inference mode")

        self._input_tensors = None

        # could be used for correct Horovod processing
        self._num_workers = num_workers
        self._worker_id = worker_id
Example no. 7
  def __init__(self, params):
    if params is None:
      params = {}
    check_params(
        config=params,
        required_dict={},
        optional_dict={
            'scale_min': float,
            'scale_max': float,
            'log_max': float,
            'beta1': float,
            'beta2': float,
            'overflow_std_dev': float
        },
    )
    self.scale_min = params.get('scale_min', 1.0)
    self.scale_max = params.get('scale_max', 2.**14)
    self.log_max = params.get('log_max', 16.)
    self.beta1 = params.get('beta1', 0.99)
    self.beta2 = params.get('beta2', 0.999)
    self.overflow_std_dev = params.get('overflow_std_dev', 3.09)

    self.iteration = tf.Variable(initial_value=0,
                                 trainable=False,
                                 dtype=tf.int64)
    self.scale = tf.Variable(initial_value=1.0,
                             trainable=False)
    self.x_hat = tf.Variable(initial_value=0,
                             trainable=False,
                             dtype=tf.float32)
    self.slow_x_hat = tf.Variable(initial_value=0,
                                  trainable=False,
                                  dtype=tf.float32)
    self.xsquared_hat = tf.Variable(initial_value=0,
                                    trainable=False,
                                    dtype=tf.float32)
    self.b1_correction = tf.Variable(initial_value=1.,
                                     trainable=False,
                                     dtype=tf.float32)
    self.b2_correction = tf.Variable(initial_value=1.,
                                     trainable=False,
                                     dtype=tf.float32)
Example no. 8
    def __init__(self, params, mode="train", hvd=None):
        """Model constructor.
    The TensorFlow graph should not be created here, but rather in the
    :meth:`self.compile() <compile>` method.

    Args:
      params (dict): parameters describing the model.
          All supported parameters are listed in :meth:`get_required_params`,
          :meth:`get_optional_params` functions.
      mode (string, optional): "train", "eval" or "infer".
          If mode is "train" all parts of the graph will be built
          (model, loss, optimizer).
          If mode is "eval", only model and loss will be built.
          If mode is "infer", only model will be built.
      hvd (optional): if Horovod is used, this should be
          ``horovod.tensorflow`` module.
          If Horovod is not used, it should be None.

    Config parameters:

    * **random_seed** (int) --- random seed to use.
    * **use_horovod** (bool) --- whether to use Horovod for distributed
      execution.
    * **num_gpus** (int) --- number of GPUs to use. This parameter cannot be
      used if ``gpu_ids`` is specified. When ``use_horovod`` is True
      this parameter is ignored.
    * **gpu_ids** (list of ints) --- GPU ids to use. This parameter cannot be
      used if ``num_gpus`` is specified. When ``use_horovod`` is True
      this parameter is ignored.
    * **batch_size_per_gpu** (int) --- batch size to use for each GPU.
    * **num_epochs** (int) --- number of epochs to run training for.
      This parameter cannot be used if ``max_steps`` is specified.
    * **max_steps** (int) --- number of steps to run training for.
      This parameter cannot be used if ``num_epochs`` is specified.
    * **save_summaries_steps** (int or None) --- how often to save summaries.
      Setting it to None disables summaries saving.
    * **print_loss_steps** (int or None) --- how often to print loss during
      training. Setting it to None disables loss printing.
    * **print_samples_steps** (int or None) --- how often to print training
      samples (input sequences, correct answers and model predictions).
      Setting it to None disables samples printing.
    * **print_bench_info_steps** (int or None) --- how often to print training
      benchmarking information (average number of objects processed per step).
      Setting it to None disables intermediate benchmarking printing, but
      the average information across the whole training will always be printed
      after the last iteration.
    * **save_checkpoint_steps** (int or None) --- how often to save model
      checkpoints. Setting it to None disables checkpoint saving.
    * **eval_steps** (int) --- how often to run evaluation during training.
      This parameter is only checked if ``--mode`` argument of ``run.py`` is
      "train\_eval". If no evaluation is needed you should use "train" mode.
    * **logdir** (string) --- path to the log directory where all checkpoints
      and summaries will be saved.
    * **data_layer** (any class derived from
      :class:`DataLayer <data.data_layer.DataLayer>`) --- data layer class
      to use.
    * **data_layer_params** (dict) --- dictionary with data layer
      configuration.
      For complete list of possible parameters see the corresponding
      class docs.
    * **optimizer** (string or TensorFlow optimizer class) --- optimizer to
      use for training. Could be either "Adam", "Adagrad", "Ftrl", "Momentum",
      "RMSProp", "SGD" or any valid TensorFlow optimizer class.
    * **optimizer_params** (dict) --- dictionary that will be passed to
      optimizer ``__init__`` method.
    * **initializer** --- any valid TensorFlow initializer.
    * **initializer_params** (dict) --- dictionary that will be passed to
      initializer ``__init__`` method.
    * **regularizer** --- any valid TensorFlow regularizer.
    * **regularizer_params** (dict) --- dictionary that will be passed to
      regularizer ``__init__`` method.
    * **dtype** --- model dtype. Could be either ``tf.float16``,
      ``tf.float32`` or "mixed". For details see
      :ref:`mixed precision training <mixed_precision>` section in docs.
    * **lr_policy** --- any valid learning rate policy function. For examples,
      see :any:`optimizers.lr_policies` module.
    * **lr_policy_params** (dict) --- dictionary containing lr_policy
      parameters.
    * **max_grad_norm** (float) --- maximum value of gradient norm. Clipping
      will be performed if some gradients exceed this value (this is checked
      for each variable independently).
    * **loss_scaling** --- could be float or string. If float, static loss
      scaling is applied. If string, the corresponding automatic
      loss scaling algorithm is used. Must be one of 'Backoff'
      or 'LogMax' (case insensitive). Only used when dtype="mixed". For details
      see :ref:`mixed precision training <mixed_precision>` section in docs.
    * **summaries** (list) --- which summaries to log. Could contain
      "learning_rate", "gradients", "gradient_norm", "global_gradient_norm",
      "variables", "variable_norm".
    * **iter_size** (int) --- use this parameter to emulate large batches.
      The gradients will be accumulated for ``iter_size`` number of steps before
      applying update.
    * **larc_params** --- dictionary with parameters for LARC (or LARS)
      optimization algorithms. Can contain the following parameters:

      * **larc_mode** --- Could be either "scale" (LARS) or "clip" (LARC).
        Note that it works in addition to any other optimization algorithm
        since we treat
        it as adaptive gradient clipping and learning rate adjustment.
      * **larc_eta** (float) --- LARC or LARS scaling parameter.
      * **min_update** (float) --- minimal value of the LARC (LARS) update.
      * **epsilon** (float) --- small number added to gradient norm in
        denominator for numerical stability.
    """
        check_params(params, self.get_required_params(),
                     self.get_optional_params())

        self._params = copy.deepcopy(params)

        if self._params.get('iter_size', 1) > 1 and hvd is None:
            raise ValueError("iter_size is only supported in Horovod mode")

        # parameter checks
        self._mode = mode
        if self._mode not in ["train", "infer", "eval"]:
            raise ValueError(
                "Mode has to be one of ['train', 'infer', 'eval']")

        if "max_steps" in params and "num_epochs" in params:
            raise ValueError(
                "You can't provide both max_steps and num_epochs. "
                "Please, remove one of them from the config.")
        if mode == "train":
            if "max_steps" not in params and "num_epochs" not in params:
                raise ValueError("For training mode either max_steps or "
                                 "num_epochs has to be provided")

        if 'print_samples_steps' not in self._params:
            self._params['print_samples_steps'] = None
        if 'print_loss_steps' not in self._params:
            self._params['print_loss_steps'] = None
        if 'save_checkpoint_steps' not in self._params:
            self._params['save_checkpoint_steps'] = None
        if 'save_summaries_steps' not in self._params:
            self._params['save_summaries_steps'] = None
        if 'print_bench_info_steps' not in self._params:
            self._params['print_bench_info_steps'] = None

        # checking that frequencies of samples and loss are aligned
        s_fr = self._params['print_samples_steps']
        l_fr = self._params['print_loss_steps']
        if s_fr is not None and l_fr is not None and s_fr % l_fr != 0:
            raise ValueError("print_samples_steps has to be a multiple of "
                             "print_loss_steps.")

        self._hvd = hvd
        if self._hvd:
            self._gpu_ids = range(1)
        else:
            if 'gpu_ids' in self._params:
                self._gpu_ids = self._params['gpu_ids']
            elif 'num_gpus' in self._params:
                self._gpu_ids = range(self._params['num_gpus'])
            else:
                raise ValueError('Either "gpu_ids" or "num_gpus" has to '
                                 'be specified in the config')

        # setting random seed
        rs = self._params.get('random_seed', int(time.time()))
        if self.on_horovod:
            rs += hvd.rank()
        tf.set_random_seed(rs)
        np.random.seed(rs)

        if 'dtype' not in self._params:
            self._params['dtype'] = tf.float32

        dl_params = self._params.get('data_layer_params', {})
        dl_params['batch_size'] = self._params['batch_size_per_gpu']
        dl_params['mode'] = self._mode

        if self.on_horovod:
            self._data_layer = self._params['data_layer'](
                params=dl_params,
                model=self,
                num_workers=self._hvd.size(),
                worker_id=self._hvd.rank(),
            )
        else:
            self._data_layers = []
            for worker_id in range(self.num_gpus):
                self._data_layers.append(self._params['data_layer'](
                    params=dl_params,
                    model=self,
                    num_workers=self.num_gpus,
                    worker_id=worker_id,
                ))

        if self._mode == "train":
            if "max_steps" in self._params:
                self._last_step = self._params["max_steps"]
                self._steps_in_epoch = None
            else:
                # doing slightly fewer steps if the data size is not divisible by the batch size
                self._steps_in_epoch = self.get_data_layer().get_size_in_samples() // \
                                       self.get_data_layer().params['batch_size']
                if self._steps_in_epoch is None:
                    raise ValueError(
                        'The data_layer is not compatible with '
                        'epoch execution, since it does not provide '
                        'get_size_in_samples() method. Either update the '
                        'data layer or switch to using "max_steps" '
                        'parameter.')
                if self.on_horovod:
                    self._steps_in_epoch //= self._hvd.size()
                else:
                    self._steps_in_epoch //= self.num_gpus
                self._steps_in_epoch //= self._params.get('iter_size', 1)
                if self._steps_in_epoch == 0:
                    raise ValueError(
                        "Overall batch size is too big for this dataset.")
                self._last_step = self._params[
                    'num_epochs'] * self._steps_in_epoch

        if self.on_horovod:
            self._output = None
        else:
            self._outputs = [None] * self.num_gpus

        self.loss = None
        self.train_op = None
        self.eval_losses = None
        self._num_objects_per_step = None
        self.skip_update_ph = None
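
The following hypothetical "train" configuration sketches how the parameters documented above fit together. Class- and function-valued entries (data_layer, lr_policy) are project-specific and left as comments; every name and value here is a placeholder, not taken from the project.

import tensorflow as tf

base_params = {
    "use_horovod": False,
    "num_gpus": 1,
    "batch_size_per_gpu": 32,
    "max_steps": 1000,                # mutually exclusive with "num_epochs"
    "save_summaries_steps": 100,
    "print_loss_steps": 10,
    "save_checkpoint_steps": 500,
    "logdir": "experiments/toy_run",
    "dtype": tf.float32,
    "optimizer": "Adam",
    "optimizer_params": {},
    # "lr_policy": <callable taking global_step and returning the learning rate>,
    # "data_layer": <a DataLayer subclass>,
    "data_layer_params": {},
}
# model = SomeModelSubclass(params=base_params, mode="train", hvd=None)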
Example no. 9
    def __init__(self, params, mode="train", hvd=None):
        """Model constructor.
    The TensorFlow graph should not be created here, but rather in the
    :meth:`self.compile() <compile>` method.

    Args:
      params (dict): parameters describing the model.
          All supported parameters are listed in :meth:`get_required_params`,
          :meth:`get_optional_params` functions.
      mode (string, optional): "train", "eval" or "infer".
          If mode is "train" all parts of the graph will be built
          (model, loss, optimizer).
          If mode is "eval", only model and loss will be built.
          If mode is "infer", only model will be built.
      hvd (optional): if Horovod is used, this should be
          ``horovod.tensorflow`` module.
          If Horovod is not used, it should be None.

    Config parameters:

    * **random_seed** (int) --- random seed to use.
    * **use_horovod** (bool) --- whether to use Horovod for distributed
      execution.
    * **num_gpus** (int) --- number of GPUs to use. This parameter cannot be
      used if ``gpu_ids`` is specified. When ``use_horovod`` is True
      this parameter is ignored.
    * **gpu_ids** (list of ints) --- GPU ids to use. This parameter cannot be
      used if ``num_gpus`` is specified. When ``use_horovod`` is True
      this parameter is ignored.
    * **batch_size_per_gpu** (int) --- batch size to use for each GPU.
    * **num_epochs** (int) --- number of epochs to run training for.
      This parameter cannot be used if ``max_steps`` is specified.
    * **max_steps** (int) --- number of steps to run training for.
      This parameter cannot be used if ``num_epochs`` is specified.
    * **save_summaries_steps** (int or None) --- how often to save summaries.
      Setting it to None disables summaries saving.
    * **print_loss_steps** (int or None) --- how often to print loss during
      training. Setting it to None disables loss printing.
    * **print_samples_steps** (int or None) --- how often to print training
      samples (input sequences, correct answers and model predictions).
      Setting it to None disables samples printing.
    * **save_checkpoint_steps** (int or None) --- how often to save model
      checkpoints. Setting it to None disables checkpoint saving.
    * **eval_steps** (int) --- how often to run evaluation during training.
      This parameter is only checked if ``--mode`` argument of ``run.py`` is
      "train\_eval". If no evaluation is needed you should use "train" mode.
    * **logdir** (string) --- path to the log directory where all checkpoints
      and summaries will be saved.
    * **data_layer** (any class derived from
      :class:`DataLayer <data.data_layer.DataLayer>`) --- data layer class
      to use.
    * **data_layer_params** (dict) --- dictionary with data layer
      configuration.
      For complete list of possible parameters see the corresponding
      class docs.
    * **learning_rate** (float) --- initial learning rate for training.
    * **optimizer** (string or TensorFlow optimizer class) --- optimizer to
      use for training. Could be either "Adam", "Adagrad", "Ftrl", "Momentum",
      "RMSProp", "SGD" or any valid TensorFlow optimizer class.
    * **optimizer_params** (dict) --- dictionary that will be passed to
      optimizer ``__init__`` method.
    * **initializer** --- any valid TensorFlow initializer.
    * **initializer_params** (dict) --- dictionary that will be passed to
      initializer ``__init__`` method.
    * **regularizer** --- any valid TensorFlow regularizer.
    * **regularizer_params** (dict) --- dictionary that will be passed to
      regularizer ``__init__`` method.
    * **dtype** --- model dtype. Could be either ``tf.float16``,
      ``tf.float32`` or "mixed". For details see
      :ref:`mixed precision training <mixed_precision>` section in docs.
    * **lr_policy** --- any valid learning rate policy function. For examples,
      see :any:`optimizers.lr_policies` module.
    * **lr_policy_params** (dict) --- dictionary containing lr_policy
      parameters.
    * **max_grad_norm** (float) --- maximum value of gradient norm. Clipping
      will be performed if some gradients exceed this value (this is checked
      for each variable independently).
    * **larc_mode** --- specify this to use LARC or LARS optimization
      algorithms. Could be either "scale" (LARS) or "clip" (LARC).
      You also need to specify ``larc_nu`` to enable LARC or LARS. Note that
      it works in addition to any other optimization algorithm since we treat
      it as adaptive gradient clipping and learning rate adjustment.
    * **larc_nu** (float) --- LARC or LARS scaling parameter.
    * **loss_scale** (float) --- static loss scale to use. For details see
      :ref:`mixed precision training <mixed_precision>` section in docs.
    * **automatic_loss_scaling** --- automatic loss scaling mode. Could be
      either None, "Backoff" or "Logmax". For details see
      :ref:`mixed precision training <mixed_precision>` section in docs.
    * **summaries** (list) --- which summaries to log. Could contain
      "learning_rate", "gradients", "gradient_norm", "global_gradient_norm",
      "variables", "variable_norm".
    """
        check_params(params, self.get_required_params(),
                     self.get_optional_params())

        self._params = copy.deepcopy(params)

        # parameter checks
        self._mode = mode
        if self._mode not in ["train", "infer", "eval"]:
            raise ValueError(
                "Mode has to be one of ['train', 'infer', 'eval']")

        if "max_steps" in params and "num_epochs" in params:
            raise ValueError(
                "You can't provide both max_steps and num_epochs. "
                "Please, remove one of them from the config.")
        if mode == "train":
            if "max_steps" not in params and "num_epochs" not in params:
                raise ValueError("For training mode either max_steps or "
                                 "num_epochs has to be provided")

        if 'print_samples_steps' not in self._params:
            self._params['print_samples_steps'] = None
        if 'print_loss_steps' not in self._params:
            self._params['print_loss_steps'] = None
        if 'save_checkpoint_steps' not in self._params:
            self._params['save_checkpoint_steps'] = None
        if 'save_summaries_steps' not in self._params:
            self._params['save_summaries_steps'] = None

        # checking that frequencies of samples and loss are aligned
        s_fr = self._params['print_samples_steps']
        l_fr = self._params['print_loss_steps']
        if s_fr is not None and l_fr is not None and s_fr % l_fr != 0:
            raise ValueError("print_samples_steps has to be a multiple of "
                             "print_loss_steps.")

        self._hvd = hvd
        if self._hvd:
            self._gpu_ids = range(1)
        else:
            if 'gpu_ids' in self._params:
                self._gpu_ids = self._params['gpu_ids']
            elif 'num_gpus' in self._params:
                self._gpu_ids = range(self._params['num_gpus'])
            else:
                raise ValueError('Either "gpu_ids" or "num_gpus" has to '
                                 'be specified in the config')

        # setting random seed
        rs = self._params.get('random_seed', int(time.time()))
        if self.on_horovod:
            rs += hvd.rank()
        tf.set_random_seed(rs)
        np.random.seed(rs)

        if 'dtype' not in self._params:
            self._params['dtype'] = tf.float32

        dl_params = self._params.get('data_layer_params', {})
        dl_params['batch_size'] = self._params['batch_size_per_gpu']
        dl_params['use_targets'] = (self._mode == "train"
                                    or self._mode == "eval")

        if self.on_horovod:
            self._data_layer = self._params['data_layer'](
                params=dl_params,
                model=self,
                num_workers=self._hvd.size(),
                worker_id=self._hvd.rank(),
            )
        else:
            dl = self._params['data_layer'](params=dl_params, model=self)
            self._data_layer = MultiGPUWrapper(dl, num_gpus=self.num_gpus)

        if self._mode == "train":
            if "max_steps" in self._params:
                self._last_step = self._params["max_steps"]
                self._steps_in_epoch = None
            else:
                # doing slightly fewer steps if the data size is not divisible by the batch size
                self._steps_in_epoch = self._data_layer.get_size_in_batches()
                # if on Horovod, there will be hvd.size() independent data_layer copies
                # and thus the total size is hvd.size() times smaller.
                if self.on_horovod:
                    self._steps_in_epoch //= self._hvd.size()
                self._last_step = self._params[
                    'num_epochs'] * self._steps_in_epoch

        self._outputs = [None] * self.num_gpus
        self.loss = None
        self.train_op = None
Example no. 10
  def __init__(self, params, mode="train", hvd=None):
    """Model constructor.
    The TensorFlow graph should not be created here, but rather in the
    :meth:`self.compile() <compile>` method.

    Args:
      params (dict): parameters describing the model.
          All supported parameters are listed in :meth:`get_required_params`,
          :meth:`get_optional_params` functions.
      mode (string, optional): "train", "eval" or "infer".
          If mode is "train" all parts of the graph will be built
          (model, loss, optimizer).
          If mode is "eval", only model and loss will be built.
          If mode is "infer", only model will be built.
      hvd (optional): if Horovod is used, this should be
          ``horovod.tensorflow`` module.
          If Horovod is not used, it should be None.

    Config parameters:

    * **random_seed** (int) --- random seed to use.
    * **use_horovod** (bool) --- whether to use Horovod for distributed
      execution.
    * **num_gpus** (int) --- number of GPUs to use. This parameter cannot be
      used if ``gpu_ids`` is specified. When ``use_horovod`` is True
      this parameter is ignored.
    * **gpu_ids** (list of ints) --- GPU ids to use. This parameter cannot be
      used if ``num_gpus`` is specified. When ``use_horovod`` is True
      this parameter is ignored.
    * **batch_size_per_gpu** (int) --- batch size to use for each GPU.
    * **num_epochs** (int) --- number of epochs to run training for.
      This parameter cannot be used if ``max_steps`` is specified.
    * **max_steps** (int) --- number of steps to run training for.
      This parameter cannot be used if ``num_epochs`` is specified.
    * **save_summaries_steps** (int or None) --- how often to save summaries.
      Setting it to None disables summaries saving.
    * **print_loss_steps** (int or None) --- how often to print loss during
      training. Setting it to None disables loss printing.
    * **print_samples_steps** (int or None) --- how often to print training
      samples (input sequences, correct answers and model predictions).
      Setting it to None disables samples printing.
    * **save_checkpoint_steps** (int or None) --- how often to save model
      checkpoints. Setting it to None disables checkpoint saving.
    * **eval_steps** (int) --- how often to run evaluation during training.
      This parameter is only checked if ``--mode`` argument of ``run.py`` is
      "train\_eval". If no evaluation is needed you should use "train" mode.
    * **logdir** (string) --- path to the log directory where all checkpoints
      and summaries will be saved.
    * **data_layer** (any class derived from
      :class:`DataLayer <data.data_layer.DataLayer>`) --- data layer class
      to use.
    * **data_layer_params** (dict) --- dictionary with data layer
      configuration.
      For complete list of possible parameters see the corresponding
      class docs.
    * **optimizer** (string or TensorFlow optimizer class) --- optimizer to
      use for training. Could be either "Adam", "Adagrad", "Ftrl", "Momentum",
      "RMSProp", "SGD" or any valid TensorFlow optimizer class.
    * **optimizer_params** (dict) --- dictionary that will be passed to
      optimizer ``__init__`` method.
    * **initializer** --- any valid TensorFlow initializer.
    * **initializer_params** (dict) --- dictionary that will be passed to
      initializer ``__init__`` method.
    * **regularizer** --- any valid TensorFlow regularizer.
    * **regularizer_params** (dict) --- dictionary that will be passed to
      regularizer ``__init__`` method.
    * **dtype** --- model dtype. Could be either ``tf.float16``,
      ``tf.float32`` or "mixed". For details see
      :ref:`mixed precision training <mixed_precision>` section in docs.
    * **lr_policy** --- any valid learning rate policy function. For examples,
      see :any:`optimizers.lr_policies` module.
    * **lr_policy_params** (dict) --- dictionary containing lr_policy
      parameters.
    * **max_grad_norm** (float) --- maximum value of gradient norm. Clipping
      will be performed if some gradients exceed this value (this is checked
      for each variable independently).
    * **loss_scale** (float) --- static loss scale to use. For details see
      :ref:`mixed precision training <mixed_precision>` section in docs.
    * **automatic_loss_scaling** --- automatic loss scaling mode. Could be
      either None, "Backoff" or "Logmax". For details see
      :ref:`mixed precision training <mixed_precision>` section in docs.
    * **summaries** (list) --- which summaries to log. Could contain
      "learning_rate", "gradients", "gradient_norm", "global_gradient_norm",
      "variables", "variable_norm".
    * **larc_params** --- dictionary with parameters for LARC (or LARS)
      optimization algorithms. Can contain the following parameters:

      * **larc_mode** --- Could be either "scale" (LARS) or "clip" (LARC).
        Note that it works in addition to any other optimization algorithm
        since we treat
        it as adaptive gradient clipping and learning rate adjustment.
      * **larc_eta** (float) --- LARC or LARS scaling parameter.
      * **min_update** (float) --- minimal value of the LARC (LARS) update.
      * **epsilon** (float) --- small number added to gradient norm in
        denominator for numerical stability.
    """
    check_params(params, self.get_required_params(), self.get_optional_params())

    self._params = copy.deepcopy(params)

    # parameter checks
    self._mode = mode
    if self._mode not in ["train", "infer", "eval"]:
      raise ValueError("Mode has to be one of ['train', 'infer', 'eval']")

    if "max_steps" in params and "num_epochs" in params:
      raise ValueError("You can't provide both max_steps and num_epochs. "
                       "Please, remove one of them from the config.")
    if mode == "train":
      if "max_steps" not in params and "num_epochs" not in params:
        raise ValueError("For training mode either max_steps or "
                         "num_epochs has to be provided")

    if 'print_samples_steps' not in self._params:
      self._params['print_samples_steps'] = None
    if 'print_loss_steps' not in self._params:
      self._params['print_loss_steps'] = None
    if 'save_checkpoint_steps' not in self._params:
      self._params['save_checkpoint_steps'] = None
    if 'save_summaries_steps' not in self._params:
      self._params['save_summaries_steps'] = None

    # checking that frequencies of samples and loss are aligned
    s_fr = self._params['print_samples_steps']
    l_fr = self._params['print_loss_steps']
    if s_fr is not None and l_fr is not None and s_fr % l_fr != 0:
      raise ValueError("print_samples_steps has to be a multiple of "
                       "print_loss_steps.")

    self._hvd = hvd
    if self._hvd:
      self._gpu_ids = range(1)
    else:
      if 'gpu_ids' in self._params:
        self._gpu_ids = self._params['gpu_ids']
      elif 'num_gpus' in self._params:
        self._gpu_ids = range(self._params['num_gpus'])
      else:
        raise ValueError('Either "gpu_ids" or "num_gpus" has to '
                         'be specified in the config')

    # setting random seed
    rs = self._params.get('random_seed', int(time.time()))
    if self.on_horovod:
      rs += hvd.rank()
    tf.set_random_seed(rs)
    np.random.seed(rs)

    if 'dtype' not in self._params:
      self._params['dtype'] = tf.float32

    dl_params = self._params.get('data_layer_params', {})
    dl_params['batch_size'] = self._params['batch_size_per_gpu']
    dl_params['mode'] = self._mode

    if self.on_horovod:
      self._data_layer = self._params['data_layer'](
        params=dl_params, model=self,
        num_workers=self._hvd.size(), worker_id=self._hvd.rank(),
      )
    else:
      self._data_layers = []
      for worker_id in range(self.num_gpus):
        self._data_layers.append(self._params['data_layer'](
          params=dl_params, model=self,
          num_workers=self.num_gpus, worker_id=worker_id,
        ))

    if self._mode == "train":
      if "max_steps" in self._params:
        self._last_step = self._params["max_steps"]
        self._steps_in_epoch = None
      else:
        # doing slightly fewer steps if the data size is not divisible by the batch size
        self._steps_in_epoch = self.get_data_layer().get_size_in_samples() // \
                               self.get_data_layer().params['batch_size']
        if self._steps_in_epoch is None:
          raise ValueError('The data_layer is not compatible with '
                           'epoch execution, since it does not provide '
                           'get_size_in_samples() method. Either update the '
                           'data layer or switch to using "max_steps" '
                           'parameter.')
        if self.on_horovod:
          self._steps_in_epoch //= self._hvd.size()
        else:
          self._steps_in_epoch //= self.num_gpus
        self._last_step = self._params['num_epochs'] * self._steps_in_epoch

    if self.on_horovod:
      self._output = None
    else:
      self._outputs = [None] * self.num_gpus
    self.loss = None
    self.train_op = None
    self.eval_losses = None
Example no. 11
def post_process_gradients(grads_and_vars, summaries, lr, clip_gradients,
                           larc_params):
    """Applies post processing to gradients, i.e. clipping, LARC, summaries."""
    if "global_gradient_norm" in summaries:
        tf.summary.scalar(
            "global_gradient_norm",
            _global_norm_with_cast(grads_and_vars),
        )

    # Optionally clip gradients by global norm.
    if clip_gradients is not None:
        grads_and_vars = _clip_gradients_by_norm(grads_and_vars,
                                                 clip_gradients)

    # Add histograms for variables, gradients and gradient norms.
    for gradient, variable in grads_and_vars:
        if isinstance(gradient, tf.IndexedSlices):
            grad_values = gradient.values
        else:
            grad_values = gradient

        if isinstance(variable, tf.IndexedSlices):
            var_values = variable.values
        else:
            var_values = variable

        if grad_values is not None:
            var_name = variable.name.replace(":", "_")
            if "gradients" in summaries:
                # need to mask nans for automatic loss scaling
                tf.summary.histogram("gradients/%s" % var_name,
                                     mask_nans(grad_values))
            if "gradient_norm" in summaries:
                tf.summary.scalar("gradient_norm/%s" % var_name,
                                  tf.norm(grad_values))
            if "variables" in summaries:
                tf.summary.histogram("variables/%s" % var_name, var_values)
            if "variable_norm" in summaries:
                tf.summary.scalar("variable_norm/%s" % var_name,
                                  tf.norm(var_values))

    if clip_gradients is not None and "global_gradient_norm" in summaries:
        tf.summary.scalar(
            "global_clipped_gradient_norm",
            _global_norm_with_cast(grads_and_vars),
        )

    # LARC gradient re-scaling
    if larc_params is not None:
        check_params(
            config=larc_params,
            required_dict={'larc_eta': float},
            optional_dict={
                'larc_mode': ['clip', 'scale'],
                'min_update': float,
                'epsilon': float
            },
        )
        larc_eta = larc_params['larc_eta']
        larc_mode = larc_params.get('larc_mode', 'clip')
        min_update = larc_params.get('min_update', 1e-7)
        eps = larc_params.get('epsilon', 1e-7)

        grads_and_vars_larc = [None] * len(grads_and_vars)
        for idx, (g, v) in enumerate(grads_and_vars):
            var_dtype = v.dtype
            v_norm = tf.norm(tensor=tf.cast(v, tf.float32), ord=2)
            g_norm = tf.norm(tensor=tf.cast(g, tf.float32), ord=2)

            if larc_mode == 'clip':
                larc_grad_update = tf.maximum(
                    larc_eta * v_norm / (lr * (g_norm + eps)),
                    min_update,
                )
                if "larc_summaries" in summaries:
                    tf.summary.scalar(
                        'larc_clip_on/{}'.format(v.name),
                        tf.cast(tf.less(larc_grad_update, 1.0), tf.int32))
                larc_grad_update = tf.minimum(larc_grad_update, 1.0)
            else:
                larc_grad_update = tf.maximum(
                    larc_eta * v_norm / (g_norm + eps),
                    min_update,
                )
            larc_grad_update = tf.saturate_cast(larc_grad_update, var_dtype)
            grads_and_vars_larc[idx] = (larc_grad_update * g, v)

            # adding additional summary
            if "larc_summaries" in summaries:
                tf.summary.scalar('larc_grad_update/{}'.format(v.name),
                                  larc_grad_update)
                tf.summary.scalar("larc_final_lr/{}".format(v.name),
                                  tf.cast(lr, var_dtype) * larc_grad_update)
        grads_and_vars = grads_and_vars_larc
    return grads_and_vars
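
As a standalone illustration of the per-variable LARC/LARS re-scaling computed above, here is the same formula in NumPy (illustrative only, not the project's code path):

import numpy as np

def larc_scale(g, v, lr, larc_eta, larc_mode='clip', min_update=1e-7, eps=1e-7):
    """Return the LARC/LARS re-scaled gradient for one (gradient, variable) pair."""
    v_norm = np.linalg.norm(v)
    g_norm = np.linalg.norm(g)
    if larc_mode == 'clip':
        # LARC: cap the effective per-variable update at the global lr
        update = max(larc_eta * v_norm / (lr * (g_norm + eps)), min_update)
        update = min(update, 1.0)
    else:
        # 'scale' (LARS): pure norm-ratio re-scaling, independent of lr
        update = max(larc_eta * v_norm / (g_norm + eps), min_update)
    return update * g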
Example no. 12
def optimize_loss(loss,
                  optimizer,
                  optimizer_params,
                  learning_rate_decay_fn,
                  global_step=None,
                  dtype=tf.float32,
                  gradient_noise_scale=None,
                  gradient_multipliers=None,
                  clip_gradients=None,
                  update_ops=None,
                  variables=None,
                  name=None,
                  summaries=None,
                  colocate_gradients_with_ops=False,
                  increment_global_step=True,
                  larc_params=None,
                  loss_scale=1.0,
                  automatic_loss_scaling=None,
                  on_horovod=False):
  """Given loss and parameters for optimizer, returns a training op.

  Various ways of passing optimizers include:

  - by string specifying the name of the optimizer. See OPTIMIZER_CLS_NAMES
      for full list. E.g. `optimize_loss(..., optimizer='Adam')`.
  - by function taking learning rate `Tensor` as argument and returning an
      `Optimizer` instance. E.g. `optimize_loss(...,
      optimizer=lambda lr: tf.train.MomentumOptimizer(lr, momentum=0.5))`.
    Alternatively, if `learning_rate` is `None`, the function takes no
    arguments. E.g. `optimize_loss(..., learning_rate=None,
      optimizer=lambda: tf.train.MomentumOptimizer(0.5, momentum=0.5))`.
  - by a subclass of `Optimizer` having a single-argument constructor
      (the argument is the learning rate), such as AdamOptimizer or
      AdagradOptimizer. E.g. `optimize_loss(...,
      optimizer=tf.train.AdagradOptimizer)`.
  - by an instance of a subclass of `Optimizer`.
      E.g., `optimize_loss(..., optimizer=tf.train.AdagradOptimizer(0.5))`.

  Args:
    loss: Scalar `Tensor`.
    global_step: Scalar int `Tensor`, step counter to update on each step
                 unless `increment_global_step` is `False`. If not supplied,
                 it will be fetched from the default graph (see
                 `tf.train.get_global_step` for details). If it has
                 not been created, no step will be incremented with each weight
                 update. `learning_rate_decay_fn` requires `global_step`.
    optimizer_params: dict, parameters that will be passed to the optimizer's
                      ``__init__`` method (for example, ``momentum``).
    optimizer: string, class or optimizer instance, used as trainer.
               string should be name of optimizer, like 'SGD',
                 'Adam', 'Adagrad'. Full list in OPTIMIZER_CLS_NAMES constant.
               class should be sub-class of `tf.Optimizer` that implements
                 `compute_gradients` and `apply_gradients` functions.
               optimizer instance should be instantiation of `tf.Optimizer`
                 sub-class and have `compute_gradients` and `apply_gradients`
                 functions.
    gradient_noise_scale: float or None, adds 0-mean normal noise scaled by this
                          value.
    gradient_multipliers: dict of variables or variable names to floats.
                          If present, gradients for specified
                          variables will be multiplied by given constant.
    clip_gradients: float, callable or `None`. If a float is provided, global
      clipping is applied to prevent the norm of the gradients from exceeding
      this value. Alternatively, a callable can be provided, e.g. adaptive_clipping.
      This callable takes a `list` of `(gradients, variables)` `tuple`s and
      returns the same thing with the gradients modified.
    learning_rate_decay_fn: function, takes the `global_step` `Tensor` and
                            returns the learning rate `Tensor` to use for the
                            current step. Can be used to implement any
                            learning rate decay or warmup policy.
    update_ops: list of update `Operation`s to execute at each step. If `None`,
                uses elements of UPDATE_OPS collection. The order of execution
                between `update_ops` and `loss` is non-deterministic.
    variables: list of variables to optimize or
               `None` to use all trainable variables.
    name: The name for this operation is used to scope operations and summaries.
    summaries: List of internal quantities to visualize on tensorboard. If not
               set only the loss and the learning rate will be reported. The
               complete list is in OPTIMIZER_SUMMARIES.
    colocate_gradients_with_ops: If True, try colocating gradients with the
                                 corresponding op.
    increment_global_step: Whether to increment `global_step`. If your model
      calls `optimize_loss` multiple times per training step (e.g. to optimize
      different parts of the model), use this arg to avoid incrementing
      `global_step` more times than necessary.
    larc_params: dict with LARC (LARS) parameters or `None`. If not `None`,
                 LARC re-scaling (https://arxiv.org/pdf/1708.03888.pdf) is
                 applied. Required key: 'larc_eta' (float); optional keys:
                 'larc_mode' ('clip' or 'scale'), 'min_update' (float) and
                 'epsilon' (float).
    automatic_loss_scaling: if not None, use the corresponding automatic
                            loss scaling algorithm. Must be one of 'Backoff'
                            or 'LogMax'. `dtype` must be "mixed" to use ALS.
  Returns:
    Training op.

  Raises:
    ValueError: if:
        * `loss` is an invalid type or shape.
        * `global_step` is an invalid type or shape.
        * `learning_rate` is an invalid type or value.
        * `optimizer` has the wrong type.
        * `clip_gradients` is neither float nor callable.
        * `learning_rate` and `learning_rate_decay_fn` are supplied, but no
          `global_step` is available.
        * `gradients` is empty.
  """
  loss = ops.convert_to_tensor(loss)
  contrib_framework.assert_scalar(loss)
  if global_step is None:
    global_step = tf.train.get_or_create_global_step()
  else:
    tf.train.assert_global_step(global_step)
  with vs.variable_scope(name, "OptimizeLoss", [loss, global_step]):
    # Update ops take UPDATE_OPS collection if not provided.
    if update_ops is None:
      update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
    # Make sure update ops are run before computing loss.
    if update_ops:
      loss = control_flow_ops.with_dependencies(list(update_ops), loss)

    if summaries is None:
      summaries = ["learning_rate", "global_gradient_norm"]
    else:
      for summ in summaries:
        if summ not in OPTIMIZER_SUMMARIES:
          raise ValueError("Summaries should be one of [%s], you provided %s." %
                           (", ".join(OPTIMIZER_SUMMARIES), summ))
    if global_step is None:
      raise ValueError("global_step is required for learning_rate_decay_fn.")
    lr = learning_rate_decay_fn(global_step)

    if "learning_rate" in summaries:
      summary.scalar("learning_rate", lr)

    # Create optimizer, given specified parameters.
    if isinstance(optimizer, six.string_types):
      if lr is None:
        raise ValueError("Learning rate is None, but should be specified if "
                         "optimizer is string (%s)." % optimizer)
      if optimizer not in OPTIMIZER_CLS_NAMES:
        raise ValueError(
            "Optimizer name should be one of [%s], you provided %s." %
            (", ".join(OPTIMIZER_CLS_NAMES), optimizer))
      opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr, **optimizer_params)
    elif (isinstance(optimizer, type) and
          issubclass(optimizer, optimizer_.Optimizer)):
      if lr is None:
        raise ValueError("Learning rate is None, but should be specified if "
                         "optimizer is class (%s)." % optimizer)
      opt = optimizer(learning_rate=lr, **optimizer_params)
    elif isinstance(optimizer, optimizer_.Optimizer):
      opt = optimizer
    elif callable(optimizer):
      if lr is not None:
        opt = optimizer(lr, **optimizer_params)
      else:
        opt = optimizer(**optimizer_params)
      if not isinstance(opt, optimizer_.Optimizer):
        raise ValueError("Unrecognized optimizer: function should return "
                         "subclass of Optimizer. Got %s." % str(opt))
    else:
      raise ValueError("Unrecognized optimizer: should be string, "
                       "subclass of Optimizer, instance of "
                       "subclass of Optimizer or function with one argument. "
                       "Got %s." % str(optimizer))
    # All trainable variables, if specific variables are not specified.
    if variables is None:
      variables = vars_.trainable_variables()

    if automatic_loss_scaling is not None:
      if automatic_loss_scaling not in AutomaticLossScaler.SUPPORTED_ALGOS:
        raise ValueError("Unknown automatic loss scaling algorithm: %s."
                         % automatic_loss_scaling)
      if dtype != "mixed":
        raise ValueError("Automatic loss scaling can be used only with "
                         "dtype=mixed.")
      loss_scale = AutomaticLossScaler(algorithm=automatic_loss_scaling)

    if dtype == 'mixed':
      opt = MixedPrecisionOptimizerWrapper(opt, loss_scale=loss_scale)
    if on_horovod:
      opt = DistributedOptimizer(opt)

    # Compute gradients.
    gradients = opt.compute_gradients(
      loss, variables,
      colocate_gradients_with_ops=colocate_gradients_with_ops,
    )

    # Optionally add gradient noise.
    if gradient_noise_scale is not None:
      gradients = _add_scaled_noise_to_gradients(gradients,
                                                 gradient_noise_scale)

    # Multiply some gradients.
    if gradient_multipliers is not None:
      gradients = _multiply_gradients(gradients, gradient_multipliers)
      if not gradients:
        raise ValueError(
            "Empty list of (gradient, var) pairs encountered. This is most "
            "likely to be caused by an improper value of gradient_multipliers.")

    if "global_gradient_norm" in summaries or "gradient_norm" in summaries:
      summary.scalar(
        "global_norm/gradient_norm",
        clip_ops.global_norm(list(map(
          lambda x: tf.cast(x, tf.float32),
          list(zip(*gradients))[0])
        )),
      )

    # Optionally clip gradients by global norm.
    if clip_gradients is not None and larc_params is not None:
      raise AttributeError(
        "LARC and gradient norm clipping should not be used together"
      )
    if isinstance(clip_gradients, float):
      gradients = _clip_gradients_by_norm(gradients, clip_gradients)
    elif callable(clip_gradients):
      gradients = clip_gradients(gradients)
    elif clip_gradients is not None:
      raise ValueError(
          "Unknown type %s for clip_gradients" % type(clip_gradients))

    # Add histograms for variables, gradients and gradient norms.
    for gradient, variable in gradients:
      if isinstance(gradient, ops.IndexedSlices):
        grad_values = gradient.values
      else:
        grad_values = gradient

      if isinstance(variable, ops.IndexedSlices):
        var_values = variable.values
      else:
        var_values = variable

      if grad_values is not None:
        var_name = variable.name.replace(":", "_")
        if "gradients" in summaries:
          summary.histogram("gradients/%s" % var_name, mask_nans(grad_values))
        if "gradient_norm" in summaries:
          summary.scalar("gradient_norm/%s" % var_name,
                         clip_ops.global_norm([grad_values]))
        if "variables" in summaries:
          summary.histogram("variables/%s" % var_name, var_values)
        if "variable_norm" in summaries:
          summary.scalar("variable_norm/%s" % var_name,
                         clip_ops.global_norm([var_values]))

    if clip_gradients is not None and ("global_gradient_norm" in summaries or
                                       "gradient_norm" in summaries):
      summary.scalar(
        "global_norm/clipped_gradient_norm",
        clip_ops.global_norm(list(map(
          lambda x: tf.cast(x, tf.float32),
          list(zip(*gradients))[0])
        )),
      )

    # LARC gradient re-scaling
    if larc_params is not None:
      check_params(
        config=larc_params,
        required_dict={'larc_eta': float},
        optional_dict={
          'larc_mode': ['clip', 'scale'],
          'min_update': float,
          'epsilon': float
        },
      )
      larc_eta = larc_params['larc_eta']
      larc_mode = larc_params.get('larc_mode', 'clip')
      min_update = larc_params.get('min_update', 1e-7)
      eps = larc_params.get('epsilon', 1e-7)

      for idx, (g, v) in enumerate(gradients):
        var_dtype = v.dtype
        v_norm = tf.norm(tensor=tf.cast(v, tf.float32), ord=2)
        g_norm = tf.norm(tensor=tf.cast(g, tf.float32), ord=2)

        if larc_mode == 'clip':
          larc_grad_update = tf.maximum(
            larc_eta * v_norm / (lr * (g_norm + eps)),
            min_update,
          )
          if "larc_summaries" in summaries:
            summary.scalar('larc_clip_on/{}'.format(v.name),
                           tf.cast(tf.less(larc_grad_update, 1.0), tf.int32))
          larc_grad_update = tf.minimum(larc_grad_update, 1.0)
        else:
          larc_grad_update = tf.maximum(
            larc_eta * v_norm / (g_norm + eps),
            min_update,
          )
        larc_grad_update = tf.saturate_cast(larc_grad_update, var_dtype)
        gradients[idx] = (larc_grad_update * g, v)

        # adding additional summary
        if "larc_summaries" in summaries:
          summary.scalar('larc_grad_update/{}'.format(v.name), larc_grad_update)
          summary.scalar("larc_final_lr/{}".format(v.name),
                         tf.cast(lr, var_dtype) * larc_grad_update)

    # Create gradient updates.
    grad_updates = opt.apply_gradients(
        gradients,
        global_step=global_step if increment_global_step else None,
        name="train")

    # Ensure the train_tensor computes grad_updates.
    train_tensor = control_flow_ops.with_dependencies([grad_updates], loss)

    return train_tensor
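
Note (not part of the original example): the `clip_gradients` argument in the snippet above may also be a callable that receives the list of `(gradient, variable)` pairs and returns it in the same form, as the line `gradients = clip_gradients(gradients)` implies. Below is a minimal sketch of such a callable; the name `clip_each_by_norm` and the `max_norm` default are illustrative assumptions, not part of the library.

# Hedged sketch of a custom clip_gradients callable. Only the
# (gradient, variable) list-in / list-out interface is taken from the code
# above; the function name and max_norm default are illustrative.
import tensorflow as tf

def clip_each_by_norm(grads_and_vars, max_norm=1.0):
  """Clips every non-None gradient to `max_norm` individually and returns
  the (gradient, variable) pairs in the same order."""
  clipped = []
  for grad, var in grads_and_vars:
    if grad is None:
      clipped.append((grad, var))
    elif isinstance(grad, tf.IndexedSlices):
      clipped.append((tf.IndexedSlices(tf.clip_by_norm(grad.values, max_norm),
                                       grad.indices, grad.dense_shape), var))
    else:
      clipped.append((tf.clip_by_norm(grad, max_norm), var))
  return clipped

# Could then be passed as optimize_loss(..., clip_gradients=clip_each_by_norm).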
Example no. 13
0
def optimize_loss(loss,
                  optimizer,
                  optimizer_params,
                  learning_rate_decay_fn,
                  global_step=None,
                  dtype=tf.float32,
                  gradient_noise_scale=None,
                  gradient_multipliers=None,
                  clip_gradients=None,
                  update_ops=None,
                  variables=None,
                  name=None,
                  summaries=None,
                  colocate_gradients_with_ops=False,
                  increment_global_step=True,
                  larc_params=None,
                  loss_scale=1.0,
                  automatic_loss_scaling=None,
                  on_horovod=False):
    """Given loss and parameters for optimizer, returns a training op.

  Various ways of passing optimizers include:

  - by string specifying the name of the optimizer. See OPTIMIZER_CLS_NAMES
      for full list. E.g. `optimize_loss(..., optimizer='Adam')`.
  - by function taking learning rate `Tensor` as argument and returning an
      `Optimizer` instance. E.g. `optimize_loss(...,
      optimizer=lambda lr: tf.train.MomentumOptimizer(lr, momentum=0.5))`.
    Alternatively, if `learning_rate` is `None`, the function takes no
    arguments. E.g. `optimize_loss(..., learning_rate=None,
      optimizer=lambda: tf.train.MomentumOptimizer(0.5, momentum=0.5))`.
  - by a subclass of `Optimizer` having a single-argument constructor
      (the argument is the learning rate), such as AdamOptimizer or
      AdagradOptimizer. E.g. `optimize_loss(...,
      optimizer=tf.train.AdagradOptimizer)`.
  - by an instance of a subclass of `Optimizer`.
      E.g., `optimize_loss(..., optimizer=tf.train.AdagradOptimizer(0.5))`.

  Args:
    loss: Scalar `Tensor`.
    global_step: Scalar int `Tensor`, step counter to update on each step
                 unless `increment_global_step` is `False`. If not supplied,
                 it will be fetched from the default graph (see
                 `tf.train.get_global_step` for details). If it has
                 not been created, no step will be incremented with each weight
                 update. `learning_rate_decay_fn` requires `global_step`.
    learning_rate: float or `Tensor`, magnitude of update per each training
                   step. Can be `None`.
    optimizer: string, class or optimizer instance, used as trainer.
               string should be name of optimizer, like 'SGD',
                 'Adam', 'Adagrad'. Full list in OPTIMIZER_CLS_NAMES constant.
               class should be sub-class of `tf.Optimizer` that implements
                 `compute_gradients` and `apply_gradients` functions.
               optimizer instance should be instantiation of `tf.Optimizer`
                 sub-class and have `compute_gradients` and `apply_gradients`
                 functions.
    gradient_noise_scale: float or None, adds 0-mean normal noise scaled by this
                          value.
    gradient_multipliers: dict of variables or variable names to floats.
                          If present, gradients for specified
                          variables will be multiplied by given constant.
    clip_gradients: float, callable or `None`. If a float is provided, global
      norm clipping is applied to prevent the gradient norm from exceeding
      this value. Alternatively, a callable can be provided, e.g. adaptive
      clipping. This callable takes a `list` of `(gradient, variable)` `tuple`s
      and returns the same list with the gradients modified.
    learning_rate_decay_fn: function that takes the `global_step` `Tensor` and
                            returns the learning rate `Tensor`.
                            Can be used to implement any learning rate decay
                            function, for example a wrapper around
                            `tf.train.exponential_decay`.
    update_ops: list of update `Operation`s to execute at each step. If `None`,
                uses elements of UPDATE_OPS collection. The order of execution
                between `update_ops` and `loss` is non-deterministic.
    variables: list of variables to optimize or
               `None` to use all trainable variables.
    name: The name for this operation is used to scope operations and summaries.
    summaries: List of internal quantities to visualize on tensorboard. If not
               set only the loss and the learning rate will be reported. The
               complete list is in OPTIMIZER_SUMMARIES.
    colocate_gradients_with_ops: If True, try colocating gradients with the
                                 corresponding op.
    increment_global_step: Whether to increment `global_step`. If your model
      calls `optimize_loss` multiple times per training step (e.g. to optimize
      different parts of the model), use this arg to avoid incrementing
      `global_step` more times than necessary.
    larc_params: dict of LARC re-scaling parameters
                 (https://arxiv.org/pdf/1708.03888.pdf) or `None` to disable
                 LARC. Required key: 'larc_eta' (float); optional keys:
                 'larc_mode' ('clip' or 'scale'), 'min_update', 'epsilon'.
    automatic_loss_scaling: if not None, use the corresponding automatic
                            loss scaling algorithm. Must be one of 'Backoff'
                            or 'LogMax'. `dtype` must be "mixed" to use ALS.
  Returns:
    Training op.

  Raises:
    ValueError: if:
        * `loss` is an invalid type or shape.
        * `global_step` is an invalid type or shape.
        * `learning_rate` is an invalid type or value.
        * `optimizer` has the wrong type.
        * `clip_gradients` is neither float nor callable.
        * `learning_rate` and `learning_rate_decay_fn` are supplied, but no
          `global_step` is available.
        * `gradients` is empty.
  """
    loss = ops.convert_to_tensor(loss)
    contrib_framework.assert_scalar(loss)
    if global_step is None:
        global_step = tf.train.get_or_create_global_step()
    else:
        tf.train.assert_global_step(global_step)
    with vs.variable_scope(name, "OptimizeLoss", [loss, global_step]):
        # Update ops take UPDATE_OPS collection if not provided.
        if update_ops is None:
            update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
        # Make sure update ops are run before computing loss.
        if update_ops:
            loss = control_flow_ops.with_dependencies(list(update_ops), loss)

        if summaries is None:
            summaries = ["learning_rate", "global_gradient_norm"]
        else:
            for summ in summaries:
                if summ not in OPTIMIZER_SUMMARIES:
                    raise ValueError(
                        "Summaries should be one of [%s], you provided %s." %
                        (", ".join(OPTIMIZER_SUMMARIES), summ))
        if global_step is None:
            raise ValueError(
                "global_step is required for learning_rate_decay_fn.")
        lr = learning_rate_decay_fn(global_step)

        if "learning_rate" in summaries:
            summary.scalar("learning_rate", lr)

        # Create optimizer, given specified parameters.
        if isinstance(optimizer, six.string_types):
            if lr is None:
                raise ValueError(
                    "Learning rate is None, but should be specified if "
                    "optimizer is string (%s)." % optimizer)
            if optimizer not in OPTIMIZER_CLS_NAMES:
                raise ValueError(
                    "Optimizer name should be one of [%s], you provided %s." %
                    (", ".join(OPTIMIZER_CLS_NAMES), optimizer))
            opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr,
                                                 **optimizer_params)
        elif (isinstance(optimizer, type)
              and issubclass(optimizer, optimizer_.Optimizer)):
            if lr is None:
                raise ValueError(
                    "Learning rate is None, but should be specified if "
                    "optimizer is class (%s)." % optimizer)
            opt = optimizer(learning_rate=lr, **optimizer_params)
        elif isinstance(optimizer, optimizer_.Optimizer):
            opt = optimizer
        elif callable(optimizer):
            if lr is not None:
                opt = optimizer(lr, **optimizer_params)
            else:
                opt = optimizer(**optimizer_params)
            if not isinstance(opt, optimizer_.Optimizer):
                raise ValueError(
                    "Unrecognized optimizer: function should return "
                    "subclass of Optimizer. Got %s." % str(opt))
        else:
            raise ValueError(
                "Unrecognized optimizer: should be string, "
                "subclass of Optimizer, instance of "
                "subclass of Optimizer or function with one argument. "
                "Got %s." % str(optimizer))
        # All trainable variables, if specific variables are not specified.
        if variables is None:
            variables = vars_.trainable_variables()

        if automatic_loss_scaling is not None:
            if automatic_loss_scaling not in AutomaticLossScaler.SUPPORTED_ALGOS:
                raise ValueError(
                    "Unknown automatic loss scaling algorithm: %s." %
                    automatic_loss_scaling)
            if dtype != "mixed":
                raise ValueError(
                    "Automatic loss scaling can be used only with "
                    "dtype=mixed.")
            loss_scaler = AutomaticLossScaler(algorithm=automatic_loss_scaling)
        else:
            loss_scaler = None

        if dtype == 'mixed':
            opt = MixedPrecisionOptimizerWrapper(
                opt,
                automatic_loss_scaler=loss_scaler,
            )
        if on_horovod:
            opt = DistributedOptimizer(opt)

        # Compute gradients.
        gradients = opt.compute_gradients(
            loss if loss_scale == 1.0 else loss * loss_scale,
            variables,
            colocate_gradients_with_ops=colocate_gradients_with_ops)

        if loss_scale != 1.0:
            gradients = _multiply_gradients_const(gradients, 1.0 / loss_scale)

        # Optionally add gradient noise.
        if gradient_noise_scale is not None:
            gradients = _add_scaled_noise_to_gradients(gradients,
                                                       gradient_noise_scale)

        # Multiply some gradients.
        if gradient_multipliers is not None:
            gradients = _multiply_gradients(gradients, gradient_multipliers)
            if not gradients:
                raise ValueError(
                    "Empty list of (gradient, var) pairs encountered. This is most "
                    "likely to be caused by an improper value of gradient_multipliers."
                )

        if "global_gradient_norm" in summaries or "gradient_norm" in summaries:
            summary.scalar(
                "global_norm/gradient_norm",
                clip_ops.global_norm(
                    list(
                        map(lambda x: tf.cast(x, tf.float32),
                            list(zip(*gradients))[0]))),
            )

        # Optionally clip gradients by global norm.
        if clip_gradients is not None and larc_params is not None:
            raise AttributeError(
                "LARC and gradient norm clipping should not be used together")
        if isinstance(clip_gradients, float):
            gradients = _clip_gradients_by_norm(gradients, clip_gradients)
        elif callable(clip_gradients):
            gradients = clip_gradients(gradients)
        elif clip_gradients is not None:
            raise ValueError("Unknown type %s for clip_gradients" %
                             type(clip_gradients))

        # Add histograms for variables, gradients and gradient norms.
        for gradient, variable in gradients:
            if isinstance(gradient, ops.IndexedSlices):
                grad_values = gradient.values
            else:
                grad_values = gradient

            if isinstance(variable, ops.IndexedSlices):
                var_values = variable.values
            else:
                var_values = variable

            if grad_values is not None:
                var_name = variable.name.replace(":", "_")
                if "gradients" in summaries:
                    summary.histogram("gradients/%s" % var_name,
                                      mask_nans(grad_values))
                if "gradient_norm" in summaries:
                    summary.scalar("gradient_norm/%s" % var_name,
                                   clip_ops.global_norm([grad_values]))
                if "variables" in summaries:
                    summary.histogram("variables/%s" % var_name, var_values)
                if "variable_norm" in summaries:
                    summary.scalar("variable_norm/%s" % var_name,
                                   clip_ops.global_norm([var_values]))

        if clip_gradients is not None and ("global_gradient_norm" in summaries
                                           or "gradient_norm" in summaries):
            summary.scalar(
                "global_norm/clipped_gradient_norm",
                clip_ops.global_norm(
                    list(
                        map(lambda x: tf.cast(x, tf.float32),
                            list(zip(*gradients))[0]))),
            )

        # LARC gradient re-scaling
        if larc_params is not None:
            check_params(
                config=larc_params,
                required_dict={'larc_eta': float},
                optional_dict={
                    'larc_mode': ['clip', 'scale'],
                    'min_update': float,
                    'epsilon': float
                },
            )
            larc_eta = larc_params['larc_eta']
            larc_mode = larc_params.get('larc_mode', 'clip')
            min_update = larc_params.get('min_update', 1e-7)
            eps = larc_params.get('epsilon', 1e-7)

            for idx, (g, v) in enumerate(gradients):
                var_dtype = v.dtype
                v_norm = tf.norm(tensor=tf.cast(v, tf.float32), ord=2)
                g_norm = tf.norm(tensor=tf.cast(g, tf.float32), ord=2)

                if larc_mode == 'clip':
                    larc_grad_update = tf.maximum(
                        larc_eta * v_norm / (lr * (g_norm + eps)),
                        min_update,
                    )
                    if "larc_summaries" in summaries:
                        summary.scalar(
                            'larc_clip_on/{}'.format(v.name),
                            tf.cast(tf.less(larc_grad_update, 1.0), tf.int32))
                    larc_grad_update = tf.minimum(larc_grad_update, 1.0)
                else:
                    larc_grad_update = tf.maximum(
                        larc_eta * v_norm / (g_norm + eps),
                        min_update,
                    )
                larc_grad_update = tf.saturate_cast(larc_grad_update,
                                                    var_dtype)
                gradients[idx] = (larc_grad_update * g, v)

                # adding additional summary
                if "larc_summaries" in summaries:
                    summary.scalar('larc_grad_update/{}'.format(v.name),
                                   larc_grad_update)
                    summary.scalar("larc_final_lr/{}".format(v.name),
                                   tf.cast(lr, var_dtype) * larc_grad_update)

        # Create gradient updates.
        grad_updates = opt.apply_gradients(
            gradients,
            global_step=global_step if increment_global_step else None,
            name="train")

        # Ensure the train_tensor computes grad_updates.
        train_tensor = control_flow_ops.with_dependencies([grad_updates], loss)

        return train_tensor
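
Usage note (not part of the original example): a minimal sketch of how this `optimize_loss` variant might be called, assuming a scalar loss tensor `my_loss` already exists in the default graph. The names `my_loss` and `lr_decay_fn`, the choice of optimizer string, and all numeric values are illustrative assumptions; only the argument names come from the signature above.

# Hedged usage sketch. `my_loss`, `lr_decay_fn` and the parameter values are
# assumptions for illustration; 'Adam' is listed in the docstring as a valid
# key of OPTIMIZER_CLS_NAMES.
import tensorflow as tf

def lr_decay_fn(global_step):
  # Any callable mapping global_step to a learning-rate tensor is accepted.
  return tf.train.exponential_decay(learning_rate=1e-3,
                                    global_step=global_step,
                                    decay_steps=10000,
                                    decay_rate=0.5,
                                    staircase=True)

train_op = optimize_loss(
    loss=my_loss,                        # scalar tf.Tensor, assumed to exist
    optimizer="Adam",
    optimizer_params={"epsilon": 1e-08},
    learning_rate_decay_fn=lr_decay_fn,
    larc_params={"larc_eta": 0.002},     # 'larc_eta' is the only required key
    summaries=["learning_rate", "global_gradient_norm"],
)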