def __init__(self, params, model, name="loss"):
    """Loss constructor.

    Loss constructors should not modify the TensorFlow graph; all graph
    construction should happen in the
    :meth:`self._compute_loss() <_compute_loss>` method.

    Args:
        params (dict): parameters describing the loss.
            All supported parameters are listed in
            :meth:`get_required_params`, :meth:`get_optional_params`
            functions. The dict is validated and deep-copied; the caller's
            object is not modified.
        model (instance of a class derived from
            :class:`Model<models.model.Model>`): parent model that created
            this loss. Could be None if no model access is required for
            the use case.
        name (str): name for loss variable scope.

    Config parameters:

    * **dtype** --- data dtype. Could be either ``tf.float16`` or
      ``tf.float32``. When omitted, the parent model's
      ``get_tf_dtype()`` is used, or ``tf.float32`` without a model.
    """
    check_params(
        params,
        self.get_required_params(),
        self.get_optional_params(),
    )
    self._params = copy.deepcopy(params)
    self._model = model

    # Fill in the dtype only when the config did not pin one explicitly.
    if 'dtype' not in self._params:
        self._params['dtype'] = (
            self._model.get_tf_dtype() if self._model else tf.float32
        )

    self._name = name
def __init__(self, params, model, name="loss"):
    """Loss constructor.

    The TensorFlow graph must not be touched here; graph construction
    belongs in the :meth:`self._compute_loss() <_compute_loss>` method.

    Args:
        params (dict): parameters describing the loss.
            All supported parameters are listed in
            :meth:`get_required_params`, :meth:`get_optional_params`
            functions.
        model (instance of a class derived from
            :class:`Model<models.model.Model>`): parent model that created
            this loss. Could be None if no model access is required for
            the use case.
        name (str): name for loss variable scope.

    Config parameters:

    * **dtype** --- data dtype. Could be either ``tf.float16`` or
      ``tf.float32``.
    """
    required = self.get_required_params()
    optional = self.get_optional_params()
    check_params(params, required, optional)

    # Work on a private copy so later defaults never leak into the caller's
    # config dictionary.
    self._params = copy.deepcopy(params)
    self._model = model

    dtype_missing = 'dtype' not in self._params
    if dtype_missing and self._model:
        # Inherit the dtype from the parent model.
        self._params['dtype'] = self._model.get_tf_dtype()
    elif dtype_missing:
        self._params['dtype'] = tf.float32

    self._name = name
def __init__(self, params, model, name="encoder", mode='train'):
    """Encoder constructor.

    Note that encoder constructors should not modify TensorFlow graph, all
    graph construction should happen in the
    :meth:`self._encode() <_encode>` method.

    Args:
        params (dict): parameters describing the encoder.
            All supported parameters are listed in
            :meth:`get_required_params`, :meth:`get_optional_params`
            functions.
        model (instance of a class derived from
            :class:`Model<models.model.Model>`): parent model that created
            this encoder. Could be None if no model access is required for
            the use case.
        name (str): name for encoder variable scope.
        mode (str): mode encoder is going to be run in.
            Could be "train", "eval" or "infer".

    Config parameters:

    * **initializer** --- any valid TensorFlow initializer. If no
      initializer is provided, model initializer will be used.
    * **initializer_params** (dict) --- dictionary that will be passed to
      initializer ``__init__`` method.
    * **regularizer** --- any valid TensorFlow regularizer. If no
      regularizer is provided, model regularizer will be used.
    * **regularizer_params** (dict) --- dictionary that will be passed to
      regularizer ``__init__`` method.
    * **dtype** --- model dtype. Could be either ``tf.float16``,
      ``tf.float32`` or "mixed". For details see
      :ref:`mixed precision training <mixed_precision>` section in docs.
      If no dtype is provided, model dtype will be used.
    """
    check_params(
        params,
        self.get_required_params(),
        self.get_optional_params(),
    )
    self._params = copy.deepcopy(params)
    self._model = model

    if 'dtype' not in self._params:
        if self._model:
            self._params['dtype'] = self._model.params['dtype']
        else:
            self._params['dtype'] = tf.float32

    if 'regularizer' not in self._params:
        if self._model and 'regularizer' in self._model.params:
            self._params['regularizer'] = self._model.params['regularizer']
            # "regularizer_params" is optional on the model, so fall back to
            # an empty dict instead of raising KeyError. The copy keeps this
            # object's params independent of the model's dictionary.
            self._params['regularizer_params'] = dict(
                self._model.params.get('regularizer_params', {})
            )

    if 'regularizer' in self._params:
        # Replace the regularizer class/factory with a constructed instance.
        init_dict = self._params.get('regularizer_params', {})
        self._params['regularizer'] = self._params['regularizer'](**init_dict)
        if self._params['dtype'] == 'mixed':
            self._params['regularizer'] = mp_regularizer_wrapper(
                self._params['regularizer'],
            )

    # "mixed" is only a config-level alias: after the regularizer has been
    # wrapped, the stored dtype becomes tf.float16.
    if self._params['dtype'] == 'mixed':
        self._params['dtype'] = tf.float16

    self._name = name
    self._mode = mode
def __init__(self, params):
    """Read the scaler configuration and create its state variables.

    Args:
        params (dict or None): optional configuration. ``None`` is treated
            as an empty dict. Supported keys (all optional):

            * ``scale_min`` (float) --- defaults to ``1.0``.
            * ``scale_max`` (float) --- defaults to ``2.**14``.
            * ``step_factor`` (float) --- defaults to ``2.0``.
            * ``step_window`` (int) --- defaults to ``2000``.

    Three non-trainable ``tf.Variable`` objects are created: ``iteration``
    (int64, starts at 0), ``last_overflow_iteration`` (int64, starts at -1)
    and ``scale`` (starts at ``scale_max``).
    """
    if params is None:
        params = {}

    allowed_keys = {
        'scale_min': float,
        'scale_max': float,
        'step_factor': float,
        'step_window': int
    }
    check_params(config=params, required_dict={}, optional_dict=allowed_keys)

    # (attribute name, default value) pairs; the config key equals the
    # attribute name.
    defaults = (
        ('scale_min', 1.0),
        ('scale_max', 2.**14),
        ('step_factor', 2.0),
        ('step_window', 2000),
    )
    for attr, fallback in defaults:
        setattr(self, attr, params.get(attr, fallback))

    self.iteration = tf.Variable(
        initial_value=0, trainable=False, dtype=tf.int64)
    self.last_overflow_iteration = tf.Variable(
        initial_value=-1, trainable=False, dtype=tf.int64)
    self.scale = tf.Variable(initial_value=self.scale_max, trainable=False)
def __init__(self, params, model, name="decoder", mode='train'):
    """Decoder constructor.

    Note that decoder constructors should not modify TensorFlow graph, all
    graph construction should happen in the
    :meth:`self._decode() <_decode>` method.

    Args:
        params (dict): parameters describing the decoder.
            All supported parameters are listed in
            :meth:`get_required_params`, :meth:`get_optional_params`
            functions.
        model (instance of a class derived from
            :class:`Model<models.model.Model>`): parent model that created
            this decoder. Could be None if no model access is required for
            the use case.
        name (str): name for decoder variable scope.
        mode (str): mode decoder is going to be run in.
            Could be "train", "eval" or "infer".

    Config parameters:

    * **initializer** --- any valid TensorFlow initializer. If no
      initializer is provided, model initializer will be used.
    * **initializer_params** (dict) --- dictionary that will be passed to
      initializer ``__init__`` method.
    * **regularizer** --- any valid TensorFlow regularizer. If no
      regularizer is provided, model regularizer will be used.
    * **regularizer_params** (dict) --- dictionary that will be passed to
      regularizer ``__init__`` method.
    * **dtype** --- model dtype. Could be either ``tf.float16``,
      ``tf.float32`` or "mixed". For details see
      :ref:`mixed precision training <mixed_precision>` section in docs.
      If no dtype is provided, model dtype will be used.
    """
    check_params(
        params,
        self.get_required_params(),
        self.get_optional_params(),
    )
    self._params = copy.deepcopy(params)
    self._model = model

    if 'dtype' not in self._params:
        if self._model:
            self._params['dtype'] = self._model.params['dtype']
        else:
            self._params['dtype'] = tf.float32

    if 'regularizer' not in self._params:
        if self._model and 'regularizer' in self._model.params:
            self._params['regularizer'] = self._model.params['regularizer']
            # "regularizer_params" is optional on the model, so fall back to
            # an empty dict instead of raising KeyError. The copy keeps this
            # object's params independent of the model's dictionary.
            self._params['regularizer_params'] = dict(
                self._model.params.get('regularizer_params', {})
            )

    if 'regularizer' in self._params:
        # Replace the regularizer class/factory with a constructed instance.
        init_dict = self._params.get('regularizer_params', {})
        self._params['regularizer'] = self._params['regularizer'](**init_dict)
        if self._params['dtype'] == 'mixed':
            self._params['regularizer'] = mp_regularizer_wrapper(
                self._params['regularizer'],
            )

    # "mixed" is only a config-level alias: after the regularizer has been
    # wrapped, the stored dtype becomes tf.float16.
    if self._params['dtype'] == 'mixed':
        self._params['dtype'] = tf.float16

    self._name = name
    self._mode = mode
def __init__(self, params, model, num_workers=None, worker_id=None):
    """Data layer constructor.

    The TensorFlow graph should not be created here, but rather in the
    :meth:`self.build_graph() <build_graph>` method.

    Args:
        params (dict): parameters describing the data layer.
            All supported parameters are listed in
            :meth:`get_required_params`, :meth:`get_optional_params`
            functions.
        model (instance of a class derived from
            :class:`Model<models.model.Model>`): parent model that created
            this data layer. Could be None if no model access is required
            for the use case.
        num_workers (int): number of Horovod processes or None if Horovod
            is not used.
        worker_id (int): Horovod process id or None if Horovod is not used.

    Raises:
        ValueError: if ``use_targets`` is False while ``shuffle`` is truthy.

    Config parameters:

    * **shuffle** (bool) --- whether to shuffle dataset after an epoch.
      Typically will be True for train and False for inference and
      evaluation. Defaults to the truthiness of ``use_targets``.
    * **dtype** --- data dtype. Could be either ``tf.float16`` or
      ``tf.float32``.
    * **use_targets** --- defaults to True when not provided.
    """
    check_params(
        params,
        self.get_required_params(),
        self.get_optional_params(),
    )
    config = copy.deepcopy(params)
    self._params = config
    self._model = model

    if 'dtype' not in config:
        config['dtype'] = (
            self._model.get_tf_dtype() if self._model else tf.float32
        )

    # Fill in defaults: targets are used unless stated otherwise, and
    # shuffling follows whether targets are used.
    config.setdefault('use_targets', True)
    config.setdefault('shuffle', bool(config['use_targets']))

    targets_disabled = config['use_targets'] is False
    if targets_disabled and config['shuffle']:
        raise ValueError("Shuffle should not be performed in inference mode")

    self._input_tensors = None
    # could be used for correct Horovod processing
    self._num_workers = num_workers
    self._worker_id = worker_id
def __init__(self, params):
    """Read the scaler configuration and create its state variables.

    Args:
        params (dict or None): optional configuration. ``None`` is treated
            as an empty dict. Supported keys (all optional floats):

            * ``scale_min`` --- defaults to ``1.0``.
            * ``scale_max`` --- defaults to ``2.**14``.
            * ``log_max`` --- defaults to ``16.``.
            * ``beta1`` --- defaults to ``0.99``.
            * ``beta2`` --- defaults to ``0.999``.
            * ``overflow_std_dev`` --- defaults to ``3.09``.

    Non-trainable ``tf.Variable`` objects are created for ``iteration``
    (int64, 0), ``scale`` (1.0), ``x_hat``, ``slow_x_hat`` and
    ``xsquared_hat`` (float32, 0), and ``b1_correction`` and
    ``b2_correction`` (float32, 1.0).
    """
    if params is None:
        params = {}

    float_options = (
        'scale_min', 'scale_max', 'log_max',
        'beta1', 'beta2', 'overflow_std_dev',
    )
    check_params(
        config=params,
        required_dict={},
        optional_dict={option: float for option in float_options},
    )

    # (attribute name, default value) pairs; the config key equals the
    # attribute name.
    defaults = (
        ('scale_min', 1.0),
        ('scale_max', 2.**14),
        ('log_max', 16.),
        ('beta1', 0.99),
        ('beta2', 0.999),
        ('overflow_std_dev', 3.09),
    )
    for attr, fallback in defaults:
        setattr(self, attr, params.get(attr, fallback))

    def float_state(start):
        # Non-trainable float32 state variable.
        return tf.Variable(
            initial_value=start, trainable=False, dtype=tf.float32)

    self.iteration = tf.Variable(
        initial_value=0, trainable=False, dtype=tf.int64)
    self.scale = tf.Variable(initial_value=1.0, trainable=False)
    self.x_hat = float_state(0)
    self.slow_x_hat = float_state(0)
    self.xsquared_hat = float_state(0)
    self.b1_correction = float_state(1.)
    self.b2_correction = float_state(1.)
def __init__(self, params, mode="train", hvd=None):
    r"""Model constructor.

    The TensorFlow graph should not be created here, but rather in the
    :meth:`self.compile() <compile>` method.

    Args:
        params (dict): parameters describing the model.
            All supported parameters are listed in
            :meth:`get_required_params`, :meth:`get_optional_params`
            functions.
        mode (string, optional): "train", "eval" or "infer".
            If mode is "train" all parts of the graph will be built
            (model, loss, optimizer).
            If mode is "eval", only model and loss will be built.
            If mode is "infer", only model will be built.
        hvd (optional): if Horovod is used, this should be
            ``horovod.tensorflow`` module.
            If Horovod is not used, it should be None.

    Raises:
        ValueError: if ``iter_size`` > 1 without Horovod; if ``mode`` is
            not one of "train", "infer", "eval"; if both or (in train
            mode) neither of ``max_steps`` / ``num_epochs`` are given; if
            ``print_samples_steps`` is not a multiple of
            ``print_loss_steps``; if neither ``gpu_ids`` nor ``num_gpus``
            is given without Horovod; if epoch-based training is requested
            but the data layer reports no size; or if the resulting number
            of steps per epoch is zero.

    Config parameters:

    * **random_seed** (int) --- random seed to use.
    * **use_horovod** (bool) --- whether to use Horovod for distributed
      execution.
    * **num_gpus** (int) --- number of GPUs to use. This parameter cannot
      be used if ``gpu_ids`` is specified. When ``use_horovod`` is True
      this parameter is ignored.
    * **gpu_ids** (list of ints) --- GPU ids to use. This parameter cannot
      be used if ``num_gpus`` is specified. When ``use_horovod`` is True
      this parameter is ignored.
    * **batch_size_per_gpu** (int) --- batch size to use for each GPU.
    * **num_epochs** (int) --- number of epochs to run training for.
      This parameter cannot be used if ``max_steps`` is specified.
    * **max_steps** (int) --- number of steps to run training for.
      This parameter cannot be used if ``num_epochs`` is specified.
    * **save_summaries_steps** (int or None) --- how often to save
      summaries. Setting it to None disables summaries saving.
    * **print_loss_steps** (int or None) --- how often to print loss during
      training. Setting it to None disables loss printing.
    * **print_samples_steps** (int or None) --- how often to print training
      samples (input sequences, correct answers and model predictions).
      Setting it to None disables samples printing.
    * **print_bench_info_steps** (int or None) --- how often to print
      training benchmarking information (average number of objects
      processed per step). Setting it to None disables intermediate
      benchmarking printing, but the average information across the whole
      training will always be printed after the last iteration.
    * **save_checkpoint_steps** (int or None) --- how often to save model
      checkpoints. Setting it to None disables checkpoint saving.
    * **eval_steps** (int) --- how often to run evaluation during training.
      This parameter is only checked if ``--mode`` argument of ``run.py``
      is "train\_eval". If no evaluation is needed you should use "train"
      mode.
    * **logdir** (string) --- path to the log directory where all
      checkpoints and summaries will be saved.
    * **data_layer** (any class derived from
      :class:`DataLayer <data.data_layer.DataLayer>`) --- data layer class
      to use.
    * **data_layer_params** (dict) --- dictionary with data layer
      configuration. For complete list of possible parameters see the
      corresponding class docs.
    * **optimizer** (string or TensorFlow optimizer class) --- optimizer to
      use for training. Could be either "Adam", "Adagrad", "Ftrl",
      "Momentum", "RMSProp", "SGD" or any valid TensorFlow optimizer class.
    * **optimizer_params** (dict) --- dictionary that will be passed to
      optimizer ``__init__`` method.
    * **initializer** --- any valid TensorFlow initializer.
    * **initializer_params** (dict) --- dictionary that will be passed to
      initializer ``__init__`` method.
    * **regularizer** --- any valid TensorFlow regularizer.
    * **regularizer_params** (dict) --- dictionary that will be passed to
      regularizer ``__init__`` method.
    * **dtype** --- model dtype. Could be either ``tf.float16``,
      ``tf.float32`` or "mixed". For details see
      :ref:`mixed precision training <mixed_precision>` section in docs.
    * **lr_policy** --- any valid learning rate policy function. For
      examples, see :any:`optimizers.lr_policies` module.
    * **lr_policy_params** (dict) --- dictionary containing lr_policy
      parameters.
    * **max_grad_norm** (float) --- maximum value of gradient norm.
      Clipping will be performed if some gradients exceed this value
      (this is checked for each variable independently).
    * **loss_scaling** --- could be float or string. If float, static loss
      scaling is applied. If string, the corresponding automatic loss
      scaling algorithm is used. Must be one of 'Backoff' or 'LogMax'
      (case insensitive). Only used when dtype="mixed". For details see
      :ref:`mixed precision training <mixed_precision>` section in docs.
    * **summaries** (list) --- which summaries to log. Could contain
      "learning_rate", "gradients", "gradient_norm",
      "global_gradient_norm", "variables", "variable_norm".
    * **iter_size** (int) --- use this parameter to emulate large batches.
      The gradients will be accumulated for ``iter_size`` number of steps
      before applying update.
    * **larc_params** --- dictionary with parameters for LARC (or LARS)
      optimization algorithms. Can contain the following parameters:

      * **larc_mode** --- Could be either "scale" (LARS) or "clip" (LARC).
        Note that it works in addition to any other optimization algorithm
        since we treat it as adaptive gradient clipping and learning rate
        adjustment.
      * **larc_eta** (float) --- LARC or LARS scaling parameter.
      * **min_update** (float) --- minimal value of the LARC (LARS) update.
      * **epsilon** (float) --- small number added to gradient norm in
        denominator for numerical stability.
    """
    check_params(
        params,
        self.get_required_params(),
        self.get_optional_params(),
    )
    self._params = copy.deepcopy(params)

    if self._params.get('iter_size', 1) > 1 and hvd is None:
        raise ValueError("iter_size is only supported in Horovod mode")

    # parameter checks
    self._mode = mode
    if self._mode not in ["train", "infer", "eval"]:
        raise ValueError("Mode has to be one of ['train', 'infer', 'eval']")

    if "max_steps" in params and "num_epochs" in params:
        raise ValueError(
            "You can't provide both max_steps and num_epochs. "
            "Please, remove one of them from the config.")
    if mode == "train":
        if "max_steps" not in params and "num_epochs" not in params:
            raise ValueError("For training mode either max_steps or "
                             "num_epochs has to be provided")

    # Every periodic action is disabled (None) unless configured.
    for steps_key in ('print_samples_steps', 'print_loss_steps',
                      'save_checkpoint_steps', 'save_summaries_steps',
                      'print_bench_info_steps'):
        self._params.setdefault(steps_key, None)

    # checking that frequencies of samples and loss are aligned
    s_fr = self._params['print_samples_steps']
    l_fr = self._params['print_loss_steps']
    if s_fr is not None and l_fr is not None and s_fr % l_fr != 0:
        raise ValueError("print_samples_steps has to be a multiple of "
                         "print_loss_steps.")

    self._hvd = hvd
    if self._hvd:
        self._gpu_ids = range(1)
    else:
        if 'gpu_ids' in self._params:
            self._gpu_ids = self._params['gpu_ids']
        elif 'num_gpus' in self._params:
            self._gpu_ids = range(self._params['num_gpus'])
        else:
            raise ValueError('Either "gpu_ids" or "num_gpus" has to '
                             'be specified in the config')

    # setting random seed
    rs = self._params.get('random_seed', int(time.time()))
    if self.on_horovod:
        # offset by the Horovod rank so each worker gets a different seed
        rs += hvd.rank()
    tf.set_random_seed(rs)
    np.random.seed(rs)

    if 'dtype' not in self._params:
        self._params['dtype'] = tf.float32

    dl_params = self._params.get('data_layer_params', {})
    dl_params['batch_size'] = self._params['batch_size_per_gpu']
    dl_params['mode'] = self._mode

    if self.on_horovod:
        self._data_layer = self._params['data_layer'](
            params=dl_params,
            model=self,
            num_workers=self._hvd.size(),
            worker_id=self._hvd.rank(),
        )
    else:
        self._data_layers = []
        for worker_id in range(self.num_gpus):
            self._data_layers.append(self._params['data_layer'](
                params=dl_params,
                model=self,
                num_workers=self.num_gpus,
                worker_id=worker_id,
            ))

    if self._mode == "train":
        if "max_steps" in self._params:
            self._last_step = self._params["max_steps"]
            self._steps_in_epoch = None
        else:
            data_layer = self.get_data_layer()
            size_in_samples = data_layer.get_size_in_samples()
            # The None check has to come before the division: None // int
            # raises TypeError, so checking the quotient could never
            # trigger this error message.
            if size_in_samples is None:
                raise ValueError(
                    'The data_layer is not compatible with '
                    'epoch execution, since it does not provide '
                    'get_size_in_samples() method. Either update the '
                    'data layer or switch to using "max_steps" '
                    'paremeter.')
            # doing a few less steps if data size is not divisible by the
            # batch size
            self._steps_in_epoch = (
                size_in_samples // data_layer.params['batch_size']
            )
            if self.on_horovod:
                self._steps_in_epoch //= self._hvd.size()
            else:
                self._steps_in_epoch //= self.num_gpus
            self._steps_in_epoch //= self._params.get('iter_size', 1)
            if self._steps_in_epoch == 0:
                raise ValueError(
                    "Overall batch size is too big for this dataset.")
            self._last_step = (
                self._params['num_epochs'] * self._steps_in_epoch
            )

    if self.on_horovod:
        self._output = None
    else:
        self._outputs = [None] * self.num_gpus

    self.loss = None
    self.train_op = None
    self.eval_losses = None
    self._num_objects_per_step = None
    self.skip_update_ph = None
def __init__(self, params, mode="train", hvd=None):
    r"""Model constructor.

    The TensorFlow graph should not be created here, but rather in the
    :meth:`self.compile() <compile>` method.

    Args:
        params (dict): parameters describing the model.
            All supported parameters are listed in
            :meth:`get_required_params`, :meth:`get_optional_params`
            functions.
        mode (string, optional): "train", "eval" or "infer".
            If mode is "train" all parts of the graph will be built
            (model, loss, optimizer).
            If mode is "eval", only model and loss will be built.
            If mode is "infer", only model will be built.
        hvd (optional): if Horovod is used, this should be
            ``horovod.tensorflow`` module.
            If Horovod is not used, it should be None.

    Raises:
        ValueError: if ``mode`` is not one of "train", "infer", "eval"; if
            both or (in train mode) neither of ``max_steps`` /
            ``num_epochs`` are given; if ``print_samples_steps`` is not a
            multiple of ``print_loss_steps``; or if neither ``gpu_ids``
            nor ``num_gpus`` is given without Horovod.

    Config parameters:

    * **random_seed** (int) --- random seed to use.
    * **use_horovod** (bool) --- whether to use Horovod for distributed
      execution.
    * **num_gpus** (int) --- number of GPUs to use. This parameter cannot
      be used if ``gpu_ids`` is specified. When ``use_horovod`` is True
      this parameter is ignored.
    * **gpu_ids** (list of ints) --- GPU ids to use. This parameter cannot
      be used if ``num_gpus`` is specified. When ``use_horovod`` is True
      this parameter is ignored.
    * **batch_size_per_gpu** (int) --- batch size to use for each GPU.
    * **num_epochs** (int) --- number of epochs to run training for.
      This parameter cannot be used if ``max_steps`` is specified.
    * **max_steps** (int) --- number of steps to run training for.
      This parameter cannot be used if ``num_epochs`` is specified.
    * **save_summaries_steps** (int or None) --- how often to save
      summaries. Setting it to None disables summaries saving.
    * **print_loss_steps** (int or None) --- how often to print loss during
      training. Setting it to None disables loss printing.
    * **print_samples_steps** (int or None) --- how often to print training
      samples (input sequences, correct answers and model predictions).
      Setting it to None disables samples printing.
    * **save_checkpoint_steps** (int or None) --- how often to save model
      checkpoints. Setting it to None disables checkpoint saving.
    * **eval_steps** (int) --- how often to run evaluation during training.
      This parameter is only checked if ``--mode`` argument of ``run.py``
      is "train\_eval". If no evaluation is needed you should use "train"
      mode.
    * **logdir** (string) --- path to the log directory where all
      checkpoints and summaries will be saved.
    * **data_layer** (any class derived from
      :class:`DataLayer <data.data_layer.DataLayer>`) --- data layer class
      to use.
    * **data_layer_params** (dict) --- dictionary with data layer
      configuration. For complete list of possible parameters see the
      corresponding class docs.
    * **learning_rate** (float) --- initial learning rate for training.
    * **optimizer** (string or TensorFlow optimizer class) --- optimizer to
      use for training. Could be either "Adam", "Adagrad", "Ftrl",
      "Momentum", "RMSProp", "SGD" or any valid TensorFlow optimizer class.
    * **optimizer_params** (dict) --- dictionary that will be passed to
      optimizer ``__init__`` method.
    * **initializer** --- any valid TensorFlow initializer.
    * **initializer_params** (dict) --- dictionary that will be passed to
      initializer ``__init__`` method.
    * **regularizer** --- any valid TensorFlow regularizer.
    * **regularizer_params** (dict) --- dictionary that will be passed to
      regularizer ``__init__`` method.
    * **dtype** --- model dtype. Could be either ``tf.float16``,
      ``tf.float32`` or "mixed". For details see
      :ref:`mixed precision training <mixed_precision>` section in docs.
    * **lr_policy** --- any valid learning rate policy function. For
      examples, see :any:`optimizers.lr_policies` module.
    * **lr_policy_params** (dict) --- dictionary containing lr_policy
      parameters.
    * **max_grad_norm** (float) --- maximum value of gradient norm.
      Clipping will be performed if some gradients exceed this value
      (this is checked for each variable independently).
    * **larc_mode** --- specify this to use LARC or LARS optimization
      algorithms. Could be either "scale" (LARS) or "clip" (LARC).
      You also need to specify ``larc_nu`` to enable LARC or LARS. Note
      that it works in addition to any other optimization algorithm since
      we treat it as adaptive gradient clipping and learning rate
      adjustment.
    * **larc_nu** (float) --- LARC or LARS scaling parameter.
    * **loss_scale** (float) --- static loss scale to use. For details see
      :ref:`mixed precision training <mixed_precision>` section in docs.
    * **automatic_loss_scaling** --- automatic loss scaling mode. Could be
      either None, "Backoff" or "Logmax". For details see
      :ref:`mixed precision training <mixed_precision>` section in docs.
    * **summaries** (list) --- which summaries to log. Could contain
      "learning_rate", "gradients", "gradient_norm",
      "global_gradient_norm", "variables", "variable_norm".
    """
    check_params(
        params,
        self.get_required_params(),
        self.get_optional_params(),
    )
    self._params = copy.deepcopy(params)

    # parameter checks
    self._mode = mode
    if self._mode not in ["train", "infer", "eval"]:
        raise ValueError("Mode has to be one of ['train', 'infer', 'eval']")

    if "max_steps" in params and "num_epochs" in params:
        raise ValueError(
            "You can't provide both max_steps and num_epochs. "
            "Please, remove one of them from the config.")
    if mode == "train":
        if "max_steps" not in params and "num_epochs" not in params:
            raise ValueError("For training mode either max_steps or "
                             "num_epochs has to be provided")

    # Every periodic action is disabled (None) unless configured.
    for steps_key in ('print_samples_steps', 'print_loss_steps',
                      'save_checkpoint_steps', 'save_summaries_steps'):
        self._params.setdefault(steps_key, None)

    # checking that frequencies of samples and loss are aligned
    s_fr = self._params['print_samples_steps']
    l_fr = self._params['print_loss_steps']
    if s_fr is not None and l_fr is not None and s_fr % l_fr != 0:
        raise ValueError("print_samples_steps has to be a multiple of "
                         "print_loss_steps.")

    self._hvd = hvd
    if self._hvd:
        self._gpu_ids = range(1)
    else:
        if 'gpu_ids' in self._params:
            self._gpu_ids = self._params['gpu_ids']
        elif 'num_gpus' in self._params:
            self._gpu_ids = range(self._params['num_gpus'])
        else:
            raise ValueError('Either "gpu_ids" or "num_gpus" has to '
                             'be specified in the config')

    # setting random seed
    rs = self._params.get('random_seed', int(time.time()))
    if self.on_horovod:
        # offset by the Horovod rank so each worker gets a different seed
        rs += hvd.rank()
    tf.set_random_seed(rs)
    np.random.seed(rs)

    if 'dtype' not in self._params:
        self._params['dtype'] = tf.float32

    dl_params = self._params.get('data_layer_params', {})
    dl_params['batch_size'] = self._params['batch_size_per_gpu']
    dl_params['use_targets'] = (
        self._mode == "train" or self._mode == "eval"
    )

    if self.on_horovod:
        self._data_layer = self._params['data_layer'](
            params=dl_params,
            model=self,
            num_workers=self._hvd.size(),
            worker_id=self._hvd.rank(),
        )
    else:
        dl = self._params['data_layer'](params=dl_params, model=self)
        self._data_layer = MultiGPUWrapper(dl, num_gpus=self.num_gpus)

    if self._mode == "train":
        if "max_steps" in self._params:
            self._last_step = self._params["max_steps"]
            self._steps_in_epoch = None
        else:
            # doing a few less steps if data size is not divisible by the
            # batch size
            self._steps_in_epoch = self._data_layer.get_size_in_batches()
            # if on Horovod, there will be hvd.size() independent
            # data_layer copies and thus the total size is hvd.size()
            # times smaller.
            if self.on_horovod:
                self._steps_in_epoch //= self._hvd.size()
            self._last_step = (
                self._params['num_epochs'] * self._steps_in_epoch
            )

    self._outputs = [None] * self.num_gpus
    self.loss = None
    self.train_op = None
def __init__(self, params, mode="train", hvd=None):
    r"""Model constructor.

    The TensorFlow graph should not be created here, but rather in the
    :meth:`self.compile() <compile>` method.

    Args:
        params (dict): parameters describing the model.
            All supported parameters are listed in
            :meth:`get_required_params`, :meth:`get_optional_params`
            functions.
        mode (string, optional): "train", "eval" or "infer".
            If mode is "train" all parts of the graph will be built
            (model, loss, optimizer).
            If mode is "eval", only model and loss will be built.
            If mode is "infer", only model will be built.
        hvd (optional): if Horovod is used, this should be
            ``horovod.tensorflow`` module.
            If Horovod is not used, it should be None.

    Raises:
        ValueError: if ``mode`` is not one of "train", "infer", "eval"; if
            both or (in train mode) neither of ``max_steps`` /
            ``num_epochs`` are given; if ``print_samples_steps`` is not a
            multiple of ``print_loss_steps``; if neither ``gpu_ids`` nor
            ``num_gpus`` is given without Horovod; or if epoch-based
            training is requested but the data layer reports no size.

    Config parameters:

    * **random_seed** (int) --- random seed to use.
    * **use_horovod** (bool) --- whether to use Horovod for distributed
      execution.
    * **num_gpus** (int) --- number of GPUs to use. This parameter cannot
      be used if ``gpu_ids`` is specified. When ``use_horovod`` is True
      this parameter is ignored.
    * **gpu_ids** (list of ints) --- GPU ids to use. This parameter cannot
      be used if ``num_gpus`` is specified. When ``use_horovod`` is True
      this parameter is ignored.
    * **batch_size_per_gpu** (int) --- batch size to use for each GPU.
    * **num_epochs** (int) --- number of epochs to run training for.
      This parameter cannot be used if ``max_steps`` is specified.
    * **max_steps** (int) --- number of steps to run training for.
      This parameter cannot be used if ``num_epochs`` is specified.
    * **save_summaries_steps** (int or None) --- how often to save
      summaries. Setting it to None disables summaries saving.
    * **print_loss_steps** (int or None) --- how often to print loss during
      training. Setting it to None disables loss printing.
    * **print_samples_steps** (int or None) --- how often to print training
      samples (input sequences, correct answers and model predictions).
      Setting it to None disables samples printing.
    * **save_checkpoint_steps** (int or None) --- how often to save model
      checkpoints. Setting it to None disables checkpoint saving.
    * **eval_steps** (int) --- how often to run evaluation during training.
      This parameter is only checked if ``--mode`` argument of ``run.py``
      is "train\_eval". If no evaluation is needed you should use "train"
      mode.
    * **logdir** (string) --- path to the log directory where all
      checkpoints and summaries will be saved.
    * **data_layer** (any class derived from
      :class:`DataLayer <data.data_layer.DataLayer>`) --- data layer class
      to use.
    * **data_layer_params** (dict) --- dictionary with data layer
      configuration. For complete list of possible parameters see the
      corresponding class docs.
    * **optimizer** (string or TensorFlow optimizer class) --- optimizer to
      use for training. Could be either "Adam", "Adagrad", "Ftrl",
      "Momentum", "RMSProp", "SGD" or any valid TensorFlow optimizer class.
    * **optimizer_params** (dict) --- dictionary that will be passed to
      optimizer ``__init__`` method.
    * **initializer** --- any valid TensorFlow initializer.
    * **initializer_params** (dict) --- dictionary that will be passed to
      initializer ``__init__`` method.
    * **regularizer** --- any valid TensorFlow regularizer.
    * **regularizer_params** (dict) --- dictionary that will be passed to
      regularizer ``__init__`` method.
    * **dtype** --- model dtype. Could be either ``tf.float16``,
      ``tf.float32`` or "mixed". For details see
      :ref:`mixed precision training <mixed_precision>` section in docs.
    * **lr_policy** --- any valid learning rate policy function. For
      examples, see :any:`optimizers.lr_policies` module.
    * **lr_policy_params** (dict) --- dictionary containing lr_policy
      parameters.
    * **max_grad_norm** (float) --- maximum value of gradient norm.
      Clipping will be performed if some gradients exceed this value
      (this is checked for each variable independently).
    * **loss_scale** (float) --- static loss scale to use. For details see
      :ref:`mixed precision training <mixed_precision>` section in docs.
    * **automatic_loss_scaling** --- automatic loss scaling mode. Could be
      either None, "Backoff" or "Logmax". For details see
      :ref:`mixed precision training <mixed_precision>` section in docs.
    * **summaries** (list) --- which summaries to log. Could contain
      "learning_rate", "gradients", "gradient_norm",
      "global_gradient_norm", "variables", "variable_norm".
    * **larc_params** --- dictionary with parameters for LARC (or LARS)
      optimization algorithms. Can contain the following parameters:

      * **larc_mode** --- Could be either "scale" (LARS) or "clip" (LARC).
        Note that it works in addition to any other optimization algorithm
        since we treat it as adaptive gradient clipping and learning rate
        adjustment.
      * **larc_eta** (float) --- LARC or LARS scaling parameter.
      * **min_update** (float) --- minimal value of the LARC (LARS) update.
      * **epsilon** (float) --- small number added to gradient norm in
        denominator for numerical stability.
    """
    check_params(
        params,
        self.get_required_params(),
        self.get_optional_params(),
    )
    self._params = copy.deepcopy(params)

    # parameter checks
    self._mode = mode
    if self._mode not in ["train", "infer", "eval"]:
        raise ValueError("Mode has to be one of ['train', 'infer', 'eval']")

    if "max_steps" in params and "num_epochs" in params:
        raise ValueError("You can't provide both max_steps and num_epochs. "
                         "Please, remove one of them from the config.")
    if mode == "train":
        if "max_steps" not in params and "num_epochs" not in params:
            raise ValueError("For training mode either max_steps or "
                             "num_epochs has to be provided")

    # Every periodic action is disabled (None) unless configured.
    for steps_key in ('print_samples_steps', 'print_loss_steps',
                      'save_checkpoint_steps', 'save_summaries_steps'):
        self._params.setdefault(steps_key, None)

    # checking that frequencies of samples and loss are aligned
    s_fr = self._params['print_samples_steps']
    l_fr = self._params['print_loss_steps']
    if s_fr is not None and l_fr is not None and s_fr % l_fr != 0:
        raise ValueError("print_samples_steps has to be a multiple of "
                         "print_loss_steps.")

    self._hvd = hvd
    if self._hvd:
        self._gpu_ids = range(1)
    else:
        if 'gpu_ids' in self._params:
            self._gpu_ids = self._params['gpu_ids']
        elif 'num_gpus' in self._params:
            self._gpu_ids = range(self._params['num_gpus'])
        else:
            raise ValueError('Either "gpu_ids" or "num_gpus" has to '
                             'be specified in the config')

    # setting random seed
    rs = self._params.get('random_seed', int(time.time()))
    if self.on_horovod:
        # offset by the Horovod rank so each worker gets a different seed
        rs += hvd.rank()
    tf.set_random_seed(rs)
    np.random.seed(rs)

    if 'dtype' not in self._params:
        self._params['dtype'] = tf.float32

    dl_params = self._params.get('data_layer_params', {})
    dl_params['batch_size'] = self._params['batch_size_per_gpu']
    dl_params['mode'] = self._mode

    if self.on_horovod:
        self._data_layer = self._params['data_layer'](
            params=dl_params,
            model=self,
            num_workers=self._hvd.size(),
            worker_id=self._hvd.rank(),
        )
    else:
        self._data_layers = []
        for worker_id in range(self.num_gpus):
            self._data_layers.append(self._params['data_layer'](
                params=dl_params,
                model=self,
                num_workers=self.num_gpus,
                worker_id=worker_id,
            ))

    if self._mode == "train":
        if "max_steps" in self._params:
            self._last_step = self._params["max_steps"]
            self._steps_in_epoch = None
        else:
            data_layer = self.get_data_layer()
            size_in_samples = data_layer.get_size_in_samples()
            # The None check has to come before the division: None // int
            # raises TypeError, so checking the quotient could never
            # trigger this error message.
            if size_in_samples is None:
                raise ValueError(
                    'The data_layer is not compatible with '
                    'epoch execution, since it does not provide '
                    'get_size_in_samples() method. Either update the '
                    'data layer or switch to using "max_steps" '
                    'paremeter.')
            # doing a few less steps if data size is not divisible by the
            # batch size
            self._steps_in_epoch = (
                size_in_samples // data_layer.params['batch_size']
            )
            if self.on_horovod:
                self._steps_in_epoch //= self._hvd.size()
            else:
                self._steps_in_epoch //= self.num_gpus
            self._last_step = (
                self._params['num_epochs'] * self._steps_in_epoch
            )

    if self.on_horovod:
        self._output = None
    else:
        self._outputs = [None] * self.num_gpus

    self.loss = None
    self.train_op = None
    self.eval_losses = None
def post_process_gradients(grads_and_vars, summaries, lr,
                           clip_gradients, larc_params):
    """Applies post processing to gradients, i.e. clipping, LARC, summaries.

    Args:
        grads_and_vars: list of ``(gradient, variable)`` pairs. A gradient
            may be ``None``; such pairs are skipped by the per-variable
            summaries and are passed through the LARC step unchanged.
        summaries: collection of summary names to emit. The names checked
            here are ``"global_gradient_norm"``, ``"gradients"``,
            ``"gradient_norm"``, ``"variables"``, ``"variable_norm"`` and
            ``"larc_summaries"``.
        lr: learning rate; used by LARC in ``'clip'`` mode and by the
            ``larc_final_lr`` summary.
        clip_gradients: if not None, forwarded together with
            ``grads_and_vars`` to ``_clip_gradients_by_norm``.
        larc_params (dict): if not None, enables LARC re-scaling. Requires
            the float ``'larc_eta'``; optional keys are ``'larc_mode'``
            (``'clip'`` or ``'scale'``, default ``'clip'``),
            ``'min_update'`` (default 1e-7) and ``'epsilon'``
            (default 1e-7).

    Returns:
        list of ``(gradient, variable)`` pairs after the optional clipping
        and LARC re-scaling.
    """
    if "global_gradient_norm" in summaries:
        tf.summary.scalar(
            "global_gradient_norm",
            _global_norm_with_cast(grads_and_vars),
        )

    # Optionally clip gradients by global norm.
    if clip_gradients is not None:
        grads_and_vars = _clip_gradients_by_norm(grads_and_vars,
                                                 clip_gradients)

    # Add histograms for variables, gradients and gradient norms.
    for gradient, variable in grads_and_vars:
        if isinstance(gradient, tf.IndexedSlices):
            grad_values = gradient.values
        else:
            grad_values = gradient

        if isinstance(variable, tf.IndexedSlices):
            var_values = variable.values
        else:
            var_values = variable

        if grad_values is None:
            continue

        # ":" is replaced so the variable name can be used in a summary tag.
        var_name = variable.name.replace(":", "_")
        if "gradients" in summaries:
            # need to mask nans for automatic loss scaling
            tf.summary.histogram("gradients/%s" % var_name,
                                 mask_nans(grad_values))
        if "gradient_norm" in summaries:
            tf.summary.scalar("gradient_norm/%s" % var_name,
                              tf.norm(grad_values))
        if "variables" in summaries:
            tf.summary.histogram("variables/%s" % var_name, var_values)
        if "variable_norm" in summaries:
            tf.summary.scalar("variable_norm/%s" % var_name,
                              tf.norm(var_values))

    if clip_gradients is not None and "global_gradient_norm" in summaries:
        tf.summary.scalar(
            "global_clipped_gradient_norm",
            _global_norm_with_cast(grads_and_vars),
        )

    # LARC gradient re-scaling
    if larc_params is not None:
        check_params(
            config=larc_params,
            required_dict={'larc_eta': float},
            optional_dict={
                'larc_mode': ['clip', 'scale'],
                'min_update': float,
                'epsilon': float
            },
        )
        larc_eta = larc_params['larc_eta']
        larc_mode = larc_params.get('larc_mode', 'clip')
        min_update = larc_params.get('min_update', 1e-7)
        eps = larc_params.get('epsilon', 1e-7)

        grads_and_vars_larc = []
        for g, v in grads_and_vars:
            if g is None:
                # No gradient for this variable: there is nothing to
                # re-scale and tf.cast(None, ...) would raise, so keep the
                # pair untouched.
                grads_and_vars_larc.append((g, v))
                continue

            var_dtype = v.dtype
            # Norms are computed in float32 regardless of variable dtype.
            v_norm = tf.norm(tensor=tf.cast(v, tf.float32), ord=2)
            g_norm = tf.norm(tensor=tf.cast(g, tf.float32), ord=2)

            if larc_mode == 'clip':
                larc_grad_update = tf.maximum(
                    larc_eta * v_norm / (lr * (g_norm + eps)),
                    min_update,
                )
                if "larc_summaries" in summaries:
                    tf.summary.scalar(
                        'larc_clip_on/{}'.format(v.name),
                        tf.cast(tf.less(larc_grad_update, 1.0), tf.int32))
                larc_grad_update = tf.minimum(larc_grad_update, 1.0)
            else:
                larc_grad_update = tf.maximum(
                    larc_eta * v_norm / (g_norm + eps),
                    min_update,
                )
            larc_grad_update = tf.saturate_cast(larc_grad_update, var_dtype)
            grads_and_vars_larc.append((larc_grad_update * g, v))

            # adding additional summary
            if "larc_summaries" in summaries:
                tf.summary.scalar('larc_grad_update/{}'.format(v.name),
                                  larc_grad_update)
                tf.summary.scalar("larc_final_lr/{}".format(v.name),
                                  tf.cast(lr, var_dtype) * larc_grad_update)
        grads_and_vars = grads_and_vars_larc
    return grads_and_vars
def optimize_loss(loss,
                  optimizer,
                  optimizer_params,
                  learning_rate_decay_fn,
                  global_step=None,
                  dtype=tf.float32,
                  gradient_noise_scale=None,
                  gradient_multipliers=None,
                  clip_gradients=None,
                  update_ops=None,
                  variables=None,
                  name=None,
                  summaries=None,
                  colocate_gradients_with_ops=False,
                  increment_global_step=True,
                  larc_params=None,
                  loss_scale=1.0,
                  automatic_loss_scaling=None,
                  on_horovod=False):
    """Given loss and parameters for optimizer, returns a training op.

    Various ways of passing optimizers include:

    - by string specifying the name of the optimizer. See
      OPTIMIZER_CLS_NAMES for full list.
      E.g. `optimize_loss(..., optimizer='Adam')`.
    - by a subclass of `Optimizer`, which is called as
      `optimizer(learning_rate=lr, **optimizer_params)`.
      E.g. `optimize_loss(..., optimizer=tf.train.AdagradOptimizer)`.
    - by an instance of a subclass of `Optimizer`, which is used as is.
      E.g., `optimize_loss(..., optimizer=tf.train.AdagradOptimizer(0.5))`.
    - by any other callable returning an `Optimizer` instance. It is called
      as `optimizer(lr, **optimizer_params)`, or as
      `optimizer(**optimizer_params)` when the learning rate is `None`.

    Args:
        loss: Scalar `Tensor`.
        optimizer: string, class, optimizer instance or callable, see above.
        optimizer_params: dict of extra keyword arguments forwarded to the
            optimizer constructor / factory (not used when `optimizer` is
            already an instance).
        learning_rate_decay_fn: function called with `global_step`; its
            result is used as the learning rate.
        global_step: Scalar int `Tensor`, step counter to update on each
            step unless `increment_global_step` is `False`. If not supplied,
            it is obtained with `tf.train.get_or_create_global_step()`.
        dtype: if equal to `"mixed"`, the optimizer is wrapped into
            `MixedPrecisionOptimizerWrapper`.
        gradient_noise_scale: float or None; if not None, gradients are
            passed through `_add_scaled_noise_to_gradients` with this value.
        gradient_multipliers: if not None, gradients are passed through
            `_multiply_gradients` with this value.
        clip_gradients: float, callable or `None`. A float is forwarded to
            `_clip_gradients_by_norm`; a callable receives the list of
            `(gradient, variable)` pairs and must return the modified list.
            Cannot be combined with `larc_params`.
        update_ops: list of update `Operation`s to execute at each step.
            If `None`, uses elements of the UPDATE_OPS collection. When
            non-empty, they are attached to `loss` as control dependencies.
        variables: list of variables to optimize or `None` to use all
            trainable variables.
        name: The name for this operation is used to scope operations and
            summaries.
        summaries: List of internal quantities to visualize on tensorboard.
            If not set, `["learning_rate", "global_gradient_norm"]` is used.
            Every provided entry must be in OPTIMIZER_SUMMARIES.
        colocate_gradients_with_ops: forwarded to `compute_gradients`.
        increment_global_step: Whether to increment `global_step`. If your
            model calls `optimize_loss` multiple times per training step
            (e.g. to optimize different parts of the model), use this arg to
            avoid incrementing `global_step` more times than necessary.
        larc_params: dict or None. If not None, LARC re-scaling
            (https://arxiv.org/pdf/1708.03888.pdf) is applied. Requires the
            float `'larc_eta'`; optional keys are `'larc_mode'` (`'clip'` or
            `'scale'`, default `'clip'`), `'min_update'` (default 1e-7) and
            `'epsilon'` (default 1e-7).
        loss_scale: value passed as `loss_scale` to
            `MixedPrecisionOptimizerWrapper` when `dtype` is `"mixed"`. It
            is replaced by an `AutomaticLossScaler` when
            `automatic_loss_scaling` is set.
        automatic_loss_scaling: if not None, use the corresponding automatic
            loss scaling algorithm. Must be one of
            `AutomaticLossScaler.SUPPORTED_ALGOS`. `dtype` must be "mixed"
            to use ALS.
        on_horovod: if True, the optimizer is wrapped into
            `DistributedOptimizer`.

    Returns:
        Training op: the loss tensor with a control dependency on the
        gradient update op.

    Raises:
        ValueError: if:
            * an entry of `summaries` is not in OPTIMIZER_SUMMARIES.
            * `optimizer` has the wrong type, an unknown name, or requires
              a learning rate while it is `None`.
            * `automatic_loss_scaling` is unknown or used with a `dtype`
              other than "mixed".
            * `clip_gradients` is neither float nor callable.
            * `gradients` is empty.
        AttributeError: if both `clip_gradients` and `larc_params` are set.
    """
    loss = ops.convert_to_tensor(loss)
    contrib_framework.assert_scalar(loss)
    if global_step is None:
        global_step = tf.train.get_or_create_global_step()
    else:
        tf.train.assert_global_step(global_step)
    with vs.variable_scope(name, "OptimizeLoss", [loss, global_step]):
        # Update ops take UPDATE_OPS collection if not provided.
        if update_ops is None:
            update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
        # Make sure update ops are ran before computing loss.
        if update_ops:
            loss = control_flow_ops.with_dependencies(list(update_ops), loss)

        if summaries is None:
            summaries = ["learning_rate", "global_gradient_norm"]
        else:
            for summ in summaries:
                if summ not in OPTIMIZER_SUMMARIES:
                    raise ValueError(
                        "Summaries should be one of [%s], you provided %s." %
                        (", ".join(OPTIMIZER_SUMMARIES), summ))

        # global_step is always set at this point (created above if needed).
        lr = learning_rate_decay_fn(global_step)
        if "learning_rate" in summaries:
            summary.scalar("learning_rate", lr)

        # Create optimizer, given specified parameters.
        if isinstance(optimizer, six.string_types):
            if lr is None:
                raise ValueError(
                    "Learning rate is None, but should be specified if "
                    "optimizer is string (%s)." % optimizer)
            if optimizer not in OPTIMIZER_CLS_NAMES:
                raise ValueError(
                    "Optimizer name should be one of [%s], you provided %s." %
                    (", ".join(OPTIMIZER_CLS_NAMES), optimizer))
            opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr,
                                                 **optimizer_params)
        elif (isinstance(optimizer, type) and
              issubclass(optimizer, optimizer_.Optimizer)):
            if lr is None:
                raise ValueError(
                    "Learning rate is None, but should be specified if "
                    "optimizer is class (%s)." % optimizer)
            opt = optimizer(learning_rate=lr, **optimizer_params)
        elif isinstance(optimizer, optimizer_.Optimizer):
            opt = optimizer
        elif callable(optimizer):
            if lr is not None:
                opt = optimizer(lr, **optimizer_params)
            else:
                opt = optimizer(**optimizer_params)
            if not isinstance(opt, optimizer_.Optimizer):
                raise ValueError(
                    "Unrecognized optimizer: function should return "
                    "subclass of Optimizer. Got %s." % str(opt))
        else:
            raise ValueError(
                "Unrecognized optimizer: should be string, "
                "subclass of Optimizer, instance of "
                "subclass of Optimizer or function with one argument. "
                "Got %s." % str(optimizer))

        # All trainable variables, if specific variables are not specified.
        if variables is None:
            variables = vars_.trainable_variables()

        if automatic_loss_scaling is not None:
            if automatic_loss_scaling not in AutomaticLossScaler.SUPPORTED_ALGOS:
                raise ValueError(
                    "Unknown automatic loss scaling algorithm: %s." %
                    automatic_loss_scaling)
            if dtype != "mixed":
                raise ValueError(
                    "Automatic loss scaling can be used only with "
                    "dtype=mixed.")
            loss_scale = AutomaticLossScaler(algorithm=automatic_loss_scaling)

        if dtype == 'mixed':
            opt = MixedPrecisionOptimizerWrapper(opt, loss_scale=loss_scale)

        if on_horovod:
            opt = DistributedOptimizer(opt)

        # Compute gradients.
        gradients = opt.compute_gradients(
            loss,
            variables,
            colocate_gradients_with_ops=colocate_gradients_with_ops,
        )

        # Optionally add gradient noise.
        if gradient_noise_scale is not None:
            gradients = _add_scaled_noise_to_gradients(gradients,
                                                       gradient_noise_scale)

        # Multiply some gradients.
        if gradient_multipliers is not None:
            gradients = _multiply_gradients(gradients, gradient_multipliers)
            if not gradients:
                raise ValueError(
                    "Empty list of (gradient, var) pairs encountered. This is most "
                    "likely to be caused by an improper value of gradient_multipliers.")

        report_global_norm = ("global_gradient_norm" in summaries or
                              "gradient_norm" in summaries)
        if report_global_norm:
            # None gradients are skipped: tf.cast cannot convert None.
            summary.scalar(
                "global_norm/gradient_norm",
                clip_ops.global_norm(
                    [tf.cast(g, tf.float32)
                     for g, _ in gradients if g is not None]),
            )

        # Optionally clip gradients by global norm.
        if clip_gradients is not None and larc_params is not None:
            raise AttributeError(
                "LARC and gradient norm clipping should not be used together"
            )
        if isinstance(clip_gradients, float):
            gradients = _clip_gradients_by_norm(gradients, clip_gradients)
        elif callable(clip_gradients):
            gradients = clip_gradients(gradients)
        elif clip_gradients is not None:
            raise ValueError(
                "Unknown type %s for clip_gradients" % type(clip_gradients))

        # Add histograms for variables, gradients and gradient norms.
        for gradient, variable in gradients:
            if isinstance(gradient, ops.IndexedSlices):
                grad_values = gradient.values
            else:
                grad_values = gradient

            if isinstance(variable, ops.IndexedSlices):
                var_values = variable.values
            else:
                var_values = variable

            if grad_values is not None:
                var_name = variable.name.replace(":", "_")
                if "gradients" in summaries:
                    summary.histogram("gradients/%s" % var_name,
                                      mask_nans(grad_values))
                if "gradient_norm" in summaries:
                    summary.scalar("gradient_norm/%s" % var_name,
                                   clip_ops.global_norm([grad_values]))
                if "variables" in summaries:
                    summary.histogram("variables/%s" % var_name, var_values)
                if "variable_norm" in summaries:
                    summary.scalar("variable_norm/%s" % var_name,
                                   clip_ops.global_norm([var_values]))

        if clip_gradients is not None and report_global_norm:
            summary.scalar(
                "global_norm/clipped_gradient_norm",
                clip_ops.global_norm(
                    [tf.cast(g, tf.float32)
                     for g, _ in gradients if g is not None]),
            )

        # LARC gradient re-scaling
        if larc_params is not None:
            check_params(
                config=larc_params,
                required_dict={'larc_eta': float},
                optional_dict={
                    'larc_mode': ['clip', 'scale'],
                    'min_update': float,
                    'epsilon': float
                },
            )
            larc_eta = larc_params['larc_eta']
            larc_mode = larc_params.get('larc_mode', 'clip')
            min_update = larc_params.get('min_update', 1e-7)
            eps = larc_params.get('epsilon', 1e-7)

            for idx, (g, v) in enumerate(gradients):
                if g is None:
                    # Nothing to re-scale; leave the pair untouched.
                    continue
                var_dtype = v.dtype
                v_norm = tf.norm(tensor=tf.cast(v, tf.float32), ord=2)
                g_norm = tf.norm(tensor=tf.cast(g, tf.float32), ord=2)

                if larc_mode == 'clip':
                    larc_grad_update = tf.maximum(
                        larc_eta * v_norm / (lr * (g_norm + eps)),
                        min_update,
                    )
                    if "larc_summaries" in summaries:
                        summary.scalar(
                            'larc_clip_on/{}'.format(v.name),
                            tf.cast(tf.less(larc_grad_update, 1.0), tf.int32))
                    larc_grad_update = tf.minimum(larc_grad_update, 1.0)
                else:
                    larc_grad_update = tf.maximum(
                        larc_eta * v_norm / (g_norm + eps),
                        min_update,
                    )
                larc_grad_update = tf.saturate_cast(larc_grad_update,
                                                    var_dtype)
                gradients[idx] = (larc_grad_update * g, v)

                # adding additional summary
                if "larc_summaries" in summaries:
                    summary.scalar('larc_grad_update/{}'.format(v.name),
                                   larc_grad_update)
                    summary.scalar("larc_final_lr/{}".format(v.name),
                                   tf.cast(lr, var_dtype) * larc_grad_update)

        # Create gradient updates.
        grad_updates = opt.apply_gradients(
            gradients,
            global_step=global_step if increment_global_step else None,
            name="train")

        # Ensure the train_tensor computes grad_updates.
        train_tensor = control_flow_ops.with_dependencies([grad_updates],
                                                          loss)

        return train_tensor
def optimize_loss(loss,
                  optimizer,
                  optimizer_params,
                  learning_rate_decay_fn,
                  global_step=None,
                  dtype=tf.float32,
                  gradient_noise_scale=None,
                  gradient_multipliers=None,
                  clip_gradients=None,
                  update_ops=None,
                  variables=None,
                  name=None,
                  summaries=None,
                  colocate_gradients_with_ops=False,
                  increment_global_step=True,
                  larc_params=None,
                  loss_scale=1.0,
                  automatic_loss_scaling=None,
                  on_horovod=False):
    """Given loss and parameters for optimizer, returns a training op.

    Various ways of passing optimizers include:

    - by string specifying the name of the optimizer. See
      OPTIMIZER_CLS_NAMES for full list.
      E.g. `optimize_loss(..., optimizer='Adam')`.
    - by a subclass of `Optimizer`, which is called as
      `optimizer(learning_rate=lr, **optimizer_params)`.
      E.g. `optimize_loss(..., optimizer=tf.train.AdagradOptimizer)`.
    - by an instance of a subclass of `Optimizer`, which is used as is.
      E.g., `optimize_loss(..., optimizer=tf.train.AdagradOptimizer(0.5))`.
    - by any other callable returning an `Optimizer` instance. It is called
      as `optimizer(lr, **optimizer_params)`, or as
      `optimizer(**optimizer_params)` when the learning rate is `None`.

    Args:
        loss: Scalar `Tensor`.
        optimizer: string, class, optimizer instance or callable, see above.
        optimizer_params: dict of extra keyword arguments forwarded to the
            optimizer constructor / factory (not used when `optimizer` is
            already an instance).
        learning_rate_decay_fn: function called with `global_step`; its
            result is used as the learning rate.
        global_step: Scalar int `Tensor`, step counter to update on each
            step unless `increment_global_step` is `False`. If not supplied,
            it is obtained with `tf.train.get_or_create_global_step()`.
        dtype: if equal to `"mixed"`, the optimizer is wrapped into
            `MixedPrecisionOptimizerWrapper`.
        gradient_noise_scale: float or None; if not None, gradients are
            passed through `_add_scaled_noise_to_gradients` with this value.
        gradient_multipliers: if not None, gradients are passed through
            `_multiply_gradients` with this value.
        clip_gradients: float, callable or `None`. A float is forwarded to
            `_clip_gradients_by_norm`; a callable receives the list of
            `(gradient, variable)` pairs and must return the modified list.
            Cannot be combined with `larc_params`.
        update_ops: list of update `Operation`s to execute at each step.
            If `None`, uses elements of the UPDATE_OPS collection. When
            non-empty, they are attached to `loss` as control dependencies.
        variables: list of variables to optimize or `None` to use all
            trainable variables.
        name: The name for this operation is used to scope operations and
            summaries.
        summaries: List of internal quantities to visualize on tensorboard.
            If not set, `["learning_rate", "global_gradient_norm"]` is used.
            Every provided entry must be in OPTIMIZER_SUMMARIES.
        colocate_gradients_with_ops: forwarded to `compute_gradients`.
        increment_global_step: Whether to increment `global_step`. If your
            model calls `optimize_loss` multiple times per training step
            (e.g. to optimize different parts of the model), use this arg to
            avoid incrementing `global_step` more times than necessary.
        larc_params: dict or None. If not None, LARC re-scaling
            (https://arxiv.org/pdf/1708.03888.pdf) is applied. Requires the
            float `'larc_eta'`; optional keys are `'larc_mode'` (`'clip'` or
            `'scale'`, default `'clip'`), `'min_update'` (default 1e-7) and
            `'epsilon'` (default 1e-7).
        loss_scale: static loss scale. When different from 1.0 the loss is
            multiplied by it before gradients are computed and the
            gradients are multiplied by `1.0 / loss_scale` afterwards.
        automatic_loss_scaling: if not None, use the corresponding automatic
            loss scaling algorithm. Must be one of
            `AutomaticLossScaler.SUPPORTED_ALGOS`. `dtype` must be "mixed"
            to use ALS.
        on_horovod: if True, the optimizer is wrapped into
            `DistributedOptimizer`.

    Returns:
        Training op: the loss tensor with a control dependency on the
        gradient update op.

    Raises:
        ValueError: if:
            * an entry of `summaries` is not in OPTIMIZER_SUMMARIES.
            * `optimizer` has the wrong type, an unknown name, or requires
              a learning rate while it is `None`.
            * `automatic_loss_scaling` is unknown or used with a `dtype`
              other than "mixed".
            * `clip_gradients` is neither float nor callable.
            * `gradients` is empty.
        AttributeError: if both `clip_gradients` and `larc_params` are set.
    """
    loss = ops.convert_to_tensor(loss)
    contrib_framework.assert_scalar(loss)
    if global_step is None:
        global_step = tf.train.get_or_create_global_step()
    else:
        tf.train.assert_global_step(global_step)
    with vs.variable_scope(name, "OptimizeLoss", [loss, global_step]):
        # Update ops take UPDATE_OPS collection if not provided.
        if update_ops is None:
            update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
        # Make sure update ops are ran before computing loss.
        if update_ops:
            loss = control_flow_ops.with_dependencies(list(update_ops), loss)

        if summaries is None:
            summaries = ["learning_rate", "global_gradient_norm"]
        else:
            for summ in summaries:
                if summ not in OPTIMIZER_SUMMARIES:
                    raise ValueError(
                        "Summaries should be one of [%s], you provided %s." %
                        (", ".join(OPTIMIZER_SUMMARIES), summ))

        # global_step is always set at this point (created above if needed).
        lr = learning_rate_decay_fn(global_step)
        if "learning_rate" in summaries:
            summary.scalar("learning_rate", lr)

        # Create optimizer, given specified parameters.
        if isinstance(optimizer, six.string_types):
            if lr is None:
                raise ValueError(
                    "Learning rate is None, but should be specified if "
                    "optimizer is string (%s)." % optimizer)
            if optimizer not in OPTIMIZER_CLS_NAMES:
                raise ValueError(
                    "Optimizer name should be one of [%s], you provided %s." %
                    (", ".join(OPTIMIZER_CLS_NAMES), optimizer))
            opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr,
                                                 **optimizer_params)
        elif (isinstance(optimizer, type) and
              issubclass(optimizer, optimizer_.Optimizer)):
            if lr is None:
                raise ValueError(
                    "Learning rate is None, but should be specified if "
                    "optimizer is class (%s)." % optimizer)
            opt = optimizer(learning_rate=lr, **optimizer_params)
        elif isinstance(optimizer, optimizer_.Optimizer):
            opt = optimizer
        elif callable(optimizer):
            if lr is not None:
                opt = optimizer(lr, **optimizer_params)
            else:
                opt = optimizer(**optimizer_params)
            if not isinstance(opt, optimizer_.Optimizer):
                raise ValueError(
                    "Unrecognized optimizer: function should return "
                    "subclass of Optimizer. Got %s." % str(opt))
        else:
            raise ValueError(
                "Unrecognized optimizer: should be string, "
                "subclass of Optimizer, instance of "
                "subclass of Optimizer or function with one argument. "
                "Got %s." % str(optimizer))

        # All trainable variables, if specific variables are not specified.
        if variables is None:
            variables = vars_.trainable_variables()

        if automatic_loss_scaling is not None:
            if automatic_loss_scaling not in AutomaticLossScaler.SUPPORTED_ALGOS:
                raise ValueError(
                    "Unknown automatic loss scaling algorithm: %s." %
                    automatic_loss_scaling)
            if dtype != "mixed":
                raise ValueError(
                    "Automatic loss scaling can be used only with "
                    "dtype=mixed.")
            loss_scaler = AutomaticLossScaler(algorithm=automatic_loss_scaling)
        else:
            loss_scaler = None

        if dtype == 'mixed':
            opt = MixedPrecisionOptimizerWrapper(
                opt,
                automatic_loss_scaler=loss_scaler,
            )

        if on_horovod:
            opt = DistributedOptimizer(opt)

        # Compute gradients.
        gradients = opt.compute_gradients(
            loss if loss_scale == 1.0 else loss * loss_scale,
            variables,
            colocate_gradients_with_ops=colocate_gradients_with_ops)

        # Undo the static loss scaling on the gradients.
        if loss_scale != 1.0:
            gradients = _multiply_gradients_const(gradients, 1.0 / loss_scale)

        # Optionally add gradient noise.
        if gradient_noise_scale is not None:
            gradients = _add_scaled_noise_to_gradients(gradients,
                                                       gradient_noise_scale)

        # Multiply some gradients.
        if gradient_multipliers is not None:
            gradients = _multiply_gradients(gradients, gradient_multipliers)
            if not gradients:
                raise ValueError(
                    "Empty list of (gradient, var) pairs encountered. This is most "
                    "likely to be caused by an improper value of gradient_multipliers."
                )

        report_global_norm = ("global_gradient_norm" in summaries or
                              "gradient_norm" in summaries)
        if report_global_norm:
            # None gradients are skipped: tf.cast cannot convert None.
            summary.scalar(
                "global_norm/gradient_norm",
                clip_ops.global_norm(
                    [tf.cast(g, tf.float32)
                     for g, _ in gradients if g is not None]),
            )

        # Optionally clip gradients by global norm.
        if clip_gradients is not None and larc_params is not None:
            raise AttributeError(
                "LARC and gradient norm clipping should not be used together")
        if isinstance(clip_gradients, float):
            gradients = _clip_gradients_by_norm(gradients, clip_gradients)
        elif callable(clip_gradients):
            gradients = clip_gradients(gradients)
        elif clip_gradients is not None:
            raise ValueError("Unknown type %s for clip_gradients" %
                             type(clip_gradients))

        # Add histograms for variables, gradients and gradient norms.
        for gradient, variable in gradients:
            if isinstance(gradient, ops.IndexedSlices):
                grad_values = gradient.values
            else:
                grad_values = gradient

            if isinstance(variable, ops.IndexedSlices):
                var_values = variable.values
            else:
                var_values = variable

            if grad_values is not None:
                var_name = variable.name.replace(":", "_")
                if "gradients" in summaries:
                    summary.histogram("gradients/%s" % var_name,
                                      mask_nans(grad_values))
                if "gradient_norm" in summaries:
                    summary.scalar("gradient_norm/%s" % var_name,
                                   clip_ops.global_norm([grad_values]))
                if "variables" in summaries:
                    summary.histogram("variables/%s" % var_name, var_values)
                if "variable_norm" in summaries:
                    summary.scalar("variable_norm/%s" % var_name,
                                   clip_ops.global_norm([var_values]))

        if clip_gradients is not None and report_global_norm:
            summary.scalar(
                "global_norm/clipped_gradient_norm",
                clip_ops.global_norm(
                    [tf.cast(g, tf.float32)
                     for g, _ in gradients if g is not None]),
            )

        # LARC gradient re-scaling
        if larc_params is not None:
            check_params(
                config=larc_params,
                required_dict={'larc_eta': float},
                optional_dict={
                    'larc_mode': ['clip', 'scale'],
                    'min_update': float,
                    'epsilon': float
                },
            )
            larc_eta = larc_params['larc_eta']
            larc_mode = larc_params.get('larc_mode', 'clip')
            min_update = larc_params.get('min_update', 1e-7)
            eps = larc_params.get('epsilon', 1e-7)

            for idx, (g, v) in enumerate(gradients):
                if g is None:
                    # Nothing to re-scale; leave the pair untouched.
                    continue
                var_dtype = v.dtype
                v_norm = tf.norm(tensor=tf.cast(v, tf.float32), ord=2)
                g_norm = tf.norm(tensor=tf.cast(g, tf.float32), ord=2)

                if larc_mode == 'clip':
                    larc_grad_update = tf.maximum(
                        larc_eta * v_norm / (lr * (g_norm + eps)),
                        min_update,
                    )
                    if "larc_summaries" in summaries:
                        summary.scalar(
                            'larc_clip_on/{}'.format(v.name),
                            tf.cast(tf.less(larc_grad_update, 1.0), tf.int32))
                    larc_grad_update = tf.minimum(larc_grad_update, 1.0)
                else:
                    larc_grad_update = tf.maximum(
                        larc_eta * v_norm / (g_norm + eps),
                        min_update,
                    )
                larc_grad_update = tf.saturate_cast(larc_grad_update,
                                                    var_dtype)
                gradients[idx] = (larc_grad_update * g, v)

                # adding additional summary
                if "larc_summaries" in summaries:
                    summary.scalar('larc_grad_update/{}'.format(v.name),
                                   larc_grad_update)
                    summary.scalar("larc_final_lr/{}".format(v.name),
                                   tf.cast(lr, var_dtype) * larc_grad_update)

        # Create gradient updates.
        grad_updates = opt.apply_gradients(
            gradients,
            global_step=global_step if increment_global_step else None,
            name="train")

        # Ensure the train_tensor computes grad_updates.
        train_tensor = control_flow_ops.with_dependencies([grad_updates],
                                                          loss)

        return train_tensor