def __init__(self, params, model, name="loss"): """Loss constructor. Note that loss constructors should not modify TensorFlow graph, all graph construction should happen in the :meth:`self._compute_loss() <_compute_loss>` method. Args: params (dict): parameters describing the loss. All supported parameters are listed in :meth:`get_required_params`, :meth:`get_optional_params` functions. model (instance of a class derived from :class:`Model<models.model.Model>`): parent model that created this loss. Could be None if no model access is required for the use case. name (str): name for loss variable scope. Config parameters: * **dtype** --- data dtype. Could be either ``tf.float16`` or ``tf.float32``. """ check_params(params, self.get_required_params(), self.get_optional_params()) self._params = copy.deepcopy(params) self._model = model if 'dtype' not in self._params: if self._model: self._params['dtype'] = self._model.get_tf_dtype() else: self._params['dtype'] = tf.float32 self._name = name
def __init__(self, params):
  if params is None:
    params = {}
  check_params(
      config=params,
      required_dict={},
      optional_dict={
          'scale_min': float,
          'scale_max': float,
          'step_factor': float,
          'step_window': int
      },
  )
  self.scale_min = params.get('scale_min', 1.0)
  self.scale_max = params.get('scale_max', 2.**24)
  self.step_factor = params.get('step_factor', 2.0)
  self.step_window = params.get('step_window', 2000)

  self.iteration = tf.Variable(initial_value=0, trainable=False,
                               dtype=tf.int64)
  self.last_overflow_iteration = tf.Variable(initial_value=-1, trainable=False,
                                             dtype=tf.int64)
  self.scale = tf.Variable(initial_value=self.scale_max, trainable=False)
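# The variables above drive a "backoff" loss-scale schedule. The class updates
# ``scale`` with TF ops elsewhere; the plain-Python sketch below only
# illustrates what the parameters control, under the usual backoff assumptions:
# on overflow the scale is divided by ``step_factor`` (clamped to
# ``scale_min``); after ``step_window`` consecutive overflow-free steps it is
# multiplied by ``step_factor`` (clamped to ``scale_max``).

def backoff_schedule_sketch(overflows, scale_min=1.0, scale_max=2.**24,
                            step_factor=2.0, step_window=2000):
  """Returns the loss scale after simulating a sequence of overflow flags."""
  scale = scale_max
  last_overflow_iteration = -1
  for iteration, has_overflow in enumerate(overflows):
    if has_overflow:
      scale = max(scale / step_factor, scale_min)
      last_overflow_iteration = iteration
    elif iteration - last_overflow_iteration >= step_window:
      scale = min(scale * step_factor, scale_max)
      last_overflow_iteration = iteration  # restart the growth window
  return scale


# example: a single overflow halves the scale (default step_factor=2.0)
assert backoff_schedule_sketch([False, True, False]) == 2.**23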
def __init__(self, params, model, name="decoder", mode='train'): """Decoder constructor. Note that decoder constructors should not modify TensorFlow graph, all graph construction should happen in the :meth:`self._decode() <_decode>` method. Args: params (dict): parameters describing the decoder. All supported parameters are listed in :meth:`get_required_params`, :meth:`get_optional_params` functions. model (instance of a class derived from :class:`Model<models.model.Model>`): parent model that created this decoder. Could be None if no model access is required for the use case. name (str): name for decoder variable scope. mode (str): mode decoder is going to be run in. Could be "train", "eval" or "infer". Config parameters: * **initializer** --- any valid TensorFlow initializer. If no initializer is provided, model initializer will be used. * **initializer_params** (dict) --- dictionary that will be passed to initializer ``__init__`` method. * **regularizer** --- and valid TensorFlow regularizer. If no regularizer is provided, model regularizer will be used. * **regularizer_params** (dict) --- dictionary that will be passed to regularizer ``__init__`` method. * **dtype** --- model dtype. Could be either ``tf.float16``, ``tf.float32`` or "mixed". For details see :ref:`mixed precision training <mixed_precision>` section in docs. If no dtype is provided, model dtype will be used. """ check_params(params, self.get_required_params(), self.get_optional_params()) self._params = copy.deepcopy(params) self._model = model if 'dtype' not in self._params: if self._model: self._params['dtype'] = self._model.params['dtype'] else: self._params['dtype'] = tf.float32 self._name = name self._mode = mode self._compiled = False
def __init__(self, params, model, num_workers, worker_id):
  """Data layer constructor.
  The TensorFlow graph should not be created here, but rather in the
  :meth:`self.build_graph() <build_graph>` method.

  Args:
    params (dict): parameters describing the data layer.
        All supported parameters are listed in
        :meth:`get_required_params`, :meth:`get_optional_params` functions.
    model (instance of a class derived from :class:`Model<models.model.Model>`):
        parent model that created this data layer.
        Could be None if no model access is required for the use case.
    num_workers (int): number of Horovod processes or number of GPUs
        if Horovod is not used.
    worker_id (int): Horovod process id or GPU id if Horovod is not used.

  Config parameters:

  * **shuffle** (bool) --- whether to shuffle the dataset after an epoch.
    Typically will be True for train and False for inference and evaluation.
  * **dtype** --- data dtype. Could be either ``tf.float16`` or ``tf.float32``.
  """
  check_params(params, self.get_required_params(), self.get_optional_params())
  self._params = copy.deepcopy(params)
  self._model = model

  if 'dtype' not in self._params:
    if self._model:
      self._params['dtype'] = self._model.get_tf_dtype()
    else:
      self._params['dtype'] = tf.float32

  if 'shuffle' not in params:
    self._params['shuffle'] = (self._params['mode'] == 'train')

  if self._params['mode'] != 'train' and self._params['shuffle']:
    raise ValueError("Shuffle should not be performed in eval or infer modes")

  # should be used for correct evaluation on multiple GPUs
  self._num_workers = num_workers
  self._worker_id = worker_id
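# ``num_workers`` / ``worker_id`` let each Horovod process (or GPU) read a
# disjoint shard of the data, which the comment above notes is needed for
# correct multi-GPU evaluation. One common sharding pattern is sketched below;
# concrete data layers in the repo may split their data differently.

def shard_indices_sketch(num_samples, num_workers, worker_id):
  """Returns the sample indices this worker is responsible for."""
  return list(range(worker_id, num_samples, num_workers))


# example: 10 samples split across 4 workers; worker 1 gets samples 1, 5, 9
assert shard_indices_sketch(10, num_workers=4, worker_id=1) == [1, 5, 9]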
def __init__(self, params):
  if params is None:
    params = {}
  check_params(
      config=params,
      required_dict={},
      optional_dict={
          'scale_min': float,
          'scale_max': float,
          'log_max': float,
          'beta1': float,
          'beta2': float,
          'overflow_std_dev': float
      },
  )
  self.scale_min = params.get('scale_min', 1.0)
  self.scale_max = params.get('scale_max', 2.**24)
  self.log_max = params.get('log_max', 16.)
  self.beta1 = params.get('beta1', 0.99)
  self.beta2 = params.get('beta2', 0.999)
  self.overflow_std_dev = params.get('overflow_std_dev', 3.09)

  self.iteration = tf.Variable(initial_value=0, trainable=False,
                               dtype=tf.int64)
  self.scale = tf.Variable(initial_value=1.0, trainable=False)
  self.x_hat = tf.Variable(initial_value=0, trainable=False, dtype=tf.float32)
  self.slow_x_hat = tf.Variable(initial_value=0, trainable=False,
                                dtype=tf.float32)
  self.xsquared_hat = tf.Variable(initial_value=0, trainable=False,
                                  dtype=tf.float32)
  self.b1_correction = tf.Variable(initial_value=1., trainable=False,
                                   dtype=tf.float32)
  self.b2_correction = tf.Variable(initial_value=1., trainable=False,
                                   dtype=tf.float32)
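# The ``*_hat`` variables above look like exponential moving averages with
# Adam-style bias correction (``b1_correction`` / ``b2_correction`` accumulating
# powers of beta1 / beta2) -- that reading is an assumption here. The
# plain-Python sketch below only illustrates that bookkeeping; how the scaler
# combines these statistics with ``log_max`` and ``overflow_std_dev`` into an
# actual scale update is defined elsewhere in the class and is not reproduced.

def ema_with_bias_correction_sketch(values, beta=0.99):
  """Bias-corrected exponential moving average of a sequence of floats."""
  x_hat = 0.0
  b_correction = 1.0
  for x in values:
    x_hat = beta * x_hat + (1.0 - beta) * x
    b_correction *= beta                 # running beta**t
    yield x_hat / (1.0 - b_correction)   # divide out the startup bias


# example: with a constant input the corrected EMA recovers the input exactly
smoothed = list(ema_with_bias_correction_sketch([4.0, 4.0, 4.0], beta=0.99))
assert abs(smoothed[-1] - 4.0) < 1e-6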
def post_process_gradients(grads_and_vars, summaries, lr,
                           clip_gradients, larc_params):
  """Applies post processing to gradients, i.e. clipping, LARC, summaries."""
  if "global_gradient_norm" in summaries:
    tf.summary.scalar(
        "global_gradient_norm",
        _global_norm_with_cast(grads_and_vars),
    )

  # Optionally clip gradients by global norm.
  if clip_gradients is not None:
    grads_and_vars = _clip_gradients_by_norm(grads_and_vars, clip_gradients)

  # Add histograms for variables, gradients and gradient norms.
  for gradient, variable in grads_and_vars:
    if isinstance(gradient, tf.IndexedSlices):
      grad_values = gradient.values
    else:
      grad_values = gradient

    if isinstance(variable, tf.IndexedSlices):
      var_values = variable.values
    else:
      var_values = variable

    if grad_values is not None:
      var_name = variable.name.replace(":", "_")
      if "gradients" in summaries:
        # need to mask nans for automatic loss scaling
        tf.summary.histogram("gradients/%s" % var_name, mask_nans(grad_values))
      if "gradient_norm" in summaries:
        tf.summary.scalar("gradient_norm/%s" % var_name, tf.norm(grad_values))
      if "variables" in summaries:
        tf.summary.histogram("variables/%s" % var_name, var_values)
      if "variable_norm" in summaries:
        tf.summary.scalar("variable_norm/%s" % var_name, tf.norm(var_values))

  if clip_gradients is not None and "global_gradient_norm" in summaries:
    tf.summary.scalar(
        "global_clipped_gradient_norm",
        _global_norm_with_cast(grads_and_vars),
    )

  # LARC gradient re-scaling
  if larc_params is not None:
    check_params(
        config=larc_params,
        required_dict={'larc_eta': float},
        optional_dict={
            'larc_mode': ['clip', 'scale'],
            'min_update': float,
            'epsilon': float
        },
    )
    larc_eta = larc_params['larc_eta']
    larc_mode = larc_params.get('larc_mode', 'clip')
    min_update = larc_params.get('min_update', 1e-7)
    eps = larc_params.get('epsilon', 1e-7)

    grads_and_vars_larc = [None] * len(grads_and_vars)
    for idx, (g, v) in enumerate(grads_and_vars):
      var_dtype = v.dtype
      v_norm = tf.norm(tensor=tf.cast(v, tf.float32), ord=2)
      g_norm = tf.norm(tensor=tf.cast(g, tf.float32), ord=2)

      if larc_mode == 'clip':
        larc_grad_update = tf.maximum(
            larc_eta * v_norm / (lr * (g_norm + eps)),
            min_update,
        )
        if "larc_summaries" in summaries:
          tf.summary.scalar('larc_clip_on/{}'.format(v.name),
                            tf.cast(tf.less(larc_grad_update, 1.0), tf.int32))
        larc_grad_update = tf.minimum(larc_grad_update, 1.0)
      else:
        larc_grad_update = tf.maximum(
            larc_eta * v_norm / (g_norm + eps),
            min_update,
        )
      larc_grad_update = tf.saturate_cast(larc_grad_update, var_dtype)
      grads_and_vars_larc[idx] = (larc_grad_update * g, v)

      # adding additional summary
      if "larc_summaries" in summaries:
        tf.summary.scalar('larc_grad_update/{}'.format(v.name),
                          larc_grad_update)
        tf.summary.scalar("larc_final_lr/{}".format(v.name),
                          tf.cast(lr, var_dtype) * larc_grad_update)
    grads_and_vars = grads_and_vars_larc
  return grads_and_vars
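# A scalar sketch of the per-variable LARC factor computed in the loop above,
# using plain floats in place of the tensor norms (the function itself works
# on TF tensors): in 'clip' mode the factor is additionally capped at 1.0, so
# LARC can only shrink the effective per-variable learning rate.

def larc_factor_sketch(v_norm, g_norm, lr, larc_eta,
                       larc_mode='clip', min_update=1e-7, eps=1e-7):
  """Mirrors the larc_grad_update formula from post_process_gradients."""
  if larc_mode == 'clip':
    update = max(larc_eta * v_norm / (lr * (g_norm + eps)), min_update)
    return min(update, 1.0)
  return max(larc_eta * v_norm / (g_norm + eps), min_update)


# example: a gradient norm much larger than the weight norm gets scaled down
print(larc_factor_sketch(v_norm=1.0, g_norm=100.0, lr=0.1, larc_eta=0.002))
# -> approximately 0.0002, i.e. the gradient is shrunk before the lr applies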
def __init__(self, params, mode="train", hvd=None): """Model constructor. The TensorFlow graph should not be created here, but rather in the :meth:`self.compile() <compile>` method. Args: params (dict): parameters describing the model. All supported parameters are listed in :meth:`get_required_params`, :meth:`get_optional_params` functions. mode (string, optional): "train", "eval" or "infer". If mode is "train" all parts of the graph will be built (model, loss, optimizer). If mode is "eval", only model and loss will be built. If mode is "infer", only model will be built. hvd (optional): if Horovod is used, this should be ``horovod.tensorflow`` module. If Horovod is not used, it should be None. Config parameters: * **random_seed** (int) --- random seed to use. * **use_horovod** (bool) --- whether to use Horovod for distributed execution. * **num_gpus** (int) --- number of GPUs to use. This parameter cannot be used if ``gpu_ids`` is specified. When ``use_horovod`` is True this parameter is ignored. * **gpu_ids** (list of ints) --- GPU ids to use. This parameter cannot be used if ``num_gpus`` is specified. When ``use_horovod`` is True this parameter is ignored. * **batch_size_per_gpu** (int) --- batch size to use for each GPU. * **eval_batch_size_per_gpu** (int) --- batch size to use for each GPU during inference. This is for when training and inference have different computation and memory requirements, such as when training uses sampled softmax and inference uses full softmax. If not specified, it's set to ``batch_size_per_gpu``. * **restore_best_checkpoint** (bool) --- if set to True, when doing evaluation and inference, the model will load the best checkpoint instead of the latest checkpoint. Best checkpoint is evaluated based on evaluation results, so it's only available when the model is trained untder ``train_eval`` mode. Default to False. * **load_model** (str) --- points to the location of the pretrained model for transfer learning. If specified, during training, the system will look into the checkpoint in this folder and restore all variables whose names and shapes match a variable in the new model. * **num_epochs** (int) --- number of epochs to run training for. This parameter cannot be used if ``max_steps`` is specified. * **max_steps** (int) --- number of steps to run training for. This parameter cannot be used if ``num_epochs`` is specified. * **save_summaries_steps** (int or None) --- how often to save summaries. Setting it to None disables summaries saving. * **print_loss_steps** (int or None) --- how often to print loss during training. Setting it to None disables loss printing. * **print_samples_steps** (int or None) --- how often to print training samples (input sequences, correct answers and model predictions). Setting it to None disables samples printing. * **print_bench_info_steps** (int or None) --- how often to print training benchmarking information (average number of objects processed per step). Setting it to None disables intermediate benchmarking printing, but the average information across the whole training will always be printed after the last iteration. * **save_checkpoint_steps** (int or None) --- how often to save model checkpoints. Setting it to None disables checkpoint saving. * **eval_steps** (int) --- how often to run evaluation during training. This parameter is only checked if ``--mode`` argument of ``run.py`` is "train\_eval". If no evaluation is needed you should use "train" mode. 
  * **logdir** (string) --- path to the log directory where all checkpoints
    and summaries will be saved.
  * **data_layer** (any class derived from
    :class:`DataLayer <data.data_layer.DataLayer>`) --- data layer class
    to use.
  * **data_layer_params** (dict) --- dictionary with data layer
    configuration. For a complete list of possible parameters see the
    corresponding class docs.
  * **optimizer** (string or TensorFlow optimizer class) --- optimizer to use
    for training. Could be either "Adam", "Adagrad", "Ftrl", "Momentum",
    "RMSProp", "SGD" or any valid TensorFlow optimizer class.
  * **optimizer_params** (dict) --- dictionary that will be passed to
    optimizer ``__init__`` method.
  * **initializer** --- any valid TensorFlow initializer.
  * **initializer_params** (dict) --- dictionary that will be passed to
    initializer ``__init__`` method.
  * **freeze_variables_regex** (str or None) --- if zero or more characters
    at the beginning of the name of a trainable variable match this pattern,
    then this variable will be frozen during training. Setting it to None
    disables freezing of variables.
  * **regularizer** --- any valid TensorFlow regularizer.
  * **regularizer_params** (dict) --- dictionary that will be passed to
    regularizer ``__init__`` method.
  * **dtype** --- model dtype. Could be either ``tf.float16``, ``tf.float32``
    or "mixed". For details see
    :ref:`mixed precision training <mixed_precision>` section in docs.
  * **lr_policy** --- any valid learning rate policy function. For examples,
    see :any:`optimizers.lr_policies` module.
  * **lr_policy_params** (dict) --- dictionary containing lr_policy
    parameters.
  * **max_grad_norm** (float) --- maximum value of gradient norm. Clipping
    will be performed if some gradients exceed this value (this is checked
    for each variable independently).
  * **loss_scaling** --- could be float or string. If float, static loss
    scaling is applied. If string, the corresponding automatic loss scaling
    algorithm is used. Must be one of 'Backoff' or 'LogMax' (case
    insensitive). Only used when dtype="mixed". For details see
    :ref:`mixed precision training <mixed_precision>` section in docs.
  * **loss_scaling_params** (dict) --- dictionary containing loss scaling
    parameters.
  * **summaries** (list) --- which summaries to log. Could contain
    "learning_rate", "gradients", "gradient_norm", "global_gradient_norm",
    "variables", "variable_norm", "loss_scale".
  * **iter_size** (int) --- use this parameter to emulate large batches.
    The gradients will be accumulated for ``iter_size`` number of steps
    before applying update.
  * **larc_params** --- dictionary with parameters for LARC (or LARS)
    optimization algorithms. Can contain the following parameters:

    * **larc_mode** --- could be either "scale" (LARS) or "clip" (LARC).
      Note that it works in addition to any other optimization algorithm
      since we treat it as adaptive gradient clipping and learning rate
      adjustment.
    * **larc_eta** (float) --- LARC or LARS scaling parameter.
    * **min_update** (float) --- minimal value of the LARC (LARS) update.
    * **epsilon** (float) --- small number added to gradient norm in
      denominator for numerical stability.
""" check_params(params, self.get_required_params(), self.get_optional_params()) self._params = copy.deepcopy(params) if self._params.get('iter_size', 1) > 1 and hvd is None: raise ValueError("iter_size is only supported in Horovod mode") # parameter checks self._mode = mode self._interactive = False if self._mode == "interactive_infer": self._mode = "infer" self._interactive = True if self._mode not in ["train", "infer", "eval"]: raise ValueError( "Mode has to be one of ['train', 'infer', 'eval']") if "max_steps" in params and "num_epochs" in params: raise ValueError( "You can't provide both max_steps and num_epochs. " "Please, remove one of them from the config.") if mode == "train": if "max_steps" not in params and "num_epochs" not in params: raise ValueError("For training mode either max_steps or " "num_epochs has to be provided") if 'print_samples_steps' not in self._params: self._params['print_samples_steps'] = None if 'print_loss_steps' not in self._params: self._params['print_loss_steps'] = None if 'save_checkpoint_steps' not in self._params: self._params['save_checkpoint_steps'] = None if 'save_summaries_steps' not in self._params: self._params['save_summaries_steps'] = None if 'print_bench_info_steps' not in self._params: self._params['print_bench_info_steps'] = None self._params['finetune'] = self._params.get('finetune', False) # self._params['base_logdir'] = self._params.get('base_logdir', None) self._params['load_model'] = self._params.get('load_model', None) self._params['load_fc'] = self._params.get('load_fc', False) self._params['eval_batch_size_per_gpu'] = self._params.get( 'eval_batch_size_per_gpu', self._params['batch_size_per_gpu']) # checking that frequencies of samples and loss are aligned s_fr = self._params['print_samples_steps'] l_fr = self._params['print_loss_steps'] if s_fr is not None and l_fr is not None and s_fr % l_fr != 0: raise ValueError("print_samples_steps has to be a multiple of " "print_loss_steps.") self._hvd = hvd if self._hvd: self._gpu_ids = range(1) else: if 'gpu_ids' in self._params: self._gpu_ids = self._params['gpu_ids'] elif 'num_gpus' in self._params: self._gpu_ids = range(self._params['num_gpus']) else: raise ValueError('Either "gpu_ids" or "num_gpus" has to ' 'be specified in the config') if self._interactive and len(self._gpu_ids) > 1: raise ValueError( "Interactive infer is meant to be used with 1 gpu") # setting random seed rs = self._params.get('random_seed', int(time.time())) if self.on_horovod: rs += hvd.rank() tf.set_random_seed(rs) np.random.seed(rs) if 'dtype' not in self._params: self._params['dtype'] = tf.float32 dl_params = self._params.get('data_layer_params', {}) if mode == 'train': dl_params['batch_size'] = self._params['batch_size_per_gpu'] else: dl_params['batch_size'] = self._params['eval_batch_size_per_gpu'] if 'lm_vocab_file' in self._params: dl_params['lm_vocab_file'] = self._params['lm_vocab_file'] if 'processed_data_folder' in self._params: dl_params['processed_data_folder'] = self._params[ 'processed_data_folder'] dl_params['mode'] = self._mode dl_params['interactive'] = self._interactive if self.on_horovod: self._data_layer = self._params['data_layer']( params=dl_params, model=self, num_workers=self._hvd.size(), worker_id=self._hvd.rank(), ) else: self._data_layers = [] for worker_id in range(self.num_gpus): self._data_layers.append(self._params['data_layer']( params=dl_params, model=self, num_workers=self.num_gpus, worker_id=worker_id, )) if self._mode == "train": if "max_steps" in self._params: self._last_step = 
self._params["max_steps"] self._steps_in_epoch = None else: # doing a few less steps if data size is not divisible by the batch size self._steps_in_epoch = self.get_data_layer().get_size_in_samples() // \ self.get_data_layer().params['batch_size'] if self._steps_in_epoch is None: raise ValueError( 'The data_layer is not compatible with ' 'epoch execution, since it does not provide ' 'get_size_in_samples() method. Either update the ' 'data layer or switch to using "max_steps" ' 'paremeter.') if self.on_horovod: self._steps_in_epoch //= self._hvd.size() else: self._steps_in_epoch //= self.num_gpus self._steps_in_epoch //= self._params.get('iter_size', 1) if self._steps_in_epoch == 0: raise ValueError( "Overall batch size is too big for this dataset.") self._last_step = self._params[ 'num_epochs'] * self._steps_in_epoch if self.on_horovod: self._output = None else: self._outputs = [None] * self.num_gpus self.loss = None self.train_op = None self.eval_losses = None self._num_objects_per_step = None self.skip_update_ph = None