def handle_fp16_and_distributed_optimizer(optimizer, lr_schedule, hvd_backend=None):
    if hvd_backend == "horovod":
        import horovod.tensorflow.keras as hvd
        from horovod.tensorflow import Compression
    elif hvd_backend == "byteps":
        import byteps.tensorflow.keras as hvd
        from byteps.tensorflow import Compression

    if hvd_backend:
        # Compress gradients on the wire when the global dtype is fp16.
        compression = Compression.none
        if compat.CUSTOM_GLOBAL_FLOATX == "float16":
            compression = Compression.fp16

    if lr_schedule is not None and hvd_backend is None:
        # TODO(ZhaoChengqi): pay attention to API changes
        optimizer._set_hyper("learning_rate", lr_schedule)

    # Special-case fp16: wrap the optimizer with a revised dynamic loss scale,
    # working around a bug under TF2.3 + Horovod + fp16 + XLA.
    if compat.CUSTOM_GLOBAL_FLOATX == "float16":
        if compat.IS_PREV_TF_2_4_0:
            from tensorflow.keras.mixed_precision.experimental import LossScaleOptimizer
            from tensorflow.python.keras import backend
            from tensorflow.python.training.experimental.loss_scale import get_loss_scale_weights

            revised_loss_scale = RevisedDynamicLossScale()
            if hvd_backend:
                opt = LossScaleOptimizer(optimizer, loss_scale=1)
                opt = hvd.DistributedOptimizer(opt, compression=compression,
                                               sparse_as_dense=True)
                # Swap in the revised loss scale and re-register it so its
                # variables are tracked and checkpointed.
                opt._loss_scale = revised_loss_scale
                for weight in get_loss_scale_weights(opt._loss_scale):
                    backend.track_variable(weight)
                opt._track_trackable(opt._loss_scale, 'loss_scale', overwrite=True)
            else:
                opt = LossScaleOptimizer(optimizer, loss_scale=revised_loss_scale)
        else:
            if hvd_backend:
                opt = HorovodDistributedLossScaleOptimizer(inner_optimizer=optimizer,
                                                           compression=compression,
                                                           sparse_as_dense=True,
                                                           hvd_backend=hvd_backend)
            else:
                opt = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)
                opt._loss_scale = RevisedDynamicLossScale(
                    initial_loss_scale=2 ** 15, growth_steps=2000, multiplier=2)
                opt._track_trackable(opt._loss_scale, "loss_scale", overwrite=True)
        return opt
    return optimizer
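# A minimal usage sketch for the wrapper above. Everything here is standard
# tf.keras API except `handle_fp16_and_distributed_optimizer` itself; the
# helper name `_example_build_optimizer` is hypothetical, and it assumes the
# surrounding project has already set `compat.CUSTOM_GLOBAL_FLOATX`.
def _example_build_optimizer():
    import tensorflow as tf
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=1e-3, decay_steps=10000, decay_rate=0.96)
    # With hvd_backend=None the schedule is attached via _set_hyper; under
    # fp16 the optimizer is additionally wrapped in a loss-scale optimizer.
    return handle_fp16_and_distributed_optimizer(
        tf.keras.optimizers.Adam(), lr_schedule, hvd_backend=None)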
def __init__(self, optimizer, loss_scale):
    """Initializes this loss scale optimizer.

    Args:
        optimizer: The Optimizer instance to wrap.
        loss_scale: The loss scale to scale the loss and gradients. This can
            either be an int/float to use a fixed loss scale, the string
            "dynamic" to use dynamic loss scaling, or an instance of a
            LossScale. The string "dynamic" is equivalent to passing
            `DynamicLossScale()`, and passing an int/float is equivalent to
            passing a FixedLossScale with the given loss scale.
    """
    if not isinstance(optimizer, optimizer_v2.OptimizerV2):
        raise ValueError('"optimizer" must be an instance of OptimizerV2, but '
                         'got: %s' % optimizer)
    if optimizer.clipnorm is not None:
        raise ValueError('LossScaleOptimizer does not support wrapping '
                         'optimizers with a clipnorm. Optimizer %s has clipnorm '
                         '%s' % (optimizer, optimizer.clipnorm))
    if optimizer.clipvalue is not None:
        raise ValueError('LossScaleOptimizer does not support wrapping '
                         'optimizers with a clipvalue. Optimizer %s has '
                         'clipvalue %s' % (optimizer, optimizer.clipvalue))
    self._raise_if_strategy_unsupported()
    self.clipnorm = None
    self.clipvalue = None

    self._optimizer = optimizer
    self._loss_scale = keras_loss_scale_module.get(loss_scale)
    if self._loss_scale is None:
        raise ValueError('loss_scale cannot be None.')

    # We don't call super().__init__, since we do not want to call OptimizerV2's
    # constructor.
    _DelegatingTrackableMixin.__init__(self, self._optimizer)

    for weight in loss_scale_module.get_loss_scale_weights(self._loss_scale):
        # We cannot call `track_variable` in the LossScale class itself, because
        # a file outside of Keras cannot depend on a Keras file. Calling it here
        # instead is OK, because a variable only needs to be tracked if used
        # with a Keras class, and the only way to use LossScale with a Keras
        # class is through the LossScaleOptimizer.
        backend.track_variable(weight)
    self._track_trackable(self._loss_scale, 'loss_scale')

    # Needed because the superclass's __getattribute__ checks this.
    self._hyper = {}

    # To support restoring TensorFlow 2.2 checkpoints.
    self._track_trackable(FakeOptimizerForRestoration(self._optimizer),
                          'base_optimizer')
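# The arithmetic this wrapper automates, as a hedged sketch (the helper below
# is illustrative and not part of the class; `model`, `x`, `y`, `loss_fn` and
# `scale` are hypothetical inputs): the loss is multiplied by the scale before
# backprop so small fp16 gradients don't flush to zero, and the gradients are
# divided by the same scale before being applied.
def _loss_scaling_sketch(model, x, y, loss_fn, scale):
    import tensorflow as tf
    with tf.GradientTape() as tape:
        loss = loss_fn(y, model(x))
        # Scale up inside the tape so the scaling is part of the recorded graph.
        scaled_loss = loss * tf.cast(scale, loss.dtype)
    grads = tape.gradient(scaled_loss, model.trainable_variables)
    # Scale back down before the gradients are applied.
    return [tf.cast(g, tf.float32) / tf.cast(scale, tf.float32) for g in grads]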
def __call__(opt, loss_scale):
    """Initializes a loss scaled optimizer.

    Args:
        opt: The Optimizer instance to wrap.
        loss_scale: The loss scale to scale the loss and gradients. This can
            either be an int/float to use a fixed loss scale, the string
            "dynamic" to use dynamic loss scaling, or an instance of a
            LossScale. The string "dynamic" is equivalent to passing
            `DynamicLossScale()`, and passing an int/float is equivalent to
            passing a FixedLossScale with the given loss scale.

    Returns:
        A Keras Optimizer with loss scaling applied.
    """
    opt._loss_scale = loss_scale_module.get(loss_scale)

    for weight in loss_scale_module.get_loss_scale_weights(opt._loss_scale):
        # We cannot call `track_variable` in the LossScale class itself, because
        # a file outside of Keras cannot depend on a Keras file. Calling it here
        # instead is OK, because a variable only needs to be tracked if used
        # with a Keras class, and the only way to use LossScale with a Keras
        # class is through the LossScaleOptimizer.
        backend.track_variable(weight)
    opt._track_trackable(opt._loss_scale, 'loss_scale')

    class BaseOptimizer(object):
        _class = opt.__class__
        _classname = "%s.%s" % (opt.__module__, opt.__class__.__name__)
        _compute_gradients = opt._compute_gradients
        get_gradients = opt.get_gradients
        apply_gradients = opt.apply_gradients
        get_config = opt.get_config
        from_config = opt.from_config

    opt.loss_scale_base_opt = BaseOptimizer

    # Generate a fake class named "LossScaleOptimizer".
    # Essential to avoid modifying the optimizer's original class.
    base_opt_class_dict = dict(opt.__class__.__dict__)
    base_opt_class_dict.update(dict(LossScaleOptimizer.__dict__))
    del base_opt_class_dict["__call__"]
    del base_opt_class_dict["__dict__"]
    del base_opt_class_dict["__weakref__"]
    opt.__class__ = type(LossScaleOptimizer.__name__,
                         (opt.loss_scale_base_opt._class,),
                         base_opt_class_dict)
    return opt
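# The class-swap trick used above, distilled into a self-contained toy (none
# of these names exist in the repo): `type(name, bases, dict)` builds a new
# class on the fly, and assigning to `instance.__class__` rebinds just that
# one object, which is how __call__ grafts loss scaling onto an existing
# optimizer without touching its original class.
class _Greeter(object):
    def hello(self):
        return "hi"

def _swap_in_loud_greeting(obj):
    new_dict = dict(obj.__class__.__dict__)
    # Same deletions as above: these descriptors cannot be redeclared when
    # the new class already inherits them from its base.
    new_dict.pop("__dict__", None)
    new_dict.pop("__weakref__", None)
    new_dict["hello"] = lambda self: "HI!"
    obj.__class__ = type("LoudGreeter", (obj.__class__,), new_dict)
    return obj

# _swap_in_loud_greeting(_Greeter()).hello() returns "HI!", while other
# _Greeter instances keep the original method.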
def __init__(self, optimizer, loss_scale):
    """Initializes this loss scale optimizer.

    Args:
        optimizer: The Optimizer instance to wrap.
        loss_scale: The loss scale to scale the loss and gradients. This can
            either be an int/float to use a fixed loss scale, the string
            "dynamic" to use dynamic loss scaling, or an instance of a
            LossScale. The string "dynamic" is equivalent to passing
            `DynamicLossScale()`, and passing an int/float is equivalent to
            passing a FixedLossScale with the given loss scale.
    """
    if not isinstance(optimizer, optimizer_v2.OptimizerV2):
        raise ValueError('"optimizer" must be an instance of OptimizerV2, but '
                         'got: %s' % optimizer)
    if hasattr(optimizer, 'clipnorm'):
        raise ValueError('LossScaleOptimizer does not support wrapping '
                         'optimizers with a clipnorm. Optimizer %s has clipnorm '
                         '%s' % (optimizer, optimizer.clipnorm))
    if hasattr(optimizer, 'clipvalue'):
        raise ValueError('LossScaleOptimizer does not support wrapping '
                         'optimizers with a clipvalue. Optimizer %s has '
                         'clipvalue %s' % (optimizer, optimizer.clipvalue))

    self._optimizer = optimizer
    self._loss_scale = keras_loss_scale_module.get(loss_scale)
    for weight in loss_scale_module.get_loss_scale_weights(self._loss_scale):
        # We cannot call `track_variable` in the LossScale class itself, because
        # a file outside of Keras cannot depend on a Keras file. Calling it here
        # instead is OK, because a variable only needs to be tracked if used
        # with a Keras class, and the only way to use LossScale with a Keras
        # class is through the LossScaleOptimizer.
        backend.track_variable(weight)
    self._track_trackable(self._optimizer, 'base_optimizer')
    self._track_trackable(self._loss_scale, 'loss_scale')

    # Needed because the superclass's __getattribute__ checks this.
    self._hyper = {}
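# Because both the wrapped optimizer and the loss scale are registered via
# _track_trackable, checkpointing the wrapper also saves the loss-scale state
# (current scale and good-step counter). A hedged sketch using the standard
# tf.train.Checkpoint API (the path below is illustrative):
#
#     ckpt = tf.train.Checkpoint(optimizer=loss_scale_optimizer)
#     ckpt.save("/tmp/ckpt")   # later, ckpt.restore(...) brings both back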
def _handle_fp16_and_distributed_optimizer(optimizer, lr_schedule, hvd_backend=None):
    if hvd_backend == "horovod":
        import horovod.tensorflow.keras as hvd
        from horovod.tensorflow import Compression
    elif hvd_backend == "byteps":
        import byteps.tensorflow.keras as hvd
        from byteps.tensorflow import Compression

    if hvd_backend:
        # Compress gradients on the wire when the global dtype is fp16.
        compression = Compression.none
        if compat.CUSTOM_GLOBAL_FLOATX == "float16":
            compression = Compression.fp16

    if lr_schedule is not None and hvd_backend is None:
        # TODO(ZhaoChengqi): pay attention to API changes
        optimizer._set_hyper("learning_rate", lr_schedule)

    # Special-case fp16: there is a bug under TF2.3 + Horovod + fp16 + XLA,
    # so a revised DynamicLossScale is swapped in.
    if compat.CUSTOM_GLOBAL_FLOATX == "float16":
        logging.info("NOTICE: using revised DynamicLossScale under fp16")
        revised_loss_scale = training_utils.RevisedDynamicLossScale()
        if hvd_backend:
            opt = LossScaleOptimizer(optimizer, loss_scale=1)
            opt = hvd.DistributedOptimizer(opt, compression=compression,
                                           sparse_as_dense=True)
            # Swap in the revised loss scale and re-register it so its
            # variables are tracked and checkpointed.
            opt._loss_scale = revised_loss_scale
            for weight in loss_scale_module.get_loss_scale_weights(opt._loss_scale):
                backend.track_variable(weight)
            opt._track_trackable(opt._loss_scale, 'loss_scale', overwrite=True)
        else:
            opt = LossScaleOptimizer(optimizer, loss_scale=revised_loss_scale)
        return opt
    return optimizer
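# A minimal distributed sketch for the Horovod path (hvd.init, hvd.local_rank
# and hvd.DistributedOptimizer are standard Horovod Keras API; the function
# name and the device-pinning glue below are hypothetical):
def _example_horovod_optimizer():
    import horovod.tensorflow.keras as hvd
    import tensorflow as tf
    hvd.init()
    # Pin each worker process to its own local GPU.
    gpus = tf.config.list_physical_devices("GPU")
    if gpus:
        tf.config.set_visible_devices(gpus[hvd.local_rank()], "GPU")
    # Under fp16 this returns a LossScaleOptimizer wrapped by Horovod's
    # DistributedOptimizer, with fp16 gradient compression on the wire.
    return _handle_fp16_and_distributed_optimizer(
        tf.keras.optimizers.Adam(), lr_schedule=None, hvd_backend="horovod")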