def _create_optimizer(self):
  """Creates optimizer."""
  params = self.params
  # TODO(b/139414679): Explore the difference between using
  # LearningRateSchedule and callback for GPU runs, and try to merge them.
  lr_schedule = optimizer.LearningRateSchedule(
      params["learning_rate"], params["hidden_size"],
      params["learning_rate_warmup_steps"])
  opt = tf.keras.optimizers.Adam(
      lr_schedule if self.use_tpu else params["learning_rate"],
      params["optimizer_adam_beta1"],
      params["optimizer_adam_beta2"],
      epsilon=params["optimizer_adam_epsilon"])

  if params["dtype"] == tf.float16:
    opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
        opt,
        loss_scale=flags_core.get_loss_scale(
            self.flags_obj, default_for_fp16="dynamic"))
  if self.flags_obj.fp16_implementation == "graph_rewrite":
    # Note: when flags_obj.fp16_implementation == "graph_rewrite", dtype as
    # determined by flags_core.get_tf_dtype(flags_obj) would be 'float32'
    # which will ensure tf.compat.v2.keras.mixed_precision and
    # tf.train.experimental.enable_mixed_precision_graph_rewrite do not
    # double up.
    opt = tf.train.experimental.enable_mixed_precision_graph_rewrite(opt)

  return opt
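The learning rate passed to Adam on the TPU path is an `optimizer.LearningRateSchedule` instance from the surrounding package rather than core TensorFlow. Below is a minimal sketch of such a schedule, assuming it follows the standard Transformer inverse-square-root warmup recipe (scale by hidden_size**-0.5, linear warmup over the warmup steps, then rsqrt-of-step decay); the class name and constructor arguments mirror the call site above, but the exact formula in the real `optimizer.LearningRateSchedule` may differ.

import tensorflow as tf


class LearningRateSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Sketch of an inverse-square-root schedule with linear warmup (assumed)."""

  def __init__(self, initial_learning_rate, hidden_size, warmup_steps):
    super().__init__()
    self.initial_learning_rate = initial_learning_rate
    self.hidden_size = hidden_size
    self.warmup_steps = warmup_steps

  def __call__(self, global_step):
    step = tf.cast(global_step, tf.float32)
    warmup_steps = tf.cast(self.warmup_steps, tf.float32)
    learning_rate = self.initial_learning_rate
    # Scale down by the model dimension, as in "Attention Is All You Need".
    learning_rate *= self.hidden_size ** -0.5
    # Linear warmup for the first `warmup_steps` steps, then rsqrt decay.
    learning_rate *= tf.minimum(1.0, step / warmup_steps)
    learning_rate *= tf.math.rsqrt(tf.maximum(step, warmup_steps))
    return learning_rate

  def get_config(self):
    return {
        "initial_learning_rate": self.initial_learning_rate,
        "hidden_size": self.hidden_size,
        "warmup_steps": self.warmup_steps,
    }

Passing a `tf.keras.optimizers.schedules.LearningRateSchedule` object as the learning rate lets Keras re-evaluate it at every optimizer step, which is why the TPU branch hands Adam the schedule object directly while the GPU branch relies on a plain float (and, per the TODO, a callback) instead.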