def _configure_fp16_optimizer(self, optimizer):
    """Wrap the basic optimizer with fp16 loss scaling and gradient clipping."""
    initial_dynamic_scale = self.initial_dynamic_scale()
    dynamic_loss_args = self.dynamic_loss_scale_args()
    clip_grad = self.gradient_clipping()
    if self.optimizer_name() == ADAM_OPTIMIZER:
        # Adam uses the fused FP16_Optimizer wrapper, with either a dynamic or
        # a static loss scale depending on the config.
        if self.dynamic_loss_scale():
            logging.info('Creating fp16 optimizer with dynamic loss scale')
            optimizer = FP16_Optimizer(optimizer,
                                       dynamic_loss_scale=True,
                                       initial_dynamic_scale=initial_dynamic_scale,
                                       dynamic_loss_args=dynamic_loss_args,
                                       mpu=self.mpu,
                                       clip_grad=clip_grad,
                                       fused_adam_legacy=True)
        else:
            logging.info('Creating fp16 optimizer with static loss scale: {}'.format(
                self.loss_scale()))
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=self.loss_scale(),
                                       mpu=self.mpu,
                                       clip_grad=clip_grad,
                                       fused_adam_legacy=True)
    else:
        # All other optimizers (e.g. LAMB) go through the unfused wrapper.
        logging.info('Creating fp16 unfused optimizer with dynamic loss scale')
        optimizer = FP16_UnfusedOptimizer(
            optimizer,
            dynamic_loss_scale=self.dynamic_loss_scale(),
            dynamic_loss_args=dynamic_loss_args,
            mpu=self.mpu,
            clip_grad=clip_grad,
            fused_lamb_legacy=self.optimizer_name() == LAMB_OPTIMIZER)
    return optimizer
def _configure_fp16_optimizer(self, optimizer):
    """Wrap the basic optimizer with fp16 loss scaling and gradient clipping."""
    initial_dynamic_scale = self.initial_dynamic_scale()
    dynamic_loss_args = self.dynamic_loss_scale_args()
    clip_grad = self.gradient_clipping()
    if self.optimizer_name() == ADAM_OPTIMIZER:
        # Adam uses the fused FP16_Optimizer wrapper, with either a dynamic or
        # a static loss scale depending on the config.
        if self.dynamic_loss_scale():
            logger.info('Creating fp16 optimizer with dynamic loss scale')
            # Pass timers only when a wall-clock breakdown has been requested.
            timers = self.timers if self.wall_clock_breakdown() else None
            optimizer = FP16_Optimizer(
                optimizer,
                dynamic_loss_scale=True,
                initial_dynamic_scale=initial_dynamic_scale,
                dynamic_loss_args=dynamic_loss_args,
                mpu=self.mpu,
                clip_grad=clip_grad,
                fused_adam_legacy=self.optimizer_legacy_fusion(),
                timers=timers)
        else:
            logger.info('Creating fp16 optimizer with static loss scale: {}'.format(
                self.loss_scale()))
            optimizer = FP16_Optimizer(
                optimizer,
                static_loss_scale=self.loss_scale(),
                mpu=self.mpu,
                clip_grad=clip_grad,
                fused_adam_legacy=self.optimizer_legacy_fusion())
    else:
        # All other optimizers (e.g. LAMB) go through the unfused wrapper.
        logger.info('Creating fp16 unfused optimizer with dynamic loss scale')
        optimizer = FP16_UnfusedOptimizer(
            optimizer,
            dynamic_loss_scale=self.dynamic_loss_scale(),
            dynamic_loss_args=dynamic_loss_args,
            mpu=self.mpu,
            clip_grad=clip_grad,
            fused_lamb_legacy=self.optimizer_name() == LAMB_OPTIMIZER)
    return optimizer
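# Illustrative sketch only (not the actual engine code): it shows where a
# method like _configure_fp16_optimizer typically fits in optimizer setup.
# A basic optimizer is built first and then wrapped when fp16 training is
# enabled. The helper names _configure_basic_optimizer and fp16_enabled are
# assumptions for illustration, not confirmed by the code above.
def _configure_optimizer(self, client_optimizer, model_parameters):
    if client_optimizer is not None:
        # Use the optimizer supplied by the caller as-is.
        basic_optimizer = client_optimizer
    else:
        # Hypothetical helper that builds the base optimizer from the config.
        basic_optimizer = self._configure_basic_optimizer(model_parameters)

    if self.fp16_enabled():
        # Add loss scaling and gradient clipping around the basic optimizer.
        self.optimizer = self._configure_fp16_optimizer(basic_optimizer)
    else:
        self.optimizer = basic_optimizer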