Example #1
    def __init__(self, model, criterion, opt_config,
                 print_freq=10,
                 save_freq=1000,
                 grad_clip=float('inf'),
                 batch_first=False,
                 save_info={},
                 save_path='.',
                 checkpoint_filename='checkpoint%s.pth',
                 keep_checkpoints=5,
                 math='fp32',
                 cuda=True,
                 distributed=False,
                 verbose=False,
                 log_dir=None,
                 num_minibatches=20,
                 cupti=False):
        super(Seq2SeqTrainer, self).__init__()
        self.model = model
        self.criterion = criterion
        self.epoch = 0
        self.save_info = save_info
        self.save_path = save_path
        self.save_freq = save_freq
        self.save_counter = 0
        self.checkpoint_filename = checkpoint_filename
        self.checkpoint_counter = cycle(range(keep_checkpoints))
        self.opt_config = opt_config
        self.cuda = cuda
        self.distributed = distributed
        self.print_freq = print_freq
        self.batch_first = batch_first
        self.verbose = verbose
        self.loss = None
        self.cupti = cupti

        self.log_dir = log_dir
        self.num_steps = num_minibatches
        self.math = math
        self.grad_clip = grad_clip

        if cuda:
            self.model = self.model.cuda()
            self.criterion = self.criterion.cuda()

        if distributed:
            self.model = DDP(self.model, log_dir=self.log_dir)

        if math == 'fp16':
            self.model = self.model.half()
            self.fp_optimizer = Fp16Optimizer(self.model, grad_clip)
            params = self.fp_optimizer.fp32_params
        elif math == 'fp32':
            self.fp_optimizer = Fp32Optimizer(self.model, grad_clip)
            params = self.model.parameters()

        opt_name = opt_config['optimizer']
        lr = opt_config['lr']
        self.optimizer = torch.optim.__dict__[opt_name](params, lr=lr)
        print("optimizer name " + opt_name)
        print(type(self.optimizer))
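
Note: torch.optim.__dict__[opt_name] resolves an optimizer class from its string name. A minimal standalone sketch of the same pattern, with illustrative values (the model and hyperparameters below are not from the source):

    import torch

    opt_config = {'optimizer': 'Adam', 'lr': 1e-3}  # illustrative config
    model = torch.nn.Linear(8, 8)

    opt_name = opt_config['optimizer']
    # getattr(torch.optim, name) does the same job as torch.optim.__dict__[name]
    optimizer_cls = getattr(torch.optim, opt_name)
    optimizer = optimizer_cls(model.parameters(), lr=opt_config['lr'])
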
Example #2
    def __init__(self,
                 model,
                 criterion,
                 opt_config,
                 print_freq=10,
                 save_freq=1000,
                 grad_clip=float('inf'),
                 batch_first=False,
                 save_info={},
                 save_path='.',
                 checkpoint_filename='checkpoint%s.pth',
                 keep_checkpoints=5,
                 math='fp32',
                 cuda=True,
                 distributed=False,
                 verbose=False):
        super(Seq2SeqTrainer, self).__init__()
        self.model = model
        self.criterion = criterion
        self.epoch = 0
        self.save_info = save_info
        self.save_path = save_path
        self.save_freq = save_freq
        self.save_counter = 0
        self.checkpoint_filename = checkpoint_filename
        self.checkpoint_counter = cycle(range(keep_checkpoints))
        self.opt_config = opt_config
        self.cuda = cuda
        self.distributed = distributed
        self.print_freq = print_freq
        self.batch_first = batch_first
        self.verbose = verbose
        self.loss = None

        if cuda:
            self.model = self.model.cuda()
            self.criterion = self.criterion.cuda()

        if distributed:
            self.model = DDP(self.model)

        if math == 'fp16':
            self.model = self.model.half()
            self.fp_optimizer = Fp16Optimizer(self.model, grad_clip)
            params = self.fp_optimizer.fp32_params
        elif math == 'fp32':
            self.fp_optimizer = Fp32Optimizer(self.model, grad_clip)
            params = self.model.parameters()

        opt_name = opt_config['optimizer']
        lr = opt_config['lr']
        self.optimizer = torch.optim.__dict__[opt_name](params, lr=lr)
        mlperf_log.gnmt_print(key=mlperf_log.OPT_NAME, value=opt_name)
        mlperf_log.gnmt_print(key=mlperf_log.OPT_LR, value=lr)
        mlperf_log.gnmt_print(key=mlperf_log.OPT_HP_ADAM_BETA1,
                              value=self.optimizer.defaults['betas'][0])
        mlperf_log.gnmt_print(key=mlperf_log.OPT_HP_ADAM_BETA2,
                              value=self.optimizer.defaults['betas'][1])
        mlperf_log.gnmt_print(key=mlperf_log.OPT_HP_ADAM_EPSILON,
                              value=self.optimizer.defaults['eps'])
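
The MLPerf logging at the end reads the Adam hyperparameters back out of optimizer.defaults, which every torch.optim optimizer fills with its constructor arguments. A quick runnable sketch:

    import torch

    opt = torch.optim.Adam(torch.nn.Linear(2, 2).parameters(), lr=1e-3)
    print(opt.defaults['lr'])     # 0.001
    print(opt.defaults['betas'])  # (0.9, 0.999)
    print(opt.defaults['eps'])    # 1e-08
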
Example #3
def skyline_iteration_provider(model):
    args = get_args()
    opt_config = {
        'optimizer': args.optimizer,
        'lr': args.lr,
    }
    scheduler_config = {
        'warmup_steps': args.warmup_steps,
        'remain_steps': args.remain_steps,
        'decay_interval': args.decay_interval,
        'decay_steps': args.decay_steps,
        'decay_factor': args.decay_factor,
    }

    train_loader_len = 437268
    total_train_iters = train_loader_len // args.train_iter_size * args.epochs
    opt_name = opt_config.pop('optimizer')
    optimizer = torch.optim.__dict__[opt_name](model.parameters(),
                                               **opt_config)
    scheduler = WarmupMultiStepLR(optimizer, total_train_iters,
                                  **scheduler_config)
    fp_optimizer = Fp32Optimizer(model, args.grad_clip)

    def iteration(src, src_len, tgt, tgt_len):
        loss = model(src, src_len, tgt, tgt_len)
        loss.backward()
        fp_optimizer.step(optimizer, scheduler, update=True)

    return iteration
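
skyline_iteration_provider returns a closure that runs one full training step (forward, backward, optimizer step) per call. A hedged usage sketch; model and train_loader are assumed to be supplied elsewhere, for example by a matching input provider:

    # Hypothetical driver: the profiler calls the closure once per
    # measured iteration.
    iteration = skyline_iteration_provider(model)
    src, src_len, tgt, tgt_len = next(iter(train_loader))  # assumed loader
    iteration(src, src_len, tgt, tgt_len)
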
Example #4
    def __init__(self,
                 model,
                 criterion,
                 opt_config,
                 print_freq=10,
                 save_freq=1000,
                 grad_clip=float('inf'),
                 batch_first=False,
                 save_info={},
                 save_path='.',
                 checkpoint_filename='checkpoint%s.pth',
                 keep_checkpoints=5,
                 math='fp32',
                 cuda=True,
                 distributed=False,
                 intra_epoch_eval=0,
                 translator=None,
                 verbose=False):
        super(Seq2SeqTrainer, self).__init__()
        self.model = model
        self.criterion = criterion
        self.epoch = 0
        self.save_info = save_info
        self.save_path = save_path
        self.save_freq = save_freq
        self.save_counter = 0
        self.checkpoint_filename = checkpoint_filename
        self.checkpoint_counter = cycle(range(keep_checkpoints))
        self.opt_config = opt_config
        self.cuda = cuda
        self.distributed = distributed
        self.print_freq = print_freq
        self.batch_first = batch_first
        self.verbose = verbose
        self.loss = None
        self.translator = translator
        self.intra_epoch_eval = intra_epoch_eval

        if cuda:
            self.model = self.model.cuda()
            self.criterion = self.criterion.cuda()

        if distributed:
            self.model = DDP(self.model)

        if math == 'fp16':
            self.model = self.model.half()
            self.fp_optimizer = Fp16Optimizer(self.model, grad_clip)
            params = self.fp_optimizer.fp32_params
        elif math == 'fp32':
            self.fp_optimizer = Fp32Optimizer(self.model, grad_clip)
            params = self.model.parameters()

        opt_name = opt_config.pop('optimizer')
        self.optimizer = torch.optim.__dict__[opt_name](params, **opt_config)
        logging.info(f'Using optimizer: {self.optimizer}')
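
Unlike Examples 1 and 2, this variant consumes opt_config destructively: pop removes the 'optimizer' key and every remaining entry is forwarded as a constructor keyword. A hypothetical config for this pattern; the values are illustrative, the only constraint is that the keys match the chosen torch.optim class:

    opt_config = {'optimizer': 'Adam', 'lr': 2e-3, 'betas': (0.9, 0.998)}
    opt_name = opt_config.pop('optimizer')  # opt_config now holds pure kwargs
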
Example #5
    def __init__(self,
                 model,
                 criterion,
                 opt_config,
                 scheduler_config,
                 print_freq=10,
                 save_freq=1000,
                 grad_clip=float('inf'),
                 batch_first=False,
                 save_info={},
                 save_path='.',
                 train_iterations=0,
                 checkpoint_filename='checkpoint%s.pth',
                 keep_checkpoints=5,
                 math='fp32',
                 cuda=True,
                 distributed=False,
                 intra_epoch_eval=0,
                 iter_size=1,
                 translator=None,
                 verbose=False):
        """
        Constructor for the Seq2SeqTrainer.

        :param model: model to train
        :param criterion: criterion (loss function)
        :param opt_config: dictionary with options for the optimizer
        :param scheduler_config: dictionary with options for the learning rate
            scheduler
        :param print_freq: prints short summary every 'print_freq' iterations
        :param save_freq: saves checkpoint every 'save_freq' iterations
        :param grad_clip: coefficient for gradient clipping
        :param batch_first: if True the model uses (batch, seq, feature)
            tensors, if False the model uses (seq, batch, feature)
        :param save_info: dict with additional state stored in each checkpoint
        :param save_path: path to the directory for checkpoints
        :param train_iterations: total number of training iterations to execute
        :param checkpoint_filename: name of files with checkpoints
        :param keep_checkpoints: max number of checkpoints to keep
        :param math: arithmetic type
        :param cuda: if True use cuda, if False train on cpu
        :param distributed: if True run distributed training
        :param intra_epoch_eval: number of additional eval runs within each
            training epoch
        :param iter_size: number of iterations between weight updates
        :param translator: instance of Translator, runs inference on test set
        :param verbose: enables verbose logging
        """
        super(Seq2SeqTrainer, self).__init__()
        self.model = model
        self.criterion = criterion
        self.epoch = 0
        self.save_info = save_info
        self.save_path = save_path
        self.save_freq = save_freq
        self.save_counter = 0
        self.checkpoint_filename = checkpoint_filename
        self.checkpoint_counter = cycle(range(keep_checkpoints))
        self.opt_config = opt_config
        self.cuda = cuda
        self.distributed = distributed
        self.print_freq = print_freq
        self.batch_first = batch_first
        self.verbose = verbose
        self.loss = None
        self.translator = translator
        self.intra_epoch_eval = intra_epoch_eval
        self.iter_size = iter_size

        if cuda:
            self.model = self.model.cuda()
            self.criterion = self.criterion.cuda()

        if math == 'fp16':
            self.model = self.model.half()

        if distributed:
            self.model = DDP(self.model)

        if math == 'fp16':
            self.fp_optimizer = Fp16Optimizer(self.model, grad_clip)
            params = self.fp_optimizer.fp32_params
        elif math == 'fp32':
            self.fp_optimizer = Fp32Optimizer(self.model, grad_clip)
            params = self.model.parameters()

        opt_name = opt_config.pop('optimizer')
        self.optimizer = torch.optim.__dict__[opt_name](params, **opt_config)
        logging.info(f'Using optimizer: {self.optimizer}')

        self.scheduler = WarmupMultiStepLR(self.optimizer, train_iterations,
                                           **scheduler_config)
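
Putting the docstring together, a hedged construction sketch; model and criterion are assumed to exist, and all values are illustrative rather than taken from the source. The scheduler_config keys mirror the ones built in Example 3:

    trainer = Seq2SeqTrainer(
        model=model,
        criterion=criterion,
        opt_config={'optimizer': 'Adam', 'lr': 1e-3},
        scheduler_config={'warmup_steps': 200,
                          'remain_steps': 6000,
                          'decay_interval': 1000,
                          'decay_steps': 4,
                          'decay_factor': 0.5},
        train_iterations=10000,
        math='fp16',
        cuda=True,
        distributed=False)
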
Example #6
    def __init__(self,
                 model,
                 criterion,
                 opt_config,
                 print_freq=10,
                 save_freq=1000,
                 grad_clip=float('inf'),
                 batch_first=False,
                 save_info={},
                 save_path='.',
                 train_iterations=0,
                 checkpoint_filename='checkpoint%s.pth',
                 keep_checkpoints=5,
                 math='fp32',
                 loss_scaling={},
                 cuda=True,
                 distributed=False,
                 distributed_overlap_allreduce=False,
                 distributed_overlap_num_allreduce_streams=1,
                 distributed_overlap_allreduce_messagesize=1e7,
                 distributed_overlap_allreduce_communicators=None,
                 intra_epoch_eval=0,
                 prealloc_mode='always',
                 iter_size=1,
                 verbose=False,
                 args=None):
        """
        Constructor for the Seq2SeqTrainer.

        :param model: model to train
        :param criterion: criterion (loss function)
        :param opt_config: dictionary with options for the optimizer
        :param print_freq: prints short summary every 'print_freq' iterations
        :param save_freq: saves checkpoint every 'save_freq' iterations
        :param grad_clip: coefficient for gradient clipping
        :param batch_first: if True the model uses (batch, seq, feature)
            tensors, if False the model uses (seq, batch, feature)
        :param save_info: dict with additional state stored in each checkpoint
        :param save_path: path to the directory for checkpoints
        :param train_iterations: total number of training iterations to execute
        :param checkpoint_filename: name of files with checkpoints
        :param keep_checkpoints: max number of checkpoints to keep
        :param math: arithmetic type
        :param loss_scaling: options for dynamic loss scaling
        :param cuda: if True use cuda, if False train on cpu
        :param distributed: if True run distributed training
        :param intra_epoch_eval: number of additional eval runs within each
            training epoch
        :param prealloc_mode: controls preallocation,
            choices=['off', 'once', 'always']
        :param iter_size: number of iterations between weight updates
        :param verbose: enables verbose logging
        """
        super(Seq2SeqTrainer, self).__init__()
        self.model = model
        self.criterion = criterion
        self.epoch = 0
        self.save_info = save_info
        self.save_path = save_path
        self.save_freq = save_freq
        self.save_counter = 0
        self.checkpoint_filename = checkpoint_filename
        self.checkpoint_counter = cycle(range(keep_checkpoints))
        self.opt_config = opt_config
        self.cuda = cuda
        self.distributed = distributed
        self.print_freq = print_freq
        self.batch_first = batch_first
        self.verbose = verbose
        self.loss = None
        self.translator = None
        self.scheduler = None
        self.intra_epoch_eval = intra_epoch_eval
        self.iter_size = iter_size
        self.prealloc_mode = prealloc_mode
        self.preallocated = False
        
        # Assume multi-tensor apply when running with APEX DDP
        self.args = args
        self.use_mt = (distributed and iter_size == 1 and
                       opt_config['optimizer'] == 'FusedAdam')

        # Use APEX gradient average if gradient accumulation option enabled
        self.retain_allreduce_buffers = (iter_size == 1)
        self.gradient_average = (iter_size != 1)

        if cuda:
            self.model = self.model.cuda()
            self.criterion = self.criterion.cuda()

        params = self.model.parameters()
        if math == 'fp16':
            self.model = self.model.half()
            if distributed and self.args.distributed_weight_update != 2:
                self.model = DDP(self.model,
                                 message_size=distributed_overlap_allreduce_messagesize,
                                 delay_allreduce=(not distributed_overlap_allreduce),
                                 num_allreduce_streams=distributed_overlap_num_allreduce_streams,
                                 allreduce_communicators=distributed_overlap_allreduce_communicators,
                                 retain_allreduce_buffers=self.retain_allreduce_buffers,
                                 gradient_average=self.gradient_average)

            if self.args.distributed_weight_update == 2:
                # gradient clipping maintained by DistributedFusedAdam
                self.fp_optimizer = DwuFp16Optimizer(
                    self.model,
                    loss_scale=loss_scaling['init_scale'],
                    dls_upscale_interval=loss_scaling['upscale_interval']
                    )
                params = list(self.model.parameters())
            else:
                self.fp_optimizer = Fp16Optimizer(
                    self.model, grad_clip,
                    use_mt=self.use_mt,
                    loss_scale=loss_scaling['init_scale'],
                    dls_upscale_interval=loss_scaling['upscale_interval']
                    )
                params = (self.fp_optimizer.fp32_params
                          if isinstance(self.fp_optimizer.fp32_params, list)
                          else [self.fp_optimizer.fp32_params])
        elif math == 'fp32':
            if distributed:
                self.model = DDP(self.model,
                                 message_size=distributed_overlap_allreduce_messagesize,
                                 delay_allreduce=(not distributed_overlap_allreduce))
            self.fp_optimizer = Fp32Optimizer(self.model, grad_clip)
            # params = self.model.parameters()

        opt_name = opt_config.pop('optimizer')
        if opt_name == 'FusedAdam':
            if math == 'fp16' or math == 'fp32':
                if self.args.distributed_weight_update == 2:
                    dwu_args = self.distributed_weight_update_config
                    self.optimizer = DistributedFusedAdam(params, max_grad_norm=grad_clip,
                                                          **dwu_args, **opt_config)
                    self.optimizer.set_global_scale(1.0) # used for grad norm clipping in step function
                else:
                    # Maintain grad norm and scaling by ourselves
                    self.optimizer = FusedAdam(params, use_mt=self.use_mt, **opt_config)
            else:
                self.optimizer = FusedAdam(params, use_mt=self.use_mt, max_grad_norm=grad_clip,
                                           amp_scale_adjustment=get_world_size(), **opt_config)
        else:
            self.optimizer = torch.optim.__dict__[opt_name](params,
                                                            **opt_config)
        logging.info(f'Using optimizer: {self.optimizer}')

        log_event(key=constants.OPT_NAME,
                  value=constants.ADAM, sync=False)
        log_event(key=constants.OPT_BASE_LR,
                  value=opt_config['lr'], sync=False)
        log_event(key=constants.OPT_ADAM_BETA_1,
                  value=self.optimizer.defaults['betas'][0], sync=False)
        log_event(key=constants.OPT_ADAM_BETA_2,
                  value=self.optimizer.defaults['betas'][1], sync=False)
        log_event(key=constants.OPT_ADAM_EPSILON,
                  value=self.optimizer.defaults['eps'], sync=False)
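
Both fp16 branches above hand the optimizer an FP32 master copy of the parameters. A minimal standalone sketch of that pattern, assuming a CUDA device and a static loss scale (Fp16Optimizer and DwuFp16Optimizer implement dynamic-scale versions of the same idea):

    import torch

    model = torch.nn.Linear(4, 2).cuda().half()    # fp16 model weights
    master = [p.detach().clone().float() for p in model.parameters()]
    for m in master:
        m.requires_grad_()
    optimizer = torch.optim.Adam(master, lr=1e-3)  # steps the fp32 copies

    scale = 128.0                                  # static loss scale
    out = model(torch.randn(8, 4, device='cuda', dtype=torch.half))
    (out.sum() * scale).backward()

    # unscale the fp16 grads into the fp32 master params, then step
    for m, p in zip(master, model.parameters()):
        m.grad = p.grad.float() / scale
    optimizer.step()

    with torch.no_grad():                          # copy back to fp16
        for m, p in zip(master, model.parameters()):
            p.copy_(m)
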
Example #7

    def __init__(self,
                 model,
                 criterion,
                 opt_config,
                 print_freq=10,
                 save_freq=1000,
                 grad_clip=float('inf'),
                 batch_first=False,
                 save_info={},
                 save_path='.',
                 train_iterations=0,
                 checkpoint_filename='checkpoint%s.pth',
                 keep_checkpoints=5,
                 math='fp32',
                 loss_scaling={},
                 cuda=True,
                 distributed=False,
                 distributed_overlap_allreduce=False,
                 distributed_overlap_num_allreduce_streams=1,
                 distributed_overlap_allreduce_messagesize=1e7,
                 distributed_overlap_allreduce_communicators=None,
                 intra_epoch_eval=0,
                 prealloc_mode='always',
                 iter_size=1,
                 verbose=False):
        """
        Constructor for the Seq2SeqTrainer.

        :param model: model to train
        :param criterion: criterion (loss function)
        :param opt_config: dictionary with options for the optimizer
        :param print_freq: prints short summary every 'print_freq' iterations
        :param save_freq: saves checkpoint every 'save_freq' iterations
        :param grad_clip: coefficient for gradient clipping
        :param batch_first: if True the model uses (batch, seq, feature)
            tensors, if False the model uses (seq, batch, feature)
        :param save_info: dict with additional state stored in each checkpoint
        :param save_path: path to the directory for checkpoints
        :param train_iterations: total number of training iterations to execute
        :param checkpoint_filename: name of files with checkpoints
        :param keep_checkpoints: max number of checkpoints to keep
        :param math: arithmetic type
        :param loss_scaling: options for dynamic loss scaling
        :param cuda: if True use cuda, if False train on cpu
        :param distributed: if True run distributed training
        :param intra_epoch_eval: number of additional eval runs within each
            training epoch
        :param prealloc_mode: controls preallocation,
            choices=['off', 'once', 'always']
        :param iter_size: number of iterations between weight updates
        :param verbose: enables verbose logging
        """
        super(Seq2SeqTrainer, self).__init__()
        self.model = model
        self.criterion = criterion
        self.epoch = 0
        self.save_info = save_info
        self.save_path = save_path
        self.save_freq = save_freq
        self.save_counter = 0
        self.checkpoint_filename = checkpoint_filename
        self.checkpoint_counter = cycle(range(keep_checkpoints))
        self.opt_config = opt_config
        self.cuda = cuda
        self.distributed = distributed
        self.print_freq = print_freq
        self.batch_first = batch_first
        self.verbose = verbose
        self.loss = None
        self.translator = None
        self.scheduler = None
        self.intra_epoch_eval = intra_epoch_eval
        self.iter_size = iter_size
        self.prealloc_mode = prealloc_mode
        self.preallocated = False

        self.retain_allreduce_buffers = True
        self.gradient_average = False

        if cuda:
            self.model = self.model.cuda()
            self.criterion = self.criterion.cuda()

        params = self.model.parameters()
        if math == 'fp16':
            self.model = self.model.half()
            if distributed:
                self.model = DDP(
                    self.model,
                    message_size=distributed_overlap_allreduce_messagesize,
                    delay_allreduce=(not distributed_overlap_allreduce),
                    retain_allreduce_buffers=self.retain_allreduce_buffers,
                    gradient_average=self.gradient_average)
            self.fp_optimizer = Fp16Optimizer(
                self.model,
                grad_clip,
                loss_scale=loss_scaling['init_scale'],
                dls_upscale_interval=loss_scaling['upscale_interval'])
            params = [self.fp_optimizer.fp32_params]
        elif math == 'fp32':
            if distributed:
                self.model = DDP(
                    self.model,
                    message_size=distributed_overlap_allreduce_messagesize,
                    delay_allreduce=(not distributed_overlap_allreduce))
            self.fp_optimizer = Fp32Optimizer(self.model, grad_clip)
            # params = self.model.parameters()

        opt_name = opt_config.pop('optimizer')
        if opt_name == 'FusedAdam':
            if math == 'fp16' or math == 'fp32':
                self.optimizer = FusedAdam(params, **opt_config)
            else:
                self.optimizer = FusedAdam(
                    params,
                    use_mt=True,
                    max_grad_norm=grad_clip,
                    amp_scale_adjustment=get_world_size(),
                    **opt_config)
        else:
            self.optimizer = torch.optim.__dict__[opt_name](params,
                                                            **opt_config)
        if math == 'amp_fp16':
            self.model, self.optimizer = amp.initialize(
                self.model,
                self.optimizer,
                cast_model_outputs=torch.float16,
                keep_batchnorm_fp32=False,
                opt_level='O2')
            self.fp_optimizer = AMPOptimizer(
                self.model,
                grad_clip,
                loss_scale=loss_scaling['init_scale'],
                dls_upscale_interval=loss_scaling['upscale_interval'])
            if distributed:
                self.model = DDP(
                    self.model,
                    message_size=distributed_overlap_allreduce_messagesize,
                    delay_allreduce=(not distributed_overlap_allreduce),
                    num_allreduce_streams=
                    distributed_overlap_num_allreduce_streams,
                    allreduce_communicators=
                    distributed_overlap_allreduce_communicators,
                    retain_allreduce_buffers=self.retain_allreduce_buffers,
                    gradient_average=self.gradient_average)

        logging.info(f'Using optimizer: {self.optimizer}')

        mlperf_print(key=mlperf_compliance.constants.OPT_BASE_LR,
                     value=opt_config['lr'])
Example #8
    def __init__(self,
                 model,
                 criterion,
                 opt_config,
                 scheduler_config,
                 print_freq=10,
                 save_freq=1000,
                 grad_clip=float('inf'),
                 batch_first=False,
                 save_info={},
                 save_path='.',
                 checkpoint_filename='checkpoint%s.pth',
                 keep_checkpoints=5,
                 math='fp32',
                 cuda=True,
                 distributed=False,
                 distributed_overlap_allreduce=False,
                 distributed_overlap_allreduce_messagesize=1e7,
                 intra_epoch_eval=0,
                 translator=None,
                 verbose=False,
                 arch="gnmt"):
        super(Seq2SeqTrainer, self).__init__()
        self.model = model
        self.criterion = criterion
        self.epoch = 0
        self.save_info = save_info
        self.save_path = save_path
        self.save_freq = save_freq
        self.save_counter = 0
        self.checkpoint_filename = checkpoint_filename
        self.checkpoint_counter = cycle(range(keep_checkpoints))
        self.opt_config = opt_config
        self.cuda = cuda
        self.distributed = distributed
        self.print_freq = print_freq
        self.batch_first = batch_first
        self.verbose = verbose
        self.loss = None
        self.translator = translator
        self.intra_epoch_eval = intra_epoch_eval
        self.arch = arch

        self.retain_allreduce_buffers = True
        self.gradient_average = False

        if cuda:
            self.model = self.model.cuda()
            self.criterion = self.criterion.cuda()

        if math == 'fp16':
            self.model = self.model.half()
            if distributed:
                # self.model = apex.parallel.DistributedDataParallel(self.model, message_size=10000000, delay_allreduce=True)
                self.model = apex.parallel.DistributedDataParallel(
                    self.model,
                    message_size=distributed_overlap_allreduce_messagesize,
                    delay_allreduce=(not distributed_overlap_allreduce),
                    retain_allreduce_buffers=self.retain_allreduce_buffers,
                    gradient_average=self.gradient_average)
            self.fp_optimizer = Fp16Optimizer(self.model, grad_clip)
            params = [self.fp_optimizer.fp32_params]
        elif math == 'fp32':
            if distributed:
                # self.model = apex.parallel.DistributedDataParallel(self.model, message_size=10000000, delay_allreduce=True)
                self.model = apex.parallel.DistributedDataParallel(
                    self.model,
                    message_size=distributed_overlap_allreduce_messagesize,
                    delay_allreduce=(not distributed_overlap_allreduce))
            self.fp_optimizer = Fp32Optimizer(self.model, grad_clip)
            params = self.model.parameters()

        opt_name = opt_config.pop('optimizer')
        if opt_name == 'FusedAdam':
            self.optimizer = apex.optimizers.FusedAdam(params, **opt_config)
        else:
            self.optimizer = torch.optim.__dict__[opt_name](params,
                                                            **opt_config)

        gnmt_print(key=mlperf_log.OPT_NAME, value=mlperf_log.ADAM)
        gnmt_print(key=mlperf_log.OPT_LR, value=opt_config['lr'])
        gnmt_print(key=mlperf_log.OPT_HP_ADAM_BETA1,
                   value=self.optimizer.defaults['betas'][0])
        gnmt_print(key=mlperf_log.OPT_HP_ADAM_BETA2,
                   value=self.optimizer.defaults['betas'][1])
        gnmt_print(key=mlperf_log.OPT_HP_ADAM_EPSILON,
                   value=self.optimizer.defaults['eps'])

        self.scheduler = WarmupMultiStepLR(
            self.optimizer,
            lr_method=scheduler_config["lr_method"],
            warmup_iters=scheduler_config["warmup_iters"],
            remain_steps=scheduler_config["remain_steps"],
            decay_steps=scheduler_config["decay_steps"])

        logging.info(f'Using optimizer: {self.optimizer}')
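
This variant passes explicit keywords to WarmupMultiStepLR instead of unpacking **scheduler_config. A hypothetical config matching the keys consumed above; only the key names come from the code, while the values (and the 'mlperf' method name) are illustrative assumptions:

    scheduler_config = {
        'lr_method': 'mlperf',   # assumed option name, not from the source
        'warmup_iters': 200,
        'remain_steps': 6000,
        'decay_steps': 4,
    }
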