def _build_optimizer(self): params = list( filter(lambda p: p.requires_grad, self.model.parameters())) # for n, p in list(self.model.named_parameters()): # if p.requires_grad and n.startswith('encoder.bert'): # print(n) # else: # print('=====%s',n) # params = [(n, p) for n, p in list(self.model.parameters()) if n.startswith('bert.model')] if self.args.fp16: if self.cuda and torch.cuda.get_device_capability(0)[0] < 7: print( '| WARNING: your device does NOT support faster training with --fp16, ' 'please switch to FP32 which is likely to be faster') if self.args.memory_efficient_fp16: self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer( self.args, params) else: self._optimizer = optim.FP16Optimizer.build_optimizer( self.args, params) else: if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7: print( '| NOTICE: your device may support faster training with --fp16' ) if self.args.sep_optim: # bert_params = [(n, p) for n, p in list(self.model.named_parameters()) if n.startswith('encoder.bert')] # dec_params = [(n, p) for n, p in list(self.model.named_parameters()) if not n.startswith('encoder.bert')] bert_params = [ p for n, p in list(self.model.named_parameters()) if n.startswith('encoder.bert') ] dec_params = [ p for n, p in list(self.model.named_parameters()) if not n.startswith('encoder.bert') ] self._optimizer = optim.build_optimizer_bert( self.args, bert_params) self._dec_optimizer = optim.build_optimizer_dec( self.args, dec_params) else: self._optimizer = optim.build_optimizer(self.args, params) # We should initialize the learning rate scheduler immediately after # building the optimizer, so that the initial learning rate is set. if self.args.sep_optim: self._lr_scheduler = lr_scheduler.build_lr_scheduler( self.args, self.optimizer) self._dec_lr_scheduler = lr_scheduler.build_lr_scheduler( self.args, self._dec_optimizer, decoder=True) else: self._lr_scheduler = lr_scheduler.build_lr_scheduler( self.args, self.optimizer)
def load_checkpoint(self, filename, load_optim=True):
    """Load all training state from a checkpoint file.

    Restores model weights, and — when ``load_optim`` is set and the saved
    optimizer/criterion classes match the current ones — the optimizer, LR
    scheduler, update counter, and (with --amp) apex AMP master weights.
    Returns the checkpoint's ``extra_state`` dict (or None).
    """
    extra_state, optim_history, last_optim_state = \
        utils.load_model_state(filename, self.get_model())
    if last_optim_state is not None:
        # rebuild optimizer after loading model, since params may have changed
        #self.optimizer = optim.build_optimizer(self.args, self.model.parameters())
        self.lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)
        if load_optim:
            self._optim_history = optim_history
            # only reload optimizer and lr_scheduler if they match
            last_optim = self._optim_history[-1]
            if last_optim['criterion_name'] == self.criterion.__class__.__name__:
                self.lr_scheduler.load_state_dict(last_optim['lr_scheduler_state'])
                if last_optim['optimizer_name'] == self.optimizer.__class__.__name__:
                    self.optimizer.load_state_dict(last_optim_state)
            self._num_updates = last_optim['num_updates']
            if self.args.amp and extra_state is not None and 'amp_state_dict' in extra_state:
                # Apex AMP: force lazy master-weight init so the FP32 master
                # params exist before we overwrite them from the checkpoint.
                self.optimizer.optimizer._lazy_init_maybe_master_weights()
                self.optimizer.optimizer._amp_stash.lazy_init_called = True
                self.optimizer.optimizer.load_state_dict(last_optim_state)
                for param, saved_param in zip(amp.master_params(self.optimizer.optimizer),
                                              extra_state['amp_master_params']):
                    param.data.copy_(saved_param.data)
                amp.load_state_dict(extra_state['amp_state_dict'])
    return extra_state
def __init__(self, args, model, criterion):
    """Set up GPU-only training: model, criterion, optimizer, scheduler, meters."""
    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')

    self.args = args

    # copy model and criterion to current device
    self.model = model.cuda()
    self.criterion = criterion.cuda()

    # initialize optimizer and LR scheduler
    self.optimizer = optim.build_optimizer(self.args, self.model.parameters())
    self.lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)

    # initialize meters
    self.meters = OrderedDict()
    self.meters['train_loss'] = AverageMeter()
    self.meters['train_nll_loss'] = AverageMeter()
    self.meters['valid_loss'] = AverageMeter()
    self.meters['valid_nll_loss'] = AverageMeter()
    self.meters['wps'] = TimeMeter()       # words per second
    self.meters['ups'] = TimeMeter()       # updates per second
    self.meters['wpb'] = AverageMeter()    # words per batch
    self.meters['bsz'] = AverageMeter()    # sentences per batch
    self.meters['gnorm'] = AverageMeter()  # gradient norm
    self.meters['clip'] = AverageMeter()   # % of updates clipped
    self.meters['oom'] = AverageMeter()    # out of memory

    self._max_bsz_seen = 0   # largest batch size seen so far
    self._num_updates = 0    # optimizer steps taken
def __init__(self, args, model):
    """Set up GPU training with optional NVIDIA apex AMP and DistributedDataParallel."""
    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')

    self.args = args
    self.model = model.cuda()
    # Criterion is looked up by name in the registry and moved to GPU.
    self.criterion = CRITERION_REGISTRY[args.criterion](args).cuda()
    self.optimizer = optim.build_optimizer(self.args, self.model.parameters())
    self.lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)

    if args.amp:
        model, optimizer = amp.initialize(
            self.model,
            self.optimizer._optimizer,
            opt_level=self.args.amp_level if self.args.amp_level else 'O2',
            max_loss_scale=2**15,
            cast_model_outputs=torch.float16
        )
        # NOTE(review): the patched `optimizer` returned by amp.initialize is
        # not stored back on self — confirm apex patches the optimizer object
        # in place here.
    if self.args.distributed_world_size > 1:
        self.model = DDP(model)

    self._buffered_stats = defaultdict(lambda: [])
    self._flat_grads = None
    self._num_updates = 0
    self._num_val_iterations = 0
    self._optim_history = None
    self.throughput_meter = TimeMeter()
def load_checkpoint(self, filename):
    """Load all training state from a checkpoint file.

    Restores model weights, then — when saved optimizer state is present —
    rebuilds the optimizer/scheduler and reloads their state if the saved
    class names still match. Returns the checkpoint's extra_state.
    """
    extra_state, self._optim_history, saved_optim_state = utils.load_model_state(
        filename, self.model, cuda_device=torch.cuda.current_device())

    if saved_optim_state is None:
        return extra_state

    # Rebuild optimizer after loading the model, since params may have changed.
    self.optimizer = optim.build_optimizer(self.args, self.model.parameters())
    self.lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self.optimizer)

    # Only reload optimizer / lr_scheduler state when the classes still match.
    history_tail = self._optim_history[-1]
    if history_tail['criterion_name'] == self.criterion.__class__.__name__:
        self.lr_scheduler.load_state_dict(history_tail['lr_scheduler_state'])
        if history_tail['optimizer_name'] == self.optimizer.__class__.__name__:
            self.optimizer.load_state_dict(saved_optim_state)

    self._num_updates = history_tail['num_updates']
    return extra_state
def _build_optimizer(self):
    """Create self._optimizer (FP16 or FP32, optionally BMUF-wrapped) and
    self._lr_scheduler over all trainable model + criterion parameters."""
    trainable = [
        p
        for p in chain(self.model.parameters(), self.criterion.parameters())
        if p.requires_grad
    ]

    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            print('| WARNING: your device does NOT support faster training with --fp16, '
                  'please switch to FP32 which is likely to be faster')
        # Memory-efficient mode keeps FP16 optimizer state; the default keeps
        # an FP32 master copy.
        builder = (optim.MemoryEfficientFP16Optimizer
                   if self.args.memory_efficient_fp16
                   else optim.FP16Optimizer)
        self._optimizer = builder.build_optimizer(self.args, trainable)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            print('| NOTICE: your device may support faster training with --fp16')
        self._optimizer = optim.build_optimizer(self.args, trainable)

    if self.args.use_bmuf:
        self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

    # Build the LR scheduler right away so the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)
    self._lr_scheduler.step_update(0)
def __init__(self, cfg: CompositeOptimizerConfig, params):
    """Build one optimizer (and optional LR scheduler) per parameter group.

    Each parameter's group is read from its ``param_group`` attribute
    (default ``"default"``); the set of groups must match ``cfg.groups``
    exactly. Per-group optimizers are combined into a CompositeOptimizer,
    and per-group schedulers (if all present) into a CompositeLRScheduler.
    """
    super().__init__(cfg)

    assert (
        len(params) > 1
    ), "Composite optimizer only works when there are multiple parameter groups (try fp16_no_flatten_grads: true)"

    # Bucket parameters by their model-assigned group name.
    groupped_params = defaultdict(list)
    for p in params:
        group = getattr(p, "param_group", "default")
        groupped_params[group].append(p)

    assert groupped_params.keys() == cfg.groups.keys(), (
        f"Parameter groups {groupped_params.keys()} and optimizer groups {cfg.groups.keys()} are not the same! "
        "Try setting 'param_group' on your parameters in the model.")

    for group, group_params in groupped_params.items():
        group_cfg = cfg.groups[group]
        # Propagate the group's lr into its optimizer and scheduler configs.
        with open_dict(group_cfg):
            group_cfg.optimizer.lr = group_cfg.lr
            group_cfg.lr_scheduler.lr = group_cfg.lr
        self.optimizers[group] = _build_optimizer(group_cfg.optimizer, group_params)
        if group_cfg.lr_scheduler is not None:
            self.lr_schedulers[group] = build_lr_scheduler(
                group_cfg.lr_scheduler, self.optimizers[group])

    # Either every optimizer has a scheduler, or none do.
    if len(self.lr_schedulers) > 0:
        assert len(self.lr_schedulers) == len(self.optimizers), (
            f"Please provide an lr scheduler for each optimizer to use pass_through scheduler. "
            f"Optimizers: {self.optimizers}; Lr scheds: {self.lr_schedulers}"
        )
        self.lr_scheduler = CompositeLRScheduler(self.lr_schedulers)

    self._optimizer = CompositeOptimizer(self.optimizers)
def build_optimizer(self):
    """Build self.optimizer and self.lr_scheduler over trainable parameters."""
    params = list(
        filter(
            lambda p: p.requires_grad,
            self.model.parameters(),
        ))
    if self.args.fp16:
        # Shrink the FP16 loss-scale window with world size and gradient
        # accumulation so scale adjustments happen at a comparable rate.
        # NOTE(review): this is float division — confirm fp16_scale_window
        # may legitimately be non-integral downstream.
        self.args.fp16_scale_window = 2**14 / self.args.world_size / self.args.gradient_accumulation_steps
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            print(
                '| WARNING: your device does NOT support faster training with --fp16, '
                'please switch to FP32 which is likely to be faster')
        if self.args.memory_efficient_fp16:
            self.optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                self.args, params)
        else:
            self.optimizer = optim.FP16Optimizer.build_optimizer(
                self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            print(
                '| NOTICE: your device may support faster training with --fp16'
            )
        self.optimizer = optim.build_optimizer(self.args, params)

    # Build the LR scheduler immediately so the initial learning rate is set.
    self.lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self.optimizer)
    self.lr_scheduler.step_update(0)
def _build_optimizer(self):
    """Build self._optimizer (FP16 or FP32) and self._lr_scheduler, and
    report the total vs. optimized parameter counts."""
    params = list(
        filter(lambda p: p.requires_grad, self.model.parameters()))
    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            print(
                '| WARNING: your device does NOT support faster training with --fp16, '
                'please switch to FP32 which is likely to be faster')
        if self.args.memory_efficient_fp16:
            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                self.args, params)
        else:
            self._optimizer = optim.FP16Optimizer.build_optimizer(
                self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            print(
                '| NOTICE: your device may support faster training with --fp16'
            )
        self._optimizer = optim.build_optimizer(self.args, params)

    print('| num. model params: {} (num. optimized: {})'.format(
        sum(p.numel() for p in self.model.parameters()),
        sum(p.numel() for p in self._optimizer.params if p.requires_grad),
    ))

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self.optimizer)
def _build_optimizer(self):
    """Build self._optimizer and self._lr_scheduler.

    For the 'adam_cbn' optimizer, Constraint_Lagrangian parameters are
    separated out (params_lag) and handed to the optimizer as a distinct
    group; all other trainable params go in `params`.
    """
    if self.args.optimizer != 'adam_cbn':
        params = list(
            filter(
                lambda p: p.requires_grad,
                chain(self.model.parameters(), self.criterion.parameters()),
            ))
    else:
        # selection
        from fairseq.modules.norms.constraint_bn_v2 import Constraint_Lagrangian
        # Collect ids of all Lagrangian-constraint parameters so they can be
        # separated from the regular trainable parameters.
        constraint_param = []
        for m in self.model.modules():
            if isinstance(m, Constraint_Lagrangian):
                constraint_param.extend(list(map(id, m.parameters())))
        params_lag = list(
            filter(lambda p: id(p) in constraint_param,
                   chain(self.model.parameters())))
        params = list(
            filter(
                lambda p: id(p) not in constraint_param and p.requires_grad,
                chain(self.model.parameters(), self.criterion.parameters())))
    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            print(
                '| WARNING: your device does NOT support faster training with --fp16, '
                'please switch to FP32 which is likely to be faster')
        if self.args.memory_efficient_fp16:
            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                self.args, params)
        else:
            self._optimizer = optim.FP16Optimizer.build_optimizer(
                self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            print(
                '| NOTICE: your device may support faster training with --fp16'
            )
        # check cbn
        if self.args.optimizer != 'adam_cbn':
            self._optimizer = optim.build_optimizer(self.args, params)
        else:
            # adam_cbn takes the Lagrangian params as an extra argument.
            self._optimizer = optim.build_optimizer(
                self.args, params, params_lag)
    if self.args.use_bmuf:
        self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self.optimizer)
    self._lr_scheduler.step_update(0)
def _build_optimizer(self):
    """Build self._optimizer and self._lr_scheduler from the structured cfg.

    Handles fp16/bf16 (regular and memory-efficient) optimizer wrappers,
    optional BMUF wrapping, and optimizer-state sharding (ZeRO stage 1).
    """
    params = list(
        filter(
            lambda p: p.requires_grad,
            chain(self.model.parameters(), self.criterion.parameters()),
        )
    )

    if self.cfg.common.fp16 or self.cfg.common.bf16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            logger.info(
                "NOTE: your device does NOT support faster training with --fp16, "
                "please switch to FP32 which is likely to be faster"
            )
        if (
            self.cfg.common.memory_efficient_fp16
            or self.cfg.common.memory_efficient_bf16
        ):
            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                self.cfg, params
            )
        else:
            self._optimizer = optim.FP16Optimizer.build_optimizer(self.cfg, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            logger.info("NOTE: your device may support faster training with --fp16")
        self._optimizer = optim.build_optimizer(self.cfg.optimizer, params)

    if self.cfg.optimization.use_bmuf:
        self._optimizer = optim.FairseqBMUF(
            self.cfg.bmuf,
            self._optimizer,
        )

    if self.cfg.distributed_training.zero_sharding == "os":
        # Optimizer-state sharding cannot work with flattened fp16 grads.
        if (
            self.cfg.common.fp16
            and not self.cfg.common.memory_efficient_fp16
            and not self.cfg.common.memory_efficient_bf16
        ) and not self.cfg.common.fp16_no_flatten_grads:
            # FIX: corrected misspelled error message ("incomptabile").
            raise ValueError(
                "ZeRO is incompatible with fp16 and flattened grads. "
                "Please use --fp16-no-flatten-grads"
            )
        else:
            optim.shard_(self._optimizer, self.data_parallel_process_group)

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.cfg.lr_scheduler,
        self.optimizer,
    )
    self._lr_scheduler.step_update(0)
def _build_optimizer(self):
    """Build self._optimizer and self._lr_scheduler, grouping parameters by
    their Riemannian manifold so each group gets an lr_rectifier scale."""
    # params = list(
    #     filter(
    #         lambda p: p.requires_grad,
    #         chain(self.model.parameters(), self.criterion.parameters()),
    #     )
    # )
    params_dict = {}
    # Parameters without an explicit manifold are treated as Euclidean.
    _default_manifold = Euclidean()
    for name, p in chain(self.model.named_parameters(),
                         self.criterion.named_parameters()):
        if not p.requires_grad:
            continue
        if isinstance(p, (ManifoldParameter, ManifoldTensor)):
            _manifold = p.manifold
        else:
            _manifold = _default_manifold
        _manifold_name = _manifold.__class__.__name__
        if not _manifold_name in params_dict:
            # Reference Riemannian gradient magnitude for a unit Euclidean
            # gradient at the origin; its reciprocal rescales the lr.
            ref_grad = _manifold.egrad2rgrad(p.new_zeros(1), p.new_ones(1))
            # NOTE(review): both branches yield 1, so coef is always 1 —
            # looks like a leftover; confirm the intended alternative value.
            coef = 1 if ref_grad == 1 else 1
            #print(f"lr={self.args.lr}, ref={ref_grad.item()}")
            params_dict[_manifold_name] = dict(
                params=[],
                lr_rectifier=ref_grad.reciprocal().item() * coef)
        params_dict[_manifold_name]['params'].append(p)
    params = params_dict.values()
    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            logger.info(
                "NOTE: your device does NOT support faster training with --fp16, "
                "please switch to FP32 which is likely to be faster")
        if self.args.memory_efficient_fp16:
            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                self.args, params)
        else:
            self._optimizer = optim.FP16Optimizer.build_optimizer(
                self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            logger.info(
                "NOTE: your device may support faster training with --fp16"
            )
        self._optimizer = optim.build_optimizer(self.args, params)
    if self.args.use_bmuf:
        self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self.optimizer)
    self._lr_scheduler.step_update(0)
def _build_optimizer(self):
    """Build self._optimizer and self._lr_scheduler.

    With ``self.ulmfit`` enabled, a per-parameter LR multiplier map is
    computed (ULMFiT-style discriminative fine-tuning): the LM head gets
    multiplier 1, encoder layer ``k`` gets decay_rate_lrc**-(num_layers-k),
    and the remaining (embedding/first) params the smallest multiplier.
    NOTE(review): multiplier_map is currently not passed anywhere — confirm
    whether the optimizer is supposed to consume it.
    """
    if self.ulmfit:
        params = []
        multiplier_map = []
        for n, p in self.model.named_parameters():
            if p.requires_grad:
                params.append(p)
                param_name_split = n.split('.')
                if param_name_split[2] == 'lm_head':
                    # last layer
                    multiplier = 1.
                elif param_name_split[4].isdigit():
                    # encoder layer
                    layer = int(param_name_split[4])
                    multiplier = self.decay_rate_lrc**-(self.num_layers - layer)
                else:
                    # first layer
                    multiplier = self.decay_rate_lrc**-(self.num_layers + 1)
                multiplier_map.append(multiplier)
    else:
        params = list(
            filter(lambda p: p.requires_grad, self.model.parameters()))
    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            print(
                '| WARNING: your device does NOT support faster training with --fp16, '
                'please switch to FP32 which is likely to be faster')
        if self.args.memory_efficient_fp16:
            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                self.args, params)
        else:
            self._optimizer = optim.FP16Optimizer.build_optimizer(
                self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            print(
                '| NOTICE: your device may support faster training with --fp16'
            )
        self._optimizer = optim.build_optimizer(self.args, params)
    if self.args.use_bmuf:
        # BUG FIX: FairseqBMUF takes (args, optimizer); the stray `params`
        # argument made this call inconsistent with the other trainers in
        # this codebase and would fail against the standard signature.
        self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self.optimizer)
    self._lr_scheduler.step_update(0)
def _build_optimizer(self):
    """Create self._optimizer (FP16 or FP32) and self.lr_scheduler."""
    compute_major = torch.cuda.get_device_capability(0)[0]

    if self.args.fp16:
        if compute_major < 7:
            print('| WARNING: your device does NOT support faster training with --fp16, '
                  'please switch to FP32 which is likely to be faster')
        # FP16 training only optimizes parameters that require gradients.
        trainable = [p for p in self.model.parameters() if p.requires_grad]
        self._optimizer = optim.FP16Optimizer.build_optimizer(self.args, trainable)
    else:
        if compute_major >= 7:
            print('| NOTICE: your device may support faster training with --fp16')
        self._optimizer = optim.build_optimizer(self.args, self.model.parameters())

    self.lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self._optimizer)
def _build_optimizer(self):
    """Build self._optimizer and self._lr_scheduler.

    With ``self.freeze_bart`` set, all encoder parameters are frozen except
    an explicit allow-list of structure-attention / projection parameters,
    which remain trainable.
    """
    if self.freeze_bart:
        for name, param in self.model.named_parameters():
            # Freeze every encoder parameter that is not in the allow-list
            # of structure-attention components.
            if name.startswith('encoder') and name not in [
                    "encoder.structure_att.exparam",
                    "encoder.structure_att.tp_linear.weight",
                    "encoder.structure_att.tp_linear.bias",
                    "encoder.structure_att.tc_linear.weight",
                    "encoder.structure_att.tc_linear.bias",
                    "encoder.structure_att.fi_linear.weight",
                    "encoder.structure_att.bilinear._weight_matrix",
                    "encoder.structure_att.bilinear._bias",
                    "encoder.structure_att.fzlinear.weight",
                    "encoder.structure_att.fzlinear.bias",
                    "encoder.str_to_enc_linear.weight",
                    "encoder.str_to_enc_linear.bias"
            ]:
                param.requires_grad = False
                print("Freezing parameters")
    params = list(
        filter(
            lambda p: p.requires_grad,
            chain(self.model.parameters(), self.criterion.parameters()),
        ))
    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            logger.info(
                "NOTE: your device does NOT support faster training with --fp16, "
                "please switch to FP32 which is likely to be faster")
        if self.args.memory_efficient_fp16:
            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                self.args, params)
        else:
            self._optimizer = optim.FP16Optimizer.build_optimizer(
                self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            logger.info(
                "NOTE: your device may support faster training with --fp16"
            )
        self._optimizer = optim.build_optimizer(self.args, params)
    if self.args.use_bmuf:
        self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self.optimizer)
    self._lr_scheduler.step_update(0)
def _build_optimizer(self):
    """Build self._optimizer and self._lr_scheduler.

    When the model config exposes ``encoder_layers``, parameters are grouped
    with layer-wise decayed learning rates / weight decay via
    get_decayed_param_groups; otherwise all trainable parameters are
    optimized uniformly.
    """
    from itertools import chain
    if hasattr(self.args, 'encoder_layers'):
        # Layer-wise LR decay: deeper layers get lr * lr_decay**depth, with
        # optional freezing of encoder/embedding and weight-decay exclusions.
        params = get_decayed_param_groups(
            chain(self.model.named_parameters(),
                  self.criterion.named_parameters()),
            num_layers=self.args.encoder_layers,
            weight_decay=self.args.weight_decay,
            weight_decay_exclude=self.args.weight_decay_exclude,
            freeze_encoder=self.args.freeze_encoder,
            freeze_embedding=self.args.freeze_embedding,
            lr=float(self.args.lr[0]),
            lr_decay=float(self.args.lr_decay),
        )
    else:
        params = list(
            filter(
                lambda p: p.requires_grad,
                chain(self.model.parameters(), self.criterion.parameters()),
            ))
    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            logger.info(
                "NOTE: your device does NOT support faster training with --fp16, "
                "please switch to FP32 which is likely to be faster")
        if self.args.memory_efficient_fp16:
            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                self.args, params)
        else:
            self._optimizer = optim.FP16Optimizer.build_optimizer(
                self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            logger.info(
                "NOTE: your device may support faster training with --fp16"
            )
        self._optimizer = optim.build_optimizer(self.args, params)
    if self.args.use_bmuf:
        self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self.optimizer)
    self._lr_scheduler.step_update(0)
def _build_optimizer(self):
    """Create one flat FP32 master copy of all trainable params and build
    the optimizer over that single flat parameter (classic FP16 training)."""
    # create FP32 copy of parameters and grads
    params = [p for p in self.model.parameters() if p.requires_grad]
    total_param_size = sum(p.data.numel() for p in params)
    # new(0).float().new(size): uninitialized FP32 tensor on the same device
    # as the model parameters.
    self.fp32_params = params[0].new(0).float().new(total_param_size)
    # Copy each parameter, flattened, into its slice of the master tensor.
    offset = 0
    for p in params:
        numel = p.data.numel()
        self.fp32_params[offset:offset+numel].copy_(p.data.view(-1))
        offset += numel
    self.fp32_params = torch.nn.Parameter(self.fp32_params)
    # Pre-allocate a flat gradient buffer of the same size.
    self.fp32_params.grad = self.fp32_params.data.new(total_param_size)

    # create optimizer using the copied FP32 params
    self._optimizer = optim.build_optimizer(self.args, [self.fp32_params])
    self.lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)
def _build_optimizer(self):
    """Build one optimizer + LR scheduler per task parameter group.

    Learning rates come from ``--lr-list`` (comma separated, one per group)
    when given, otherwise every group uses ``args.lr[0]``. Appends to
    self._optimizers / self._lr_schedulers and selects the current one.
    """
    param_groups = self.task.get_task_params(self.model, self.criterion)
    if (not hasattr(self.args, "lr_list")) or (self.args.lr_list is None):
        lr_list = [self.args.lr[0] for _ in param_groups]
    else:
        lr_list = [
            float(lr.strip()) for lr in self.args.lr_list.split(",")
        ]
    for params, curr_lr in zip(param_groups, lr_list):
        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                print(
                    '| WARNING: your device does NOT support faster training with --fp16, '
                    'please switch to FP32 which is likely to be faster')
            if self.args.memory_efficient_fp16:
                optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                print(
                    '| NOTICE: your device may support faster training with --fp16'
                )
            optimizer = optim.build_optimizer(self.args, params)
        if self.args.use_bmuf:
            # BUG FIX: wrap the optimizer just built for THIS group; the old
            # code wrapped self._optimizer, which belongs to a different (or
            # not-yet-existing) group.
            optimizer = optim.FairseqBMUF(self.args, optimizer)
        self._optimizers.append(optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        # Temporarily point args.lr at this group's LR while building.
        self.args.lr = [curr_lr]
        lrs = lr_scheduler.build_lr_scheduler(self.args, optimizer)
        lrs.step_update(0)
        self._lr_schedulers.append(lrs)
    self.args.lr = None
    self.set_current_optimizer()
def _build_optimizer(self):
    """Build two optimizers — one for the NMT model, one for the adversarial
    classifier — exposed via the self._optimizer dict, plus an LR scheduler
    driven by the NMT optimizer."""
    params_nmt = self.model.get_nmt_parameters()
    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            print(
                '| WARNING: your device does NOT support faster training with --fp16, '
                'please switch to FP32 which is likely to be faster')
        if self.args.memory_efficient_fp16:
            # Memory-efficient FP16 is not supported for the dual-optimizer setup.
            raise NotImplementedError
        else:
            self._optimizer_nmt = optim.FP16Optimizer.build_optimizer(
                self.args, params_nmt)
            self._optimizer_adv = optim.FP16Optimizer.build_optimizer(
                self.args, self.model.get_adv_parameters())
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            print(
                '| NOTICE: your device may support faster training with --fp16'
            )
        self._optimizer_nmt = optim.build_optimizer(self.args, params_nmt)
        self._optimizer_adv = optim.build_optimizer(
            self.args, self.model.get_adv_parameters())
    self._optimizer = {}
    self._optimizer['nmt'] = self._optimizer_nmt
    self._optimizer['adv'] = self._optimizer_adv
    # NOTE(review): the generator below shadows the `optim` module name with
    # its loop variable (scoped to the genexp only) — harmless but confusing.
    print(
        '| num. model params: {} (num. optimized: {} ( nmt: {}, adv classifier: {} ) )'
        .format(
            sum(p.numel() for p in self.model.parameters()),
            sum(p.numel() for optim in self._optimizer.values()
                for p in optim.params if p.requires_grad),
            sum(p.numel() for p in self._optimizer['nmt'].params
                if p.requires_grad),
            sum(p.numel() for p in self._optimizer['adv'].params
                if p.requires_grad)))

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self.optimizer['nmt'])
def __init__(self, args, model):
    """Set up GPU training with native-AMP gradient scaling and optional DDP."""
    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')

    self.args = args
    self.model = model.cuda()
    # Criterion is looked up by name in the registry and moved to GPU.
    self.criterion = CRITERION_REGISTRY[args.criterion](args).cuda()
    self.optimizer = optim.build_optimizer(self.args, self.model.parameters())
    self.lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self.optimizer)
    # GradScaler is a no-op when --amp is off.
    self.scaler = amp.GradScaler(enabled=self.args.amp, init_scale=2**15)

    if self.args.distributed_world_size > 1:
        self.model = DDP(model)

    self._buffered_stats = defaultdict(lambda: [])
    self._num_updates = 0
    self._optim_history = None
    self.throughput_meter = TimeMeter()
    self.avg_loss_meter = AverageMeter()
def _build_optimizer(self):
    """Build self._optimizer and self._lr_scheduler over all trainable
    model + criterion parameters."""
    # chain(): accepts several iterables and yields from each in turn, so
    # model and criterion parameters are filtered together. The parameter
    # lists come from nn.Module.parameters(), which recursively walks every
    # submodule and its torch.nn.Parameter objects (registered by type at
    # attribute-assignment time).
    params = list(
        filter(
            lambda p: p.requires_grad,
            chain(self.model.parameters(), self.criterion.parameters()),
        )
    )
    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            print(
                "| WARNING: your device does NOT support faster training with --fp16, "
                "please switch to FP32 which is likely to be faster")
        if self.args.memory_efficient_fp16:
            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                self.args, params)
        else:
            self._optimizer = optim.FP16Optimizer.build_optimizer(
                self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            print(
                "| NOTICE: your device may support faster training with --fp16"
            )
        # optim/__init__.py's build_optimizer dispatches on args.optimizer to
        # the chosen optimizer's constructor (e.g. 'adam' -> FairseqAdam).
        self._optimizer = optim.build_optimizer(self.args, params)
    if self.args.use_bmuf:
        self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self.optimizer)
    self._lr_scheduler.step_update(0)
def _build_optimizer(self):
    """Build self._optimizer and self._lr_scheduler with four parameter
    groups: (weight-decayed / no-decay) x (encoder-side / decoder-side).

    Names containing 'bert' or 'embedding_token' are treated as the
    encoder side and scaled by ``encoder_lr_scale``; everything else is the
    decoder side scaled by ``decoder_lr_scale``. Bias/LayerNorm-style
    parameters (the ``no_decay`` list) get zero weight decay.
    """
    # params = list(filter(lambda p: p.requires_grad, self.model.parameters()))
    model = self.model
    # Parameter names containing any of these substrings get no weight decay.
    no_decay = [
        'bias', 'gamma', 'beta', 'LayerNorm.bias', 'LayerNorm.weight'
    ]
    # NOTE(review): no requires_grad filter here — frozen parameters (if
    # any) would still be handed to the optimizer; confirm all parameters
    # are trainable.
    params = [
        {
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay) and (
                    'bert' in n or 'embedding_token' in n)
            ],
            'weight_decay': 0.01,
            'lr_scale': self.args.encoder_lr_scale
        },
        {
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay) and (
                    'bert' not in n and 'embedding_token' not in n)
            ],
            'weight_decay': 0.01,
            'lr_scale': self.args.decoder_lr_scale
        },
        {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay) and (
                    'bert' in n or 'embedding_token' in n)
            ],
            'weight_decay': 0.0,
            'lr_scale': self.args.encoder_lr_scale
        },
        {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay) and (
                    'bert' not in n and 'embedding_token' not in n)
            ],
            'weight_decay': 0.0,
            'lr_scale': self.args.decoder_lr_scale
        },
    ]
    # params = [{"params":[p for n, p in model.named_parameters()], "lr_scale":1}]
    # params = [p for n, p in model.named_parameters() if p.require_grad]
    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            print(
                '| WARNING: your device does NOT support faster training with --fp16, '
                'please switch to FP32 which is likely to be faster')
        if self.args.memory_efficient_fp16:
            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                self.args, params)
        else:
            self._optimizer = optim.FP16Optimizer.build_optimizer(
                self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            print(
                '| NOTICE: your device may support faster training with --fp16'
            )
        self._optimizer = optim.build_optimizer(self.args, params)

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self.optimizer)
def build_model(self, args):
    """Build the multilingual model plus back-translation machinery.

    Temporarily appends the reversed language pairs so the FairseqMultiModel
    gets a sub-model for each back-translation direction, optionally sets up
    a separate optimizer/scheduler for BT data selection (--bt-dds), and
    creates one SequenceGenerator + backtranslate_fn per reversed pair.
    """
    from fairseq import models
    # Temporarily add reversed lang pairs so the multi-model also builds the
    # back-translation direction, then restore the original list.
    copied_lang_pairs = [p for p in self.lang_pairs]
    for lang_pair in copied_lang_pairs:
        src, tgt = lang_pair.split('-')
        key = '{}-{}'.format(tgt, src)
        self.lang_pairs.append(key)
    model = models.build_model(args, self)
    self.lang_pairs = copied_lang_pairs
    if not isinstance(model, FairseqMultiModel):
        raise ValueError(
            'SemisupervisedTranslationTask requires a FairseqMultiModel architecture'
        )
    if self.args.bt_dds:
        # set up data actor finetune optimizer
        bt_params = []
        for lang_pair in self.lang_pairs:
            bt_lang_pair = _get_dds_bt_key(lang_pair)
            for p in model.models[bt_lang_pair].parameters():
                if p.requires_grad:
                    bt_params.append(p)
        if self.args.bt_optimizer == "SGD":
            self.data_optimizer = torch.optim.SGD(
                bt_params,
                lr=self.args.data_actor_lr[0],
                momentum=self.args.bt_optimizer_momentum,
                nesterov=self.args.bt_optimizer_nesterov)
            #t_optim = self.args.optimizer
            #self.args.optimizer = "data_sgd"
            #self.data_optimizer = build_optimizer(self.args, bt_params)
            #self.args.optimizer = t_optim
        elif self.args.bt_optimizer == "ASGD":
            self.data_optimizer = torch.optim.ASGD(
                bt_params, lr=self.args.data_actor_lr[0])
        if self.args.data_lr_scheduler is not None:
            print("Building lr scheduler {} for BT model...".format(
                self.args.data_lr_scheduler))
            # Swap in the BT scheduler name, build, then restore the original.
            t_scheduler = self.args.lr_scheduler
            self.args.lr_scheduler = self.args.data_lr_scheduler
            self.data_lr_scheduler = lr_scheduler.build_lr_scheduler(
                self.args, self.data_optimizer)
            self.data_lr_scheduler.step_update(0)
            self.args.lr_scheduler = t_scheduler
        else:
            self.data_lr_scheduler = None
        self.step = 0
    # create SequenceGenerator for each model that has backtranslation dependency on it
    self.sequence_generators = {}
    #if (self.lambda_otf_bt > 0.0 or self.lambda_otf_bt_steps is not None) and self.training:
    if self.training:
        for lang_pair in self.lang_pairs:
            src, tgt = lang_pair.split('-')
            key = '{}-{}'.format(tgt, src)
            self.sequence_generators[key] = SequenceGenerator(
                tgt_dict=self.dicts[src],
                beam_size=args.bt_beam_size,
                max_len_a=args.bt_max_len_a,
                max_len_b=args.bt_max_len_b,
                sampling=self.args.sampling,
                sampling_topk=self.args.sampling_topk,
                temperature=self.args.temperature,
            )
            decoder_lang_tok_idx = self.get_decoder_langtok(src)

            # Default arguments bind the per-pair model/generator/bos at
            # definition time (avoids the late-binding closure pitfall).
            def backtranslate_fn(
                sample,
                model=model.models[key],
                bos_token=decoder_lang_tok_idx,
                sequence_generator=self.sequence_generators[key],
            ):
                return sequence_generator.generate(
                    [model],
                    sample,
                    bos_token=bos_token,
                )

            self.backtranslators[lang_pair] = backtranslate_fn
    return model
def _build_optimizer(self):
    """Build the optimizer and LR scheduler over all trainable parameters.

    When ``--multiple-lr`` is set, parameters are split into three groups
    (roberta encoder / non-roberta encoder / decoder) and one optimizer is
    built per group, wrapped in a ``ConcatOptimizer`` so each group can
    follow its own learning rate under ``multi_lr_inverse_sqrt``.
    """
    params = list(
        filter(
            lambda p: p.requires_grad,
            chain(self.model.parameters(), self.criterion.parameters()),
        )
    )
    if getattr(self.args, 'multiple_lr', False):
        assert self.args.lr_scheduler == 'multi_lr_inverse_sqrt', 'only multi_lr_inverse_sqrt supports multiple_lr now'
        assert len(self.args.lr) == 3, 'Three learning rates for roberta, sents_encoder and decoder should be provided'
        named_params = [(n, p) for n, p in self.model.named_parameters() if p.requires_grad]
        encoder_params = [(n, p) for n, p in self.model.encoder.named_parameters() if p.requires_grad]
        decoder_params = [(n, p) for n, p in self.model.decoder.named_parameters() if p.requires_grad]
        # decoder_perm (when present) is optimized with the decoder group.
        if hasattr(self.model, 'decoder_perm'):
            decoder_params += [(n, p) for n, p in self.model.decoder_perm.named_parameters() if p.requires_grad]
        # params = [
        #     {'params': [p for n, p in named_params if 'roberta' in n and (n.startswith('encoder') or n.startswith('module.encoder'))]},
        #     {'params': [p for n, p in named_params if not 'roberta' in n and (n.startswith('encoder') or n.startswith('module.encoder'))]},
        #     {'params': [p for n, p in named_params if n.startswith('decoder') or n.startswith('module.decoder')]}
        # ]
        params = [
            {'params': [p for n, p in encoder_params if 'roberta' in n]},
            {'params': [p for n, p in encoder_params if not 'roberta' in n]},
            {'params': [p for n, p in decoder_params]}
        ]
        # Sanity check: the three groups must exactly partition all
        # trainable parameters (no overlap, nothing missing).
        assert len(named_params) == sum([len(p['params']) for p in params]), named_params
    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            # Compute capability < 7 (pre-Volta) has no tensor cores.
            print(
                "| WARNING: your device does NOT support faster training with --fp16, "
                "please switch to FP32 which is likely to be faster"
            )
        if self.args.memory_efficient_fp16:
            if getattr(self.args, 'multiple_lr', False):
                # One memory-efficient FP16 optimizer per parameter group.
                self._optimizer = optim.ConcatOptimizer(self.args, [
                    optim.MemoryEfficientFP16Optimizer.build_optimizer(self.args, param['params'])
                    for param in params])
            else:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params
                )
        else:
            if getattr(self.args, 'multiple_lr', False):
                self._optimizer = optim.ConcatOptimizer(
                    self.args,
                    [optim.FP16Optimizer.build_optimizer(self.args, param['params']) for param in params])
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            print("| NOTICE: your device may support faster training with --fp16")
        if getattr(self.args, 'multiple_lr', False):
            self._optimizer = optim.ConcatOptimizer(
                self.args,
                [optim.build_optimizer(self.args, param['params']) for param in params])
        else:
            self._optimizer = optim.build_optimizer(self.args, params)

    if self.args.use_bmuf:
        # Block-wise model update filtering wraps the base optimizer.
        self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)
    self._lr_scheduler.step_update(0)
def _build_optimizer(self):
    """Create the optimizer over all model parameters, then its LR scheduler."""
    built_optimizer = optim.build_optimizer(self.args, self.model.parameters())
    self._optimizer = built_optimizer
    self.lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, built_optimizer)
def _build_optimizer(self):
    """Build the optimizer and LR scheduler with two parameter groups.

    Group 1 ("base") holds the ordinary model parameters; group 2 ("new")
    holds the section-attention additions (section position embeddings,
    layer norms, attention projection and context vectors), so the LR
    scheduler can treat the newly added modules differently.

    Fixes vs. previous version: uses ``logger.info`` consistently instead of
    mixing bare ``print`` calls, replaces the O(n*m) ``id`` list-membership
    test with a set, and removes large blocks of dead commented-out code.
    """
    # Identify the newly added section modules by parameter identity so the
    # base group can be computed as "everything else".  A set makes the
    # per-parameter membership test O(1).
    section_modules = (
        self.model.section_positions,
        self.model.section_layernorm_embedding,
        self.model.section,
        self.model.w_proj,
        self.model.w_context_vector,
        self.model.w_proj_layer_norm,
    )
    new_params_id = {id(p) for m in section_modules for p in m.parameters()}

    base_params = [
        p for p in self.model.parameters()
        if id(p) not in new_params_id and p.requires_grad
    ]
    logger.info("group1: %d", len(base_params))
    new_params = [
        p for p in self.model.parameters()
        if id(p) in new_params_id and p.requires_grad
    ]
    logger.info("group2: %d", len(new_params))

    params = [
        {"params": base_params},
        {"params": new_params},
    ]  # "weight_decay": 0.01
    # params2 would be populated by the (currently disabled) --balance path;
    # it stays None, so _optimizer2 below is never built.
    params2 = None

    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            # Compute capability < 7 (pre-Volta) has no tensor cores.
            logger.info(
                "NOTE: your device does NOT support faster training with --fp16, "
                "please switch to FP32 which is likely to be faster")
        if self.args.memory_efficient_fp16:
            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                self.args, params)
        else:
            self._optimizer = optim.FP16Optimizer.build_optimizer(
                self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            logger.info(
                "NOTE: your device may support faster training with --fp16"
            )
        self._optimizer = optim.build_optimizer(self.args, params)

    self._optimizer2 = None
    if self.args.balance and params2 is not None:
        self._optimizer2 = optim.build_optimizer(self.args, params2)

    if self.args.use_bmuf:
        # Block-wise model update filtering wraps the base optimizer.
        self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self.optimizer, self._optimizer2)
    self._lr_scheduler.step_update(0)
def _build_optimizer(self):
    # Build the optimizer and its LR scheduler.
    # NOTE(review): this passes the model object itself, not
    # self.model.parameters() as the sibling implementations do — confirm
    # that this project's optim.build_optimizer variant accepts a model.
    self._optimizer = optim.build_optimizer(self.args, self.model)
    self.lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self._optimizer)
def _build_optimizer(self):
    """Instantiate the optimizer over all model parameters and the
    learning-rate scheduler that drives it."""
    opt = optim.build_optimizer(self.args, self.model.parameters())
    self._optimizer = opt
    self.lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, opt)
def main():
    """Entry point: set up a pipeline-parallel fairseq translation run.

    Infers per-stage tensor shapes/dtypes from a dummy batch, constructs the
    StageRuntime for this rank, optionally resumes from a checkpoint, then
    trains for the configured number of epochs, checkpointing per stage.
    """
    global args, best_prec1
    args = parser.parse_args()
    args.data = args.data_dir
    # Pin this process to the GPU matching its local rank.
    os.environ["CUDA_VISIBLE_DEVICES"] = f"{args.local_rank}"

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load dataset splits
    load_dataset_splits(task, ['train', 'valid'])

    # Build criterion
    criterion = task.build_criterion(args)

    # create stages of the model
    module = importlib.import_module(args.module)
    args.arch = module.arch()
    model = module.model(criterion)

    # Run a synthetic forward pass through every stage to discover the
    # shape and dtype of each inter-stage tensor.
    max_positions = (args.max_source_positions, args.max_target_positions)
    dummy_batch = task.dataset('train').get_dummy_batch(
        args.max_tokens, max_positions)
    inputs = dummy_batch['net_input']
    input0 = inputs['src_tokens']
    input1 = inputs['prev_output_tokens']
    target = dummy_batch['target']
    training_tensor_shapes = {
        "input0": list(input0.size()),
        "input1": list(input1.size()),
        "target": list(target.size()),
        "ntokens": [1]
    }
    dtypes = {
        "input0": input0.dtype,
        "input1": input1.dtype,
        "target": target.dtype,
        "ntokens": torch.float32
    }
    inputs_module_destinations = {"input0": 0, "input1": 0}
    target_tensor_names = {"target", "ntokens"}
    for module_id, (stage, inputs, outputs) in enumerate(
            model[:-1]):  # Skip last layer (loss).
        input_tensors = []
        for module_input in inputs:
            if module_input in inputs_module_destinations:
                # Record which stage actually consumes this model input.
                inputs_module_destinations[module_input] = module_id
            input_tensor = torch.ones(tuple(
                training_tensor_shapes[module_input]),
                                      dtype=dtypes[module_input]).cuda()
            input_tensors.append(input_tensor)
        stage.cuda()
        # PyTorch should not maintain metadata for a backward pass on
        # synthetic inputs. Without the following line, the runtime is
        # as much as 1.5x slower in a full DP configuration.
        with torch.no_grad():
            output_tensors = stage(*tuple(input_tensors))
        if not type(output_tensors) is tuple:
            output_tensors = [output_tensors]
        for output, output_tensor in zip(outputs, list(output_tensors)):
            training_tensor_shapes[output] = list(output_tensor.size())
            dtypes[output] = output_tensor.dtype
    # Freeze shapes as tuples; eval shapes mirror the training ones.
    eval_tensor_shapes = {}
    for key in training_tensor_shapes:
        eval_tensor_shapes[key] = tuple(training_tensor_shapes[key])
        training_tensor_shapes[key] = tuple(training_tensor_shapes[key])

    # Optional JSON config mapping modules to stages and stages to ranks.
    configuration_maps = {
        'module_to_stage_map': None,
        'stage_to_rank_map': None,
        'stage_to_depth_map': None
    }
    if args.config_path is not None:
        json_config_file = json.load(open(args.config_path, 'r'))
        configuration_maps['module_to_stage_map'] = json_config_file.get(
            "module_to_stage_map", None)
        configuration_maps['stage_to_rank_map'] = json_config_file.get(
            "stage_to_rank_map", None)
        # JSON keys are strings; stage ids are ints.
        configuration_maps['stage_to_rank_map'] = {
            int(k): v
            for (k, v) in configuration_maps['stage_to_rank_map'].items()
        }
        configuration_maps['stage_to_depth_map'] = json_config_file.get(
            "stage_to_depth_map", None)

    r = runtime.StageRuntime(
        model=model,
        distributed_backend=args.distributed_backend,
        fp16=args.fp16,
        loss_scale=args.loss_scale,
        training_tensor_shapes=training_tensor_shapes,
        eval_tensor_shapes=eval_tensor_shapes,
        training_tensor_dtypes=dtypes,
        inputs_module_destinations=inputs_module_destinations,
        target_tensor_names=target_tensor_names,
        configuration_maps=configuration_maps,
        master_addr=args.master_addr,
        rank=args.rank,
        local_rank=args.local_rank,
        num_ranks_in_server=args.num_ranks_in_server,
        verbose_freq=args.verbose_frequency,
        model_type=runtime.TRANSLATION,
        enable_recompute=args.recompute)

    # stage needed to determine if current stage is the first stage
    # num_stages needed to determine if current stage is the last stage
    # num_ranks needed to determine number of warmup_minibatches in case of pipelining
    args.stage = r.stage
    args.num_stages = r.num_stages
    args.num_ranks = r.num_ranks
    if not is_first_stage():
        args.synthetic_data = True

    # define optimizer
    if args.no_input_pipelining:
        num_versions = 1
    else:
        # number of versions is the total number of machines following the current
        # stage, shared amongst all replicas in this stage
        num_versions = r.num_warmup_minibatches + 1
    # NOTE(review): num_versions is computed but not used in this function's
    # visible span — confirm whether it should be passed to the optimizer.

    # if specified, resume from checkpoint
    if args.resume:
        checkpoint_file_path = os.path.join(
            args.checkpoint_dir,
            f"checkpoint.{r.stage}.pth.tar.epoch.{args.start_epoch}")
        assert os.path.isfile(checkpoint_file_path)
        print("=> loading checkpoint '{}'".format(checkpoint_file_path))
        checkpoint = torch.load(checkpoint_file_path)
        args.start_epoch = checkpoint['epoch']
        best_prec1 = checkpoint['best_prec1']
        r.load_state_dict(checkpoint['state_dict'])
        print("=> loaded checkpoint '{}' (epoch {})".format(
            checkpoint_file_path, checkpoint['epoch']))

    # TODO: make this configurable by args
    use_adam_optimizer = True
    if use_adam_optimizer:
        optimizer = adam.Adam(r.master_parameters,
                              lr=args.lr,
                              betas=(0.9, 0.98),
                              weight_decay=args.weight_decay)
    else:
        optimizer = sgd.SGD(r.master_parameters,
                            lr=args.lr,
                            momentum=args.momentum,
                            weight_decay=args.weight_decay)
    scheduler = lr_scheduler.build_lr_scheduler(args, optimizer)
    if args.resume:
        # Restore optimizer state from the checkpoint loaded above.
        optimizer.load_state_dict(checkpoint['optimizer'])

    cudnn.benchmark = True

    # epoch_itr = data.EpochBatchIterator(
    #     dataset=task.dataset(args.train_subset),
    #     max_tokens=args.max_tokens,
    #     max_sentences=args.max_sentences_valid,
    #     max_positions=max_positions,
    #     ignore_invalid_inputs=True,
    #     required_batch_size_multiple=8,
    #     seed=1,
    #     num_shards=1,
    #     shard_id=0,
    # )
    def epoch_itr():
        # Synthetic data source: the same dummy batch every call.
        return task.dataset('train').get_dummy_batch(args.max_tokens,
                                                     max_positions)

    distributed_sampler = False
    if configuration_maps['stage_to_rank_map'] is not None:
        num_ranks_in_first_stage = len(
            configuration_maps['stage_to_rank_map'][0])
        if num_ranks_in_first_stage > 1:
            distributed_sampler = True

    for epoch in range(args.start_epoch, args.epochs):
        if distributed_sampler:
            # NOTE(review): train_loader is not defined anywhere in this
            # function — this raises NameError if reached; presumably a
            # leftover from the DataLoader-based variant. TODO confirm.
            train_loader.sampler.set_epoch(epoch)
        # train or run forward pass only for one epoch
        if args.forward_only:
            # NOTE(review): val_loader is likewise undefined here — confirm.
            validate(val_loader, r, epoch)
        else:
            train(epoch_itr, r, optimizer, epoch, scheduler)

            # evaluate on validation set
            # prec1 = validate(val_loader, r, epoch)
            prec1 = 0
            if r.stage != r.num_stages:
                prec1 = 0

            # remember best prec@1 and save checkpoint
            best_prec1 = max(prec1, best_prec1)
            should_save_checkpoint = args.checkpoint_dir_not_nfs or r.rank_in_stage == 0
            if args.checkpoint_dir and should_save_checkpoint:
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'arch': args.arch,
                        'state_dict': r.state_dict(),
                        'best_prec1': best_prec1,
                        'optimizer': optimizer.state_dict()
                    }, args.checkpoint_dir, r.stage, epoch)
def lr_scheduler(self):
    """Lazily build the LR scheduler on first access and cache it."""
    if self._lr_scheduler is None:
        built = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)
        self._lr_scheduler = built
    return self._lr_scheduler
def _build_optimizer(self):
    """Build an optimizer with one parameter group per encoder/decoder layer,
    plus a shared leading group for embeddings and final layer norms, so
    each layer can be scheduled with its own learning rate."""
    # params = list(filter(lambda p: p.requires_grad, self.model.parameters()))
    params = []
    id_params = []
    # Collect the parameters of each encoder layer.
    #params.append({"params":torch.nn.ModuleList([self.model.encoder.]).parameters())
    en_embed_tokens = torch.nn.ModuleList(
        [self.model.encoder.embed_tokens]).parameters()
    en_embed_positions = torch.nn.ModuleList(
        [self.model.encoder.embed_positions]).parameters()
    en_layer_norm = torch.nn.ModuleList([self.model.encoder.layer_norm
                                         ]).parameters()
    de_embed_tokens = torch.nn.ModuleList(
        [self.model.decoder.embed_tokens]).parameters()
    de_embed_positions = torch.nn.ModuleList(
        [self.model.decoder.embed_positions]).parameters()
    de_layer_norm = torch.nn.ModuleList([self.model.decoder.layer_norm
                                         ]).parameters()
    # Group 0: everything that is not a transformer layer (embeddings and
    # final layer norms of both encoder and decoder).
    other_join = []
    for i in list(en_embed_tokens):
        other_join.append(i)
    for i in list(en_embed_positions):
        other_join.append(i)
    for i in list(en_layer_norm):
        other_join.append(i)
    for i in list(de_embed_tokens):
        other_join.append(i)
    for i in list(de_embed_positions):
        other_join.append(i)
    for i in list(de_layer_norm):
        other_join.append(i)
    params.append({"params": other_join})
    # One group per encoder layer.
    for i in range(self.encoder_layer_num):
        encoder = torch.nn.ModuleList([self.model.encoder.layers[i]
                                       ]).parameters()
        params.append({"params": list(encoder)})
    # One group per decoder layer.
    for i in range(self.decoder_layer_num):
        decoder = torch.nn.ModuleList([self.model.decoder.layers[i]
                                       ]).parameters()
        params.append({"params": list(decoder)})
    '''
    for i in range(self.encoder_layer_num):
        encoder = filter(lambda p: id(p) in torch.nn.ModuleList([self.model.encoder.layers[i]]), self.model.parameters())
        id_params.append(encoder)
        params.append({"params": encoder})
    # 获得每一层decoder_layer
    for i in range(self.decoder_layer_num):
        decoder = filter(lambda p: id(p) in torch.nn.ModuleList([self.model.decoder.layers[i]]), self.model.parameters())
        id_params.append(decoder)
        params.append({"params": decoder})
    # 获得其他层的参数
    base_params = filter(lambda p: id(p) not in id_params,
                         self.model.parameters())
    params.append({"params": base_params})
    '''
    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            # Compute capability < 7 (pre-Volta) has no tensor cores.
            print(
                '| WARNING: your device does NOT support faster training with --fp16, '
                'please switch to FP32 which is likely to be faster')
        if self.args.memory_efficient_fp16:
            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                self.args, params)
        else:
            self._optimizer = optim.FP16Optimizer.build_optimizer(
                self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            print(
                '| NOTICE: your device may support faster training with --fp16'
            )
        self._optimizer = optim.build_optimizer(self.args, params)
    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self.optimizer)