def _build_optimizer(self):
    # Build self._optimizer and self._lr_scheduler. The 'adam_cbn' optimizer
    # gets a special path that separates Constraint_Lagrangian parameters
    # (passed as a second group) from ordinary model/criterion parameters.
    if self.args.optimizer != 'adam_cbn':
        params = list(
            filter(
                lambda p: p.requires_grad,
                chain(self.model.parameters(), self.criterion.parameters()),
            ))
    else:
        # selection
        from fairseq.modules.norms.constraint_bn_v2 import Constraint_Lagrangian
        # Collect the *ids* of parameters owned by Constraint_Lagrangian
        # modules so they can be matched by identity below.
        constraint_param = []
        for m in self.model.modules():
            if isinstance(m, Constraint_Lagrangian):
                constraint_param.extend(list(map(id, m.parameters())))
        # Lagrangian-constraint parameters (model only).
        params_lag = list(
            filter(lambda p: id(p) in constraint_param,
                   chain(self.model.parameters())))
        # Everything else that requires grad (model + criterion).
        params = list(
            filter(
                lambda p: id(p) not in constraint_param and p.requires_grad,
                chain(self.model.parameters(), self.criterion.parameters())))
    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            print(
                '| WARNING: your device does NOT support faster training with --fp16, '
                'please switch to FP32 which is likely to be faster')
        # NOTE(review): on the fp16 paths the adam_cbn `params_lag` group is
        # not passed to the optimizer — confirm whether that is intended.
        if self.args.memory_efficient_fp16:
            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                self.args, params)
        else:
            self._optimizer = optim.FP16Optimizer.build_optimizer(
                self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            print(
                '| NOTICE: your device may support faster training with --fp16'
            )
        # check cbn
        if self.args.optimizer != 'adam_cbn':
            self._optimizer = optim.build_optimizer(self.args, params)
        else:
            # adam_cbn receives the Lagrangian params as a separate group.
            self._optimizer = optim.build_optimizer(
                self.args, params, params_lag)
    if self.args.use_bmuf:
        self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)
    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self.optimizer)
    self._lr_scheduler.step_update(0)
def build_optimizer(cls, args, params):
    """
    Args:
        args (argparse.Namespace): fairseq args
        params (iterable): iterable of parameters to optimize
    """
    # Keep one flat FP32 master copy unless flattening is disabled by flag.
    flatten = not getattr(args, 'fp16_no_flatten_grads', False)
    fp32_params = cls.build_fp32_params(params, flatten=flatten)
    # A flat master copy is a single tensor, so wrap it in a list for the
    # underlying optimizer; otherwise pass the per-param copies through.
    wrapped = [fp32_params] if flatten else fp32_params
    fp32_optimizer = optim.build_optimizer(args, wrapped)
    return cls(args, params, fp32_optimizer, fp32_params)
def __init__(self, args, model):
    """Set up GPU training state: model, criterion (from registry),
    optimizer, LR scheduler, optional apex AMP and DDP wrapping."""
    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    self.args = args
    self.model = model.cuda()
    # Criterion class is looked up by name and instantiated on the GPU.
    self.criterion = CRITERION_REGISTRY[args.criterion](args).cuda()
    self.optimizer = optim.build_optimizer(self.args, self.model.parameters())
    self.lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)
    if args.amp:
        model, optimizer = amp.initialize(
            self.model,
            self.optimizer._optimizer,
            opt_level=self.args.amp_level if self.args.amp_level else 'O2',
            max_loss_scale=2**15,
            cast_model_outputs=torch.float16
        )
        # NOTE(review): the optimizer returned by amp.initialize is not
        # stored back on self — presumably apex patches the wrapped
        # optimizer in place; confirm for the opt_level used here.
    if self.args.distributed_world_size > 1:
        # 'model' is the amp-initialized module when args.amp is set;
        # otherwise it is the same module as self.model (.cuda() is in-place).
        self.model = DDP(model)
    self._buffered_stats = defaultdict(lambda: [])
    self._flat_grads = None
    self._num_updates = 0
    self._num_val_iterations = 0
    self._optim_history = None
    self.throughput_meter = TimeMeter()
def __init__(self, args, model, criterion):
    """Hold the model/criterion on GPU plus optimizer, LR scheduler and the
    standard set of training/validation meters."""
    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')

    self.args = args

    # copy model and criterion to current device
    self.model = model.cuda()
    self.criterion = criterion.cuda()

    # initialize optimizer and LR scheduler
    self.optimizer = optim.build_optimizer(self.args, self.model.parameters())
    self.lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)

    # initialize meters (insertion order matters for display)
    self.meters = OrderedDict()
    for key in ('train_loss', 'train_nll_loss', 'valid_loss', 'valid_nll_loss'):
        self.meters[key] = AverageMeter()
    self.meters['wps'] = TimeMeter()  # words per second
    self.meters['ups'] = TimeMeter()  # updates per second
    self.meters['wpb'] = AverageMeter()  # words per batch
    self.meters['bsz'] = AverageMeter()  # sentences per batch
    self.meters['gnorm'] = AverageMeter()  # gradient norm
    self.meters['clip'] = AverageMeter()  # % of updates clipped
    self.meters['oom'] = AverageMeter()  # out of memory

    self._max_bsz_seen = 0
    self._num_updates = 0
def _build_optimizer(self):
    """Build the optimizer (FP16-aware) over trainable model params, report
    parameter counts, then build the LR scheduler."""
    params = [p for p in self.model.parameters() if p.requires_grad]
    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            print(
                '| WARNING: your device does NOT support faster training with --fp16, '
                'please switch to FP32 which is likely to be faster')
        # Pick the FP16 wrapper class, then build it the same way.
        builder = (optim.MemoryEfficientFP16Optimizer
                   if self.args.memory_efficient_fp16
                   else optim.FP16Optimizer)
        self._optimizer = builder.build_optimizer(self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            print(
                '| NOTICE: your device may support faster training with --fp16'
            )
        self._optimizer = optim.build_optimizer(self.args, params)

    print('| num. model params: {} (num. optimized: {})'.format(
        sum(p.numel() for p in self.model.parameters()),
        sum(p.numel() for p in self._optimizer.params if p.requires_grad),
    ))

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self.optimizer)
def load_checkpoint(self, filename):
    """Load all training state from a checkpoint file."""
    extra_state, self._optim_history, last_optim_state = utils.load_model_state(
        filename, self.model, cuda_device=torch.cuda.current_device())
    if last_optim_state is not None:
        # rebuild optimizer after loading model, since params may have changed
        self.optimizer = optim.build_optimizer(self.args,
                                               self.model.parameters())
        self.lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer)
        # only reload optimizer and lr_scheduler if they match
        last_optim = self._optim_history[-1]
        if last_optim[
                'criterion_name'] == self.criterion.__class__.__name__:
            self.lr_scheduler.load_state_dict(
                last_optim['lr_scheduler_state'])
            # Optimizer state is restored only when both criterion AND
            # optimizer class names match the checkpoint's history entry.
            if last_optim[
                    'optimizer_name'] == self.optimizer.__class__.__name__:
                self.optimizer.load_state_dict(last_optim_state)
        # Resume the update counter regardless of the matches above.
        self._num_updates = last_optim['num_updates']
    return extra_state
def build_optimizer(self):
    """Build the (possibly FP16) optimizer and LR scheduler, then apply the
    initial learning rate via step_update(0)."""
    trainable = [p for p in self.model.parameters() if p.requires_grad]
    if self.args.fp16:
        # NOTE(review): true division — fp16_scale_window becomes a float
        # when world_size * accumulation steps doesn't divide 2**14;
        # confirm the FP16 loss scaler tolerates a float window.
        self.args.fp16_scale_window = 2**14 / self.args.world_size / self.args.gradient_accumulation_steps
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            print(
                '| WARNING: your device does NOT support faster training with --fp16, '
                'please switch to FP32 which is likely to be faster')
        builder = (optim.MemoryEfficientFP16Optimizer
                   if self.args.memory_efficient_fp16
                   else optim.FP16Optimizer)
        self.optimizer = builder.build_optimizer(self.args, trainable)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            print(
                '| NOTICE: your device may support faster training with --fp16'
            )
        self.optimizer = optim.build_optimizer(self.args, trainable)
    self.lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self.optimizer)
    self.lr_scheduler.step_update(0)
def _build_optimizer(self):
    """Create the optimizer (FP16-aware, optionally BMUF-wrapped) over all
    trainable model+criterion params, then the LR scheduler."""
    trainable = [
        p
        for p in chain(self.model.parameters(), self.criterion.parameters())
        if p.requires_grad
    ]
    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            print('| WARNING: your device does NOT support faster training with --fp16, '
                  'please switch to FP32 which is likely to be faster')
        if self.args.memory_efficient_fp16:
            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                self.args, trainable)
        else:
            self._optimizer = optim.FP16Optimizer.build_optimizer(
                self.args, trainable)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            print('| NOTICE: your device may support faster training with --fp16')
        self._optimizer = optim.build_optimizer(self.args, trainable)

    # Optionally wrap with block-wise model-update filtering (BMUF).
    if self.args.use_bmuf:
        self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)
    self._lr_scheduler.step_update(0)
def build_optimizer(cls, cfg: DictConfig, params, **kwargs):
    """Build an FP16 wrapper around the optimizer from ``cfg.optimizer``.

    Args:
        cfg (omegaconf.DictConfig): fairseq config
        params (iterable): iterable of parameters to optimize
    """
    fp16_optimizer = optim.build_optimizer(cfg.optimizer, params)
    return cls(cfg, params, fp16_optimizer, **kwargs)
def build_optimizer(cls, cfg: DictConfig, params, **kwargs):
    """
    Args:
        cfg (omegaconf.DictConfig): fairseq args
        params (iterable): iterable of parameters to optimize
    """
    # Build the underlying FP32 optimizer from the optimizer sub-config and
    # hand it to this wrapper class.
    inner = optim.build_optimizer(cfg.optimizer, params)
    return cls(cfg, params, inner, **kwargs)
def build_optimizer(cls, args, params):
    """
    Args:
        args (argparse.Namespace): fairseq args
        params (iterable): iterable of parameters to optimize
    """
    # Construct the wrapped optimizer first, then the FP16 wrapper.
    inner = optim.build_optimizer(args, params)
    return cls(args, params, inner)
def build_optimizer(cls, args, params):
    """
    Args:
        args (argparse.Namespace): fairseq args
        params (iterable): iterable of parameters to optimize
    """
    flatten = not getattr(args, 'fp16_no_flatten_grads', False)
    fp32_params = cls.build_fp32_params(params, flatten=flatten)
    # Flat master copy is one tensor — the inner optimizer wants a list.
    wrapped = [fp32_params] if flatten else fp32_params
    fp32_optimizer = optim.build_optimizer(args, wrapped)
    # Flattening only works if the inner optimizer supports flat params.
    if flatten and not fp32_optimizer.supports_flat_params:
        raise RuntimeError(
            'chosen optimizer does not support flat params, '
            'please set --fp16-no-flatten-grads')
    return cls(args, params, fp32_optimizer, fp32_params)
def load_existing_checkpoint(checkpoint_path, trainer, restore_state=True):
    """Load a checkpoint into *trainer*; with ``restore_state`` also restore
    optimizer/scheduler state, otherwise load weights only and reset the
    optimizer. Returns ``(loaded, extra_state)``."""
    extra_state = None
    loaded = False
    if restore_state:
        extra_state = trainer.load_checkpoint(checkpoint_path)
        if extra_state is None:
            loaded = False
            print(
                f"Failed to load checkpoint and state from {checkpoint_path}.")
        else:
            loaded = True
            print(
                f"| loaded checkpoint {checkpoint_path} (epoch {extra_state['epoch']})\n"
                f"| extra_state {extra_state}")
            # batch_offset being None denotes this was a checkpoint saved at
            # the end of an epoch (after the last batch).
            if extra_state["batch_offset"] is None:
                trainer.lr_step(extra_state["epoch"])
                extra_state["epoch"] += 1
                extra_state["batch_offset"] = 0
            # check availability for checkpoint backward compatiblity
            if "start_time" not in extra_state:
                extra_state["start_time"] = time.time()
            if "last_bleu_eval" not in extra_state:
                extra_state["last_bleu_eval"] = 0
    else:
        # TODO(weiho): use trainer.load_checkpoint(load_optim=False) after
        # that's been synced to open-source fairseq.
        dummy_state, _, _ = utils.load_model_state(
            checkpoint_path, trainer.model,
            cuda_device=torch.cuda.current_device())
        # Weights-only load: rebuild a fresh optimizer/scheduler and clear
        # the optimizer history on the trainer.
        trainer.optimizer = optim.build_optimizer(trainer.args,
                                                  trainer.model.parameters())
        trainer.lr_scheduler = optim.lr_scheduler.build_lr_scheduler(
            trainer.args, trainer.optimizer)
        trainer._optim_history = []
        if dummy_state is None:
            loaded = False
            print(f"Failed to load checkpoint weights from {checkpoint_path}.")
        else:
            loaded = True
            print(f"Loaded checkpoint weights from {checkpoint_path}.")
    # Default extra_state when none was restored (fresh run semantics).
    if extra_state is None:
        extra_state = {
            "epoch": 1,
            "batch_offset": 0,
            "val_loss": None,
            "start_time": time.time(),
            "last_bleu_eval": 0,
        }
    return loaded, extra_state
def build_optimizer(cls, args, params):
    """
    Args:
        args (argparse.Namespace): fairseq args
        params (iterable): iterable of parameters to optimize
    """
    # Single flat FP32 master copy of all params, always flattened here.
    master = cls.build_fp32_params(params)
    inner = optim.build_optimizer(args, [master])
    return cls(args, params, inner, master)
def _build_optimizer(self):
    """Build two optimizers — 'nmt' over translation params and 'adv' over
    the adversarial classifier params — and one LR scheduler (on 'nmt')."""
    params_nmt = self.model.get_nmt_parameters()
    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            print(
                '| WARNING: your device does NOT support faster training with --fp16, '
                'please switch to FP32 which is likely to be faster')
        if self.args.memory_efficient_fp16:
            # Memory-efficient fp16 is not wired up for this dual setup.
            raise NotImplementedError
        else:
            self._optimizer_nmt = optim.FP16Optimizer.build_optimizer(
                self.args, params_nmt)
            self._optimizer_adv = optim.FP16Optimizer.build_optimizer(
                self.args, self.model.get_adv_parameters())
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            print(
                '| NOTICE: your device may support faster training with --fp16'
            )
        self._optimizer_nmt = optim.build_optimizer(self.args, params_nmt)
        self._optimizer_adv = optim.build_optimizer(
            self.args, self.model.get_adv_parameters())
    # Expose both optimizers under string keys for callers.
    self._optimizer = {}
    self._optimizer['nmt'] = self._optimizer_nmt
    self._optimizer['adv'] = self._optimizer_adv
    print(
        '| num. model params: {} (num. optimized: {} ( nmt: {}, adv classifier: {} ) )'
        .format(
            sum(p.numel() for p in self.model.parameters()),
            sum(p.numel() for optim in self._optimizer.values()
                for p in optim.params if p.requires_grad),
            sum(p.numel() for p in self._optimizer['nmt'].params
                if p.requires_grad),
            sum(p.numel() for p in self._optimizer['adv'].params
                if p.requires_grad)))
    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    # NOTE: the scheduler is attached to the 'nmt' optimizer only.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self.optimizer['nmt'])
def build_optimizer(cls, cfg: DictConfig, params, **kwargs):
    """
    Args:
        cfg (omegaconf.DictConfig): fairseq args
        params (iterable): iterable of parameters to optimize
    """
    flatten = not getattr(cfg.common, "fp16_no_flatten_grads", False)
    if getattr(cfg.common, "bf16", False):
        # mixed precision is faster on TPUs without flat grads
        flatten = False
    fp32_params = cls.build_fp32_params(cfg.optimizer, params, flatten=flatten)
    # A flat master copy is one tensor; wrap it for the inner optimizer.
    fp32_optimizer = optim.build_optimizer(
        cfg.optimizer, [fp32_params] if flatten else fp32_params)
    if flatten and not fp32_optimizer.supports_flat_params:
        raise RuntimeError(
            f"chosen optimizer {fp32_optimizer.__class__.__name__} does not support flat params, please set --fp16-no-flatten-grads"
        )
    return cls(cfg, params, fp32_optimizer, fp32_params, **kwargs)
def build_optimizer(cls, args, params):
    """
    Args:
        args (argparse.Namespace): fairseq args
        params (iterable): iterable of parameters to optimize
    """
    flatten = not getattr(args, "fp16_no_flatten_grads", False)
    if getattr(args, "bf16", False):
        # mixed precision is faster on TPUs without flat grads
        flatten = False
    fp32_params = cls.build_fp32_params(args, params, flatten=flatten)
    wrapped = [fp32_params] if flatten else fp32_params
    fp32_optimizer = optim.build_optimizer(args, wrapped)
    if flatten and not fp32_optimizer.supports_flat_params:
        raise RuntimeError(
            "chosen optimizer does not support flat params, "
            "please set --fp16-no-flatten-grads")
    return cls(args, params, fp32_optimizer, fp32_params)
def _build_optimizer(self):
    """Group trainable params by their Riemannian manifold (geoopt-style)
    so each group carries an ``lr_rectifier``; then build the optimizer
    (fp16-aware, optionally BMUF) and the LR scheduler."""
    # params = list(
    #     filter(
    #         lambda p: p.requires_grad,
    #         chain(self.model.parameters(), self.criterion.parameters()),
    #     )
    # )
    params_dict = {}
    _default_manifold = Euclidean()  # fallback manifold for plain tensors
    for name, p in chain(self.model.named_parameters(),
                         self.criterion.named_parameters()):
        if not p.requires_grad:
            continue
        if isinstance(p, (ManifoldParameter, ManifoldTensor)):
            _manifold = p.manifold
        else:
            _manifold = _default_manifold
        _manifold_name = _manifold.__class__.__name__
        if not _manifold_name in params_dict:
            # Probe how the manifold rescales a Euclidean gradient at a
            # reference point; its reciprocal rectifies the group's LR.
            ref_grad = _manifold.egrad2rgrad(p.new_zeros(1), p.new_ones(1))
            # NOTE(review): both branches evaluate to 1, so `coef` is always
            # 1 — looks like a leftover tuning knob; confirm intent.
            coef = 1 if ref_grad == 1 else 1
            #print(f"lr={self.args.lr}, ref={ref_grad.item()}")
            params_dict[_manifold_name] = dict(
                params=[],
                lr_rectifier=ref_grad.reciprocal().item() * coef)
        params_dict[_manifold_name]['params'].append(p)
    # One param-group dict per manifold type.
    params = params_dict.values()
    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            logger.info(
                "NOTE: your device does NOT support faster training with --fp16, "
                "please switch to FP32 which is likely to be faster")
        if self.args.memory_efficient_fp16:
            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                self.args, params)
        else:
            self._optimizer = optim.FP16Optimizer.build_optimizer(
                self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            logger.info(
                "NOTE: your device may support faster training with --fp16"
            )
        self._optimizer = optim.build_optimizer(self.args, params)
    if self.args.use_bmuf:
        self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)
    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self.optimizer)
    self._lr_scheduler.step_update(0)
def _build_optimizer(self): params = list( filter(lambda p: p.requires_grad, self.model.parameters())) # for n, p in list(self.model.named_parameters()): # if p.requires_grad and n.startswith('encoder.bert'): # print(n) # else: # print('=====%s',n) # params = [(n, p) for n, p in list(self.model.parameters()) if n.startswith('bert.model')] if self.args.fp16: if self.cuda and torch.cuda.get_device_capability(0)[0] < 7: print( '| WARNING: your device does NOT support faster training with --fp16, ' 'please switch to FP32 which is likely to be faster') if self.args.memory_efficient_fp16: self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer( self.args, params) else: self._optimizer = optim.FP16Optimizer.build_optimizer( self.args, params) else: if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7: print( '| NOTICE: your device may support faster training with --fp16' ) if self.args.sep_optim: # bert_params = [(n, p) for n, p in list(self.model.named_parameters()) if n.startswith('encoder.bert')] # dec_params = [(n, p) for n, p in list(self.model.named_parameters()) if not n.startswith('encoder.bert')] bert_params = [ p for n, p in list(self.model.named_parameters()) if n.startswith('encoder.bert') ] dec_params = [ p for n, p in list(self.model.named_parameters()) if not n.startswith('encoder.bert') ] self._optimizer = optim.build_optimizer_bert( self.args, bert_params) self._dec_optimizer = optim.build_optimizer_dec( self.args, dec_params) else: self._optimizer = optim.build_optimizer(self.args, params) # We should initialize the learning rate scheduler immediately after # building the optimizer, so that the initial learning rate is set. if self.args.sep_optim: self._lr_scheduler = lr_scheduler.build_lr_scheduler( self.args, self.optimizer) self._dec_lr_scheduler = lr_scheduler.build_lr_scheduler( self.args, self._dec_optimizer, decoder=True) else: self._lr_scheduler = lr_scheduler.build_lr_scheduler( self.args, self.optimizer)
def _build_optimizer(self):
    """Build ``self._optimizer`` (fp16/bf16-aware, optionally BMUF-wrapped
    and ZeRO-sharded) and its LR scheduler from the hydra config."""
    params = list(
        filter(
            lambda p: p.requires_grad,
            chain(self.model.parameters(), self.criterion.parameters()),
        )
    )
    if self.cfg.common.fp16 or self.cfg.common.bf16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            logger.info(
                "NOTE: your device does NOT support faster training with --fp16, "
                "please switch to FP32 which is likely to be faster"
            )
        if (
            self.cfg.common.memory_efficient_fp16
            or self.cfg.common.memory_efficient_bf16
        ):
            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                self.cfg, params
            )
        else:
            self._optimizer = optim.FP16Optimizer.build_optimizer(self.cfg, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            logger.info("NOTE: your device may support faster training with --fp16")
        self._optimizer = optim.build_optimizer(self.cfg.optimizer, params)

    if self.cfg.optimization.use_bmuf:
        self._optimizer = optim.FairseqBMUF(
            self.cfg.bmuf,
            self._optimizer,
        )

    if self.cfg.distributed_training.zero_sharding == "os":
        # ZeRO optimizer-state sharding cannot be combined with plain fp16
        # flattened grads; require --fp16-no-flatten-grads in that case.
        if (
            self.cfg.common.fp16
            and not self.cfg.common.memory_efficient_fp16
            and not self.cfg.common.memory_efficient_bf16
        ) and not self.cfg.common.fp16_no_flatten_grads:
            raise ValueError(
                "ZeRO is incomptabile with fp16 and flattened grads. "
                "Please use --fp16-no-flatten-grads"
            )
        else:
            optim.shard_(self._optimizer, self.data_parallel_process_group)

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.cfg.lr_scheduler,
        self.optimizer,
    )
    self._lr_scheduler.step_update(0)
def _build_optimizer(self):
    """Build the optimizer and LR scheduler; with ULMFiT-style training,
    collect per-parameter layer-decayed LR multipliers.

    Fix: ``optim.FairseqBMUF`` takes ``(args, optimizer)`` — the previous
    call passed ``params`` as an extra positional argument, unlike every
    other BMUF call site in this file.
    """
    if self.ulmfit:
        params = []
        multiplier_map = []  # per-param LR multiplier, layer-decayed
        for n, p in self.model.named_parameters():
            if p.requires_grad:
                params.append(p)
                param_name_split = n.split('.')
                if param_name_split[2] == 'lm_head':
                    # last layer
                    multiplier = 1.
                elif param_name_split[4].isdigit():
                    # encoder layer: decay by distance from the top layer
                    layer = int(param_name_split[4])
                    multiplier = self.decay_rate_lrc**-(self.num_layers - layer)
                else:
                    # first layer (embeddings etc.): strongest decay
                    multiplier = self.decay_rate_lrc**-(self.num_layers + 1)
                # TODO(review): multiplier_map is built but not consumed in
                # this method — presumably read elsewhere; confirm.
                multiplier_map.append(multiplier)
    else:
        params = list(
            filter(lambda p: p.requires_grad, self.model.parameters()))
    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            print(
                '| WARNING: your device does NOT support faster training with --fp16, '
                'please switch to FP32 which is likely to be faster')
        if self.args.memory_efficient_fp16:
            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                self.args, params)
        else:
            self._optimizer = optim.FP16Optimizer.build_optimizer(
                self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            print(
                '| NOTICE: your device may support faster training with --fp16'
            )
        self._optimizer = optim.build_optimizer(self.args, params)
    if self.args.use_bmuf:
        # FairseqBMUF wraps an already-built optimizer; it does not take
        # the raw parameter list.
        self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)
    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self.optimizer)
    self._lr_scheduler.step_update(0)
def _build_optimizer(self):
    """Build the optimizer (FP16 wrapper when --fp16) and its LR scheduler.

    Note the FP16 path filters to trainable params, while the FP32 path
    passes all model parameters through unchanged (matches original).
    """
    fp16 = self.args.fp16
    capability = torch.cuda.get_device_capability(0)[0]
    if fp16:
        if capability < 7:
            print('| WARNING: your device does NOT support faster training with --fp16, '
                  'please switch to FP32 which is likely to be faster')
        trainable = [p for p in self.model.parameters() if p.requires_grad]
        self._optimizer = optim.FP16Optimizer.build_optimizer(self.args, trainable)
    else:
        if capability >= 7:
            print('| NOTICE: your device may support faster training with --fp16')
        self._optimizer = optim.build_optimizer(self.args, self.model.parameters())
    self.lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self._optimizer)
def _build_optimizer(self):
    """Optionally freeze most BART encoder parameters (keeping the
    structure-attention layers trainable), then build the optimizer
    (fp16-aware, optionally BMUF) and the LR scheduler."""
    if self.freeze_bart:
        # Freeze every encoder parameter EXCEPT the whitelisted
        # structure-attention / projection layers below.
        for name, param in self.model.named_parameters():
            if name.startswith('encoder') and name not in [
                    "encoder.structure_att.exparam",
                    "encoder.structure_att.tp_linear.weight",
                    "encoder.structure_att.tp_linear.bias",
                    "encoder.structure_att.tc_linear.weight",
                    "encoder.structure_att.tc_linear.bias",
                    "encoder.structure_att.fi_linear.weight",
                    "encoder.structure_att.bilinear._weight_matrix",
                    "encoder.structure_att.bilinear._bias",
                    "encoder.structure_att.fzlinear.weight",
                    "encoder.structure_att.fzlinear.bias",
                    "encoder.str_to_enc_linear.weight",
                    "encoder.str_to_enc_linear.bias"
            ]:
                param.requires_grad = False
        print("Freezing parameters")
    params = list(
        filter(
            lambda p: p.requires_grad,
            chain(self.model.parameters(), self.criterion.parameters()),
        ))
    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            logger.info(
                "NOTE: your device does NOT support faster training with --fp16, "
                "please switch to FP32 which is likely to be faster")
        if self.args.memory_efficient_fp16:
            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                self.args, params)
        else:
            self._optimizer = optim.FP16Optimizer.build_optimizer(
                self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            logger.info(
                "NOTE: your device may support faster training with --fp16"
            )
        self._optimizer = optim.build_optimizer(self.args, params)
    if self.args.use_bmuf:
        self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)
    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self.optimizer)
    self._lr_scheduler.step_update(0)
def build_optimizer(args, params):
    """Create an FP16Optimizer whose inner optimizer works on a single flat
    FP32 master copy of *params*."""
    # create FP32 copy of parameters and grads
    total = sum(p.data.numel() for p in params)
    flat = params[0].new(0).float().new(total)
    offset = 0
    for p in params:
        count = p.data.numel()
        flat[offset:offset + count].copy_(p.data.view(-1))
        offset += count
    flat = torch.nn.Parameter(flat)
    # Pre-allocate the flat grad buffer (contents overwritten before use).
    flat.grad = flat.data.new(total)
    inner = optim.build_optimizer(args, [flat])
    return FP16Optimizer(args, params, inner, flat)
def build_model(self, args):
    """Build the main task model; unless running as a baseline, also build
    an adversary ('lang_test') model plus two fixed-LR optimizers for the
    adversarial and Wasserstein updates."""
    self.main_model = super().build_model(args)
    if not self.baseline:
        from copy import deepcopy
        # Derive the adversary's config from the task args, filling
        # classifier hyper-parameters with defaults when absent.
        model_args = deepcopy(args)
        model_args.classifier_att_context_size = getattr(
            args, 'classifier_att_context_size', 512)
        model_args.classifier_num_layers = getattr(
            args, 'classifier_num_layers', 2)
        model_args.classifier_input_size = getattr(
            args, 'classifier_input_size', 768)
        model_args.classifier_hidden_size = getattr(
            args, 'classifier_hidden_size', 512)
        model_args.fp16 = True
        model_args.arch = 'lang_test'
        self.adversary_model = super().build_model(model_args)
        # self.adversary_model.half()
        # self.adversary_model.decoder.half()
        use_cuda = torch.cuda.is_available() and not self.args.cpu
        if use_cuda:
            self.adversary_model.cuda()
        from copy import deepcopy
        # Adversary optimizer over the adversary decoder, fixed LR 1e-4.
        optim_args = deepcopy(self.args)
        optim_args.lr = [0.0001]
        optim_args.learning_rate = [0.0001]
        self._adversarial_optimizer = optim.build_optimizer(
            optim_args, self.adversary_model.decoder.parameters())
        # Wasserstein optimizer over the main encoder, fixed LR 1e-5.
        optim_args = deepcopy(self.args)
        optim_args.lr = [0.00001]
        optim_args.learning_rate = [0.00001]
        self._wasserstein_optimizer = optim.build_optimizer(
            optim_args, self.main_model.encoder.parameters())
    return self.main_model
def _build_optimizer(self, model):
    """Build, store and return the optimizer for *model*, using the FP16
    wrapper (over trainable params only) when --fp16 is set."""
    capability = torch.cuda.get_device_capability(0)[0]
    if self.args.fp16:
        if capability < 7:
            print(
                "| WARNING: your device does NOT support faster training "
                "with --fp16, please switch to FP32 which is likely to be"
                " faster"
            )
        trainable = [p for p in model.parameters() if p.requires_grad]
        self._optimizer = optim.FP16Optimizer.build_optimizer(self.args, trainable)
    else:
        if capability >= 7:
            print("| NOTICE: your device may support faster training with --fp16")
        self._optimizer = optim.build_optimizer(self.args, model.parameters())
    return self._optimizer
def _build_optimizer(self):
    """Build the optimizer + LR scheduler; when the config exposes
    ``encoder_layers``, use layer-wise decayed parameter groups
    (BERT-style discriminative learning rates)."""
    from itertools import chain
    if hasattr(self.args, 'encoder_layers'):
        # Layer-wise LR decay / weight-decay grouping via project helper.
        params = get_decayed_param_groups(
            chain(self.model.named_parameters(),
                  self.criterion.named_parameters()),
            num_layers=self.args.encoder_layers,
            weight_decay=self.args.weight_decay,
            weight_decay_exclude=self.args.weight_decay_exclude,
            freeze_encoder=self.args.freeze_encoder,
            freeze_embedding=self.args.freeze_embedding,
            lr=float(self.args.lr[0]),
            lr_decay=float(self.args.lr_decay),
        )
    else:
        params = list(
            filter(
                lambda p: p.requires_grad,
                chain(self.model.parameters(), self.criterion.parameters()),
            ))
    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            logger.info(
                "NOTE: your device does NOT support faster training with --fp16, "
                "please switch to FP32 which is likely to be faster")
        if self.args.memory_efficient_fp16:
            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                self.args, params)
        else:
            self._optimizer = optim.FP16Optimizer.build_optimizer(
                self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            logger.info(
                "NOTE: your device may support faster training with --fp16"
            )
        self._optimizer = optim.build_optimizer(self.args, params)
    if self.args.use_bmuf:
        self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)
    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self.optimizer)
    self._lr_scheduler.step_update(0)
def train_step(self, sample, model, criterion, optimizer, ignore_grad=False):
    """
    Do forward and backward, and return the loss as computed by *criterion*
    for the given *model* and *sample*.

    Args:
        sample (dict): the mini-batch. The format is defined by the
            :class:`~fairseq.data.FairseqDataset`.
        model (~fairseq.models.BaseFairseqModel): the model
        criterion (~fairseq.criterions.FairseqCriterion): the criterion
        optimizer (~fairseq.optim.FairseqOptimizer): the optimizer
        ignore_grad (bool): multiply loss by 0 if this is set to True

    Returns:
        tuple:
            - the loss
            - the sample size, which is used as the denominator for the
              gradient
            - logging outputs to display while training
    """
    # Mask the sample with the current mask rate before training on it.
    p = self.get_mask_rate()
    sample = self.process_sample(sample, p=p)
    # Lazily build a dedicated optimizer for the discriminator on the first
    # training step.
    if self.discriminator_optimizer is None:
        params = list(
            filter(lambda p: p.requires_grad,
                   criterion.discriminator.parameters()))
        self.discriminator_optimizer = optim.build_optimizer(
            self.args, params)
    # The discriminator is updated only every `update_discr_every` steps.
    if self._step_counter % self.update_discr_every == 0:
        discriminator_logging_output = self.train_discriminator(
            model, criterion.discriminator, ignore_grad)
    else:
        # Sentinel logging value for steps where the discriminator is skipped.
        discriminator_logging_output = {"loss": -1.}
    loss, sample_size, generator_logging_output = self.generator_train_step(
        sample, model, criterion, optimizer)
    logging_output = self.merge_logging_outputs(
        generator_logging_output, discriminator_logging_output)
    self._step_counter += 1
    return loss, sample_size, logging_output
def _build_optimizer(self):
    """Build an optimizer over a single flat FP32 master copy of all
    trainable (possibly FP16) model parameters, plus the LR scheduler."""
    # create FP32 copy of parameters and grads
    params = [p for p in self.model.parameters() if p.requires_grad]
    total_param_size = sum(p.data.numel() for p in params)
    # One contiguous FP32 buffer holding every trainable parameter.
    self.fp32_params = params[0].new(0).float().new(total_param_size)
    offset = 0
    for p in params:
        numel = p.data.numel()
        self.fp32_params[offset:offset+numel].copy_(p.data.view(-1))
        offset += numel
    self.fp32_params = torch.nn.Parameter(self.fp32_params)
    # NOTE(review): the grad buffer is allocated uninitialized — presumably
    # it is overwritten/zeroed before the first optimizer step; confirm.
    self.fp32_params.grad = self.fp32_params.data.new(total_param_size)
    # create optimizer using the copied FP32 params
    self._optimizer = optim.build_optimizer(self.args, [self.fp32_params])
    self.lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)
def _build_optimizer(self):
    """Build one optimizer + LR scheduler per task parameter group.

    Fix: the BMUF branch previously wrapped ``self._optimizer``, which is
    never assigned in this multi-optimizer trainer (it would raise or wrap
    the wrong object); it must wrap the current group's ``optimizer``.
    """
    param_groups = self.task.get_task_params(self.model, self.criterion)
    # Per-group learning rates: replicate args.lr[0], or parse --lr-list.
    if (not hasattr(self.args, "lr_list")) or (self.args.lr_list is None):
        lr_list = [self.args.lr[0] for _ in param_groups]
    else:
        lr_list = [
            float(lr.strip()) for lr in self.args.lr_list.split(",")
        ]
    for params, curr_lr in zip(param_groups, lr_list):
        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                print(
                    '| WARNING: your device does NOT support faster training with --fp16, '
                    'please switch to FP32 which is likely to be faster')
            if self.args.memory_efficient_fp16:
                optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                print(
                    '| NOTICE: your device may support faster training with --fp16'
                )
            optimizer = optim.build_optimizer(self.args, params)
        if self.args.use_bmuf:
            # Wrap THIS group's optimizer (was: self._optimizer, which is
            # never set in this method).
            optimizer = optim.FairseqBMUF(self.args, optimizer)
        self._optimizers.append(optimizer)
        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        # args.lr is temporarily swapped to this group's LR for the build.
        self.args.lr = [curr_lr]
        lrs = lr_scheduler.build_lr_scheduler(self.args, optimizer)
        lrs.step_update(0)
        self._lr_schedulers.append(lrs)
    self.args.lr = None
    self.set_current_optimizer()
def _build_optimizer(self):
    """Build the optimizer over all model parameters, then its LR scheduler."""
    built = optim.build_optimizer(self.args, self.model.parameters())
    self._optimizer = built
    self.lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, built)