def _setup_optim(self, optimizer_parameters, state_dict=None, num_train_step=-1):
    """Create ``self.optimizer`` and ``self.scheduler`` from ``self.config``.

    Args:
        optimizer_parameters: parameter groups handed to the optimizer.
        state_dict: optional checkpoint dict; when it contains an
            ``'optimizer'`` entry, the optimizer state is restored from it.
        num_train_step: total number of training steps, forwarded as
            ``t_total`` to the warmup-aware optimizers (-1 = unknown).

    Raises:
        RuntimeError: if ``self.config['optimizer']`` names an unsupported
            optimizer.
        ImportError: if fp16 training is requested but apex is not installed.
    """
    if self.config['optimizer'] == 'sgd':
        self.optimizer = optim.SGD(optimizer_parameters,
                                   self.config['learning_rate'],
                                   weight_decay=self.config['weight_decay'])
    elif self.config['optimizer'] == 'adamax':
        self.optimizer = Adamax(optimizer_parameters,
                                self.config['learning_rate'],
                                warmup=self.config['warmup'],
                                t_total=num_train_step,
                                max_grad_norm=self.config['grad_clipping'],
                                schedule=self.config['warmup_schedule'],
                                weight_decay=self.config['weight_decay'])
        # Adamax drives its own warmup schedule; disable any external scheduler.
        if self.config.get('have_lr_scheduler', False):
            self.config['have_lr_scheduler'] = False
    elif self.config['optimizer'] == 'radam':
        self.optimizer = RAdam(optimizer_parameters,
                               self.config['learning_rate'],
                               warmup=self.config['warmup'],
                               t_total=num_train_step,
                               max_grad_norm=self.config['grad_clipping'],
                               schedule=self.config['warmup_schedule'],
                               eps=self.config['adam_eps'],
                               weight_decay=self.config['weight_decay'])
        if self.config.get('have_lr_scheduler', False):
            self.config['have_lr_scheduler'] = False
        # The current radam does not support FP16.
        self.config['fp16'] = False
    elif self.config['optimizer'] == 'adam':
        self.optimizer = Adam(optimizer_parameters,
                              lr=self.config['learning_rate'],
                              warmup=self.config['warmup'],
                              t_total=num_train_step,
                              max_grad_norm=self.config['grad_clipping'],
                              schedule=self.config['warmup_schedule'],
                              weight_decay=self.config['weight_decay'])
        if self.config.get('have_lr_scheduler', False):
            self.config['have_lr_scheduler'] = False
    else:
        # Fix: this branch previously interpolated the undefined name `opt`,
        # raising NameError instead of the intended RuntimeError.
        raise RuntimeError('Unsupported optimizer: %s' % self.config['optimizer'])

    if state_dict and 'optimizer' in state_dict:
        self.optimizer.load_state_dict(state_dict['optimizer'])

    if self.config['fp16']:
        try:
            # Fix: `global amp` must precede the import; declaring a name
            # global after assigning it in the same scope is a SyntaxError
            # in Python 3.
            global amp
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(self.network,
                                          self.optimizer,
                                          opt_level=self.config['fp16_opt_level'])
        self.network = model
        self.optimizer = optimizer

    if self.config.get('have_lr_scheduler', False):
        if self.config.get('scheduler_type', 'rop') == 'rop':
            self.scheduler = ReduceLROnPlateau(self.optimizer,
                                               mode='max',
                                               factor=self.config['lr_gamma'],
                                               patience=3)
        elif self.config.get('scheduler_type', 'rop') == 'exp':
            self.scheduler = ExponentialLR(self.optimizer,
                                           gamma=self.config.get('lr_gamma', 0.95))
        else:
            milestones = [int(step) for step in
                          self.config.get('multi_step_lr', '10,20,30').split(',')]
            self.scheduler = MultiStepLR(self.optimizer,
                                         milestones=milestones,
                                         gamma=self.config.get('lr_gamma'))
    else:
        self.scheduler = None
def __init__(self, opt, state_dict=None, num_train_step=-1):
    """Build the model wrapper: network, optimizer, scheduler, and EMA.

    Args:
        opt: configuration dict (also stored as ``self.config``).
        state_dict: optional checkpoint with ``'updates'``, ``'state'``
            (network weights) and ``'optimizer'`` entries.
        num_train_step: total training steps forwarded as ``t_total`` to the
            warmup-aware optimizers (-1 = unknown).

    Raises:
        RuntimeError: if ``opt['optimizer']`` names an unsupported optimizer.
    """
    self.config = opt
    self.updates = state_dict['updates'] if state_dict and 'updates' in state_dict else 0
    self.train_loss = AverageMeter()
    self.network = SANBertNetwork(opt)
    if state_dict:
        # Reconcile the checkpoint with the current architecture: drop keys
        # the network no longer has, and back-fill missing keys from the
        # freshly initialized weights so load_state_dict sees a full state.
        new_state = set(self.network.state_dict().keys())
        for k in list(state_dict['state'].keys()):
            if k not in new_state:
                del state_dict['state'][k]
        for k, v in list(self.network.state_dict().items()):
            if k not in state_dict['state']:
                state_dict['state'][k] = v
        self.network.load_state_dict(state_dict['state'])
    self.mnetwork = nn.DataParallel(self.network) if opt['multi_gpu_on'] else self.network
    self.total_param = sum([p.nelement() for p in self.network.parameters() if p.requires_grad])

    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.bias', 'LayerNorm.weight']
    # Fix: the original tested full dotted parameter names for exact
    # membership in no_decay (`n in no_decay`), which never matches names
    # like 'encoder.layer.0.output.LayerNorm.weight' — so every parameter
    # received weight decay and the no-decay group was empty. Use substring
    # matching, consistent with the other __init__ version in this file.
    optimizer_parameters = [
        {'params': [p for n, p in self.network.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in self.network.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]

    # note that adamax are modified based on the BERT code
    if opt['optimizer'] == 'sgd':
        # Fix: torch.optim exposes SGD, not sgd — the original raised
        # AttributeError whenever the sgd optimizer was selected.
        self.optimizer = optim.SGD(optimizer_parameters,
                                   opt['learning_rate'],
                                   weight_decay=opt['weight_decay'])
    elif opt['optimizer'] == 'adamax':
        self.optimizer = Adamax(optimizer_parameters,
                                opt['learning_rate'],
                                warmup=opt['warmup'],
                                t_total=num_train_step,
                                max_grad_norm=opt['grad_clipping'],
                                schedule=opt['warmup_schedule'])
        # Adamax drives its own warmup schedule; disable external scheduler.
        if opt.get('have_lr_scheduler', False):
            opt['have_lr_scheduler'] = False
    elif opt['optimizer'] == 'adadelta':
        self.optimizer = optim.Adadelta(optimizer_parameters,
                                        opt['learning_rate'],
                                        rho=0.95)
    elif opt['optimizer'] == 'adam':
        self.optimizer = Adam(optimizer_parameters,
                              lr=opt['learning_rate'],
                              warmup=opt['warmup'],
                              t_total=num_train_step,
                              max_grad_norm=opt['grad_clipping'],
                              schedule=opt['warmup_schedule'])
        if opt.get('have_lr_scheduler', False):
            opt['have_lr_scheduler'] = False
    else:
        raise RuntimeError('Unsupported optimizer: %s' % opt['optimizer'])

    if state_dict and 'optimizer' in state_dict:
        self.optimizer.load_state_dict(state_dict['optimizer'])

    if opt.get('have_lr_scheduler', False):
        if opt.get('scheduler_type', 'rop') == 'rop':
            self.scheduler = ReduceLROnPlateau(self.optimizer,
                                               mode='max',
                                               factor=opt['lr_gamma'],
                                               patience=3)
        elif opt.get('scheduler_type', 'rop') == 'exp':
            self.scheduler = ExponentialLR(self.optimizer,
                                           gamma=opt.get('lr_gamma', 0.95))
        else:
            milestones = [int(step) for step in
                          opt.get('multi_step_lr', '10,20,30').split(',')]
            self.scheduler = MultiStepLR(self.optimizer,
                                         milestones=milestones,
                                         gamma=opt.get('lr_gamma'))
    else:
        self.scheduler = None

    self.ema = None
    if opt['ema_opt'] > 0:
        self.ema = EMA(self.config['ema_gamma'], self.network)
    self.para_swapped = False
def __init__(self, opt, state_dict=None, num_train_step=-1):
    """Build the model wrapper: network, optimizer, (optional) fp16/amp,
    scheduler, and EMA; finally zeroes the optimizer gradients.

    Args:
        opt: configuration dict (also stored as ``self.config``).
        state_dict: optional checkpoint with ``'updates'``, ``'state'``
            (network weights, loaded non-strictly) and ``'optimizer'``.
        num_train_step: total training steps forwarded as ``t_total`` to the
            warmup-aware optimizers (-1 = unknown).

    Raises:
        RuntimeError: if ``opt['optimizer']`` names an unsupported optimizer.
        ImportError: if fp16 is requested but apex is not installed.
    """
    self.config = opt
    self.updates = state_dict['updates'] if state_dict and 'updates' in state_dict else 0
    self.local_updates = 0
    self.train_loss = AverageMeter()
    self.network = SANBertNetwork(opt)
    if state_dict:
        # strict=False tolerates missing/extra keys between checkpoint and
        # current architecture.
        self.network.load_state_dict(state_dict['state'], strict=False)
    self.mnetwork = nn.DataParallel(self.network) if opt['multi_gpu_on'] else self.network
    self.total_param = sum([p.nelement() for p in self.network.parameters() if p.requires_grad])
    if opt['cuda']:
        self.network.cuda()

    # Parameters matched by any of these substrings get no weight decay.
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_parameters = [{
        'params': [p for n, p in self.network.named_parameters()
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in self.network.named_parameters()
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    # note that adamax are modified based on the BERT code
    if opt['optimizer'] == 'sgd':
        self.optimizer = optim.SGD(optimizer_parameters,
                                   opt['learning_rate'],
                                   weight_decay=opt['weight_decay'])
    elif opt['optimizer'] == 'adamax':
        self.optimizer = Adamax(optimizer_parameters,
                                opt['learning_rate'],
                                warmup=opt['warmup'],
                                t_total=num_train_step,
                                max_grad_norm=opt['grad_clipping'],
                                schedule=opt['warmup_schedule'],
                                weight_decay=opt['weight_decay'])
        # Adamax drives its own warmup schedule; disable external scheduler.
        if opt.get('have_lr_scheduler', False):
            opt['have_lr_scheduler'] = False
    elif opt['optimizer'] == 'radam':
        self.optimizer = RAdam(optimizer_parameters,
                               opt['learning_rate'],
                               warmup=opt['warmup'],
                               t_total=num_train_step,
                               max_grad_norm=opt['grad_clipping'],
                               schedule=opt['warmup_schedule'],
                               eps=opt['adam_eps'],
                               weight_decay=opt['weight_decay'])
        if opt.get('have_lr_scheduler', False):
            opt['have_lr_scheduler'] = False
        # The current radam does not support FP16.
        opt['fp16'] = False
    elif opt['optimizer'] == 'adadelta':
        self.optimizer = optim.Adadelta(optimizer_parameters,
                                        opt['learning_rate'],
                                        rho=0.95)
    elif opt['optimizer'] == 'adam':
        self.optimizer = Adam(optimizer_parameters,
                              lr=opt['learning_rate'],
                              warmup=opt['warmup'],
                              t_total=num_train_step,
                              max_grad_norm=opt['grad_clipping'],
                              schedule=opt['warmup_schedule'],
                              weight_decay=opt['weight_decay'])
        if opt.get('have_lr_scheduler', False):
            opt['have_lr_scheduler'] = False
    else:
        raise RuntimeError('Unsupported optimizer: %s' % opt['optimizer'])

    if state_dict and 'optimizer' in state_dict:
        self.optimizer.load_state_dict(state_dict['optimizer'])

    if opt['fp16']:
        try:
            # Fix: `global amp` must precede the import; declaring a name
            # global after assigning it in the same scope is a SyntaxError
            # in Python 3.
            global amp
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(self.network,
                                          self.optimizer,
                                          opt_level=opt['fp16_opt_level'])
        self.network = model
        self.optimizer = optimizer

    if opt.get('have_lr_scheduler', False):
        if opt.get('scheduler_type', 'rop') == 'rop':
            self.scheduler = ReduceLROnPlateau(self.optimizer,
                                               mode='max',
                                               factor=opt['lr_gamma'],
                                               patience=3)
        elif opt.get('scheduler_type', 'rop') == 'exp':
            self.scheduler = ExponentialLR(self.optimizer,
                                           gamma=opt.get('lr_gamma', 0.95))
        else:
            milestones = [
                int(step) for step in
                opt.get('multi_step_lr', '10,20,30').split(',')
            ]
            self.scheduler = MultiStepLR(self.optimizer,
                                         milestones=milestones,
                                         gamma=opt.get('lr_gamma'))
    else:
        self.scheduler = None

    self.ema = None
    if opt['ema_opt'] > 0:
        self.ema = EMA(self.config['ema_gamma'], self.network)
        if opt['cuda']:
            self.ema.cuda()
    self.para_swapped = False
    # zero optimizer grad
    self.optimizer.zero_grad()
def _setup_optim(self,
                 optimizer_parameters,
                 state_dict: dict = None,
                 num_train_step: int = -1):
    """Create ``self.optimizer`` and ``self.scheduler`` from ``self.config``.

    Args:
        optimizer_parameters: parameter groups handed to the optimizer.
        state_dict: optional checkpoint dict; when it contains an
            ``"optimizer"`` entry, the optimizer state is restored from it.
        num_train_step: total training steps forwarded as ``t_total`` to the
            warmup-aware optimizers (-1 = unknown).

    Raises:
        RuntimeError: if ``self.config.optimizer`` names an unsupported
            optimizer.
        ImportError: if fp16 training is requested but apex is not installed.
    """
    # Setup optimizer parameters
    if self.config.optimizer == "sgd":
        self.optimizer = optim.SGD(
            optimizer_parameters,
            self.config.learning_rate,
            weight_decay=self.config.weight_decay,
        )
    elif self.config.optimizer == "adamax":
        self.optimizer = Adamax(
            optimizer_parameters,
            self.config.learning_rate,
            warmup=self.config.warmup,
            t_total=num_train_step,
            max_grad_norm=self.config.grad_clipping,
            schedule=self.config.warmup_schedule,
            weight_decay=self.config.weight_decay,
        )
    elif self.config.optimizer == "radam":
        self.optimizer = RAdam(
            optimizer_parameters,
            self.config.learning_rate,
            warmup=self.config.warmup,
            t_total=num_train_step,
            max_grad_norm=self.config.grad_clipping,
            schedule=self.config.warmup_schedule,
            eps=self.config.adam_eps,
            weight_decay=self.config.weight_decay,
        )
        # The current radam does not support FP16.
        self.config.fp16 = False
    elif self.config.optimizer == "adam":
        self.optimizer = Adam(
            optimizer_parameters,
            lr=self.config.learning_rate,
            warmup=self.config.warmup,
            t_total=num_train_step,
            max_grad_norm=self.config.grad_clipping,
            schedule=self.config.warmup_schedule,
            weight_decay=self.config.weight_decay,
        )
    else:
        raise RuntimeError(
            f"Unsupported optimizer: {self.config.optimizer}")

    # Clear scheduler for certain optimizer choices
    # (these optimizers drive their own warmup schedule).
    if self.config.optimizer in ["adam", "adamax", "radam"]:
        if self.config.have_lr_scheduler:
            self.config.have_lr_scheduler = False

    if state_dict and "optimizer" in state_dict:
        self.optimizer.load_state_dict(state_dict["optimizer"])

    if self.config.fp16:
        try:
            global amp
            # Fix: the try body only declared `global amp` without importing
            # anything, so the ImportError branch was unreachable and `amp`
            # was never bound; actually import apex.amp here.
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(
            self.network,
            self.optimizer,
            opt_level=self.config.fp16_opt_level)
        self.network = model
        self.optimizer = optimizer

    if self.config.have_lr_scheduler:
        if self.config.scheduler_type == "rop":
            self.scheduler = ReduceLROnPlateau(self.optimizer,
                                               mode="max",
                                               factor=self.config.lr_gamma,
                                               patience=3)
        elif self.config.scheduler_type == "exp":
            self.scheduler = ExponentialLR(self.optimizer,
                                           gamma=self.config.lr_gamma or 0.95)
        else:
            milestones = [
                int(step) for step in (
                    self.config.multi_step_lr or "10,20,30").split(",")
            ]
            self.scheduler = MultiStepLR(self.optimizer,
                                         milestones=milestones,
                                         gamma=self.config.lr_gamma)
    else:
        self.scheduler = None