def test_warmup_constant_scheduler(self):
    """Constant-with-warmup schedule: LR ramps linearly over 4 steps, then stays flat."""
    sched = get_constant_schedule_with_warmup(self.optimizer, num_warmup_steps=4)
    lrs = unwrap_schedule(sched, self.num_steps)
    expected_learning_rates = [2.5, 5.0, 7.5, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0]
    self.assertEqual(len(lrs[0]), 1)
    self.assertListEqual([group[0] for group in lrs], expected_learning_rates)
    # Saving and reloading the scheduler must reproduce the exact same LR trajectory.
    sched = get_constant_schedule_with_warmup(self.optimizer, num_warmup_steps=4)
    lrs_reloaded = unwrap_and_save_reload_schedule(sched, self.num_steps)
    self.assertListEqual([group[0] for group in lrs],
                         [group[0] for group in lrs_reloaded])
def get_scheduler(optimizer, scheduler: str, warmup_steps: int, num_total: int):
    """Build a transformers LR scheduler by name.

    Args:
        optimizer: the optimizer whose learning rate is scheduled.
        scheduler: one of "constantlr", "warmuplinear", "warmupconstant",
            "warmupcosine", "warmupcosinewithhardrestarts".
        warmup_steps: number of linear warmup steps.
        num_total: total number of training steps (used by decaying schedules).

    Raises:
        AssertionError: if ``scheduler`` is not a recognized name.
    """
    # BUG FIX: the original assertion message omitted "warmuplinear" even
    # though it is an accepted value, which made the error misleading.
    assert scheduler in [
        "constantlr", "warmuplinear", "warmupconstant", "warmupcosine",
        "warmupcosinewithhardrestarts"
    ], ('scheduler should be one of ["constantlr","warmuplinear","warmupconstant",'
        '"warmupcosine","warmupcosinewithhardrestarts"]')
    if scheduler == 'constantlr':
        return transformers.get_constant_schedule(optimizer)
    elif scheduler == 'warmupconstant':
        return transformers.get_constant_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps)
    elif scheduler == 'warmuplinear':
        return transformers.get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps,
            num_training_steps=num_total)
    elif scheduler == 'warmupcosine':
        return transformers.get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps,
            num_training_steps=num_total)
    elif scheduler == 'warmupcosinewithhardrestarts':
        return transformers.get_cosine_with_hard_restarts_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps,
            num_training_steps=num_total)
def configure_optimizers(self):
    """Two optimizers: one for the gate (g_hat + placeholder, separate LRs),
    one for alpha; the gate optimizer gets a 1200-step constant-warmup schedule."""
    if isinstance(self.gate.placeholder, torch.nn.ParameterList):
        placeholder_params = self.gate.placeholder.parameters()
    else:
        placeholder_params = [self.gate.placeholder]
    gate_optimizer = LookaheadRMSprop(
        params=[
            {"params": self.gate.g_hat.parameters(),
             "lr": self.hparams.learning_rate},
            {"params": placeholder_params,
             "lr": self.hparams.learning_rate_placeholder},
        ],
        centered=True,
    )
    if isinstance(self.alpha, torch.Tensor):
        alpha_params = [self.alpha]
    else:
        alpha_params = self.alpha.parameters()
    alpha_optimizer = LookaheadRMSprop(
        params=alpha_params, lr=self.hparams.learning_rate_alpha)
    schedulers = [
        {"scheduler": get_constant_schedule_with_warmup(gate_optimizer, 12 * 100),
         "interval": "step"},
        get_constant_schedule(alpha_optimizer),
    ]
    return [gate_optimizer, alpha_optimizer], schedulers
def __init__(self, model_cls, tokenizer, sampling_function, scoring_function,
             **params):
    """
    Initialize PGTrainer.

    Args:
        model_cls: Hugging Face model class used to load both the trained
            policy pi_theta(x) and the frozen reference model a(x).
        tokenizer: tokenizer shared by both models.
        sampling_function: callable that draws samples from the policy.
        scoring_function: callable that scores sampled text.
        params (dict): Vanilla PG parameters for training. Can include:
            'lm_name' (str): pretrained checkpoint name/path
            'lr' (float): Adam learning rate, default: 1.41e-5
            'batch_size' (int): samples per optimisation step, default: 256
            'forward_batch_size' (int): samples forward passed through the
                model at a time, default: 16
            'minibatch_epochs' (int): optimisation epochs per batch, default: 4
            'scheduler' (str): one of 'cosine', 'constant', 'linear'
            'warmup_steps' (int), 'steps' (int), device placement keys.
    """
    super().__init__(tokenizer=tokenizer,
                     sampling_function=sampling_function,
                     scoring_function=scoring_function)
    self.params = self.default_params
    self.params.update(params)
    # pi_theta policy to be learned
    self.model = model_cls.from_pretrained(params['lm_name']).to(
        params['gpt2_device'])
    # original (frozen reference) model for computing kl(pi||a)
    self.orig_model = model_cls.from_pretrained(params['lm_name']).to(
        params['gpt2_orig_device'])
    self.ref_model = self.orig_model
    self.is_policy_eval = True
    # optimizer
    self.optimizer = Adam(self.model.parameters(), lr=self.params['lr'])
    # scheduler
    scheduler_ = self.params['scheduler']
    assert scheduler_ in ['cosine', 'constant',
                          'linear'], "unknown scheduler: {}".format(
                              self.params['scheduler'])
    if scheduler_ == 'constant':
        self.scheduler = get_constant_schedule_with_warmup(
            self.optimizer, self.params['warmup_steps'])
    elif scheduler_ == 'cosine':
        print("Cosine scheduler...")
        self.scheduler = get_cosine_schedule_with_warmup(
            self.optimizer, self.params['warmup_steps'],
            self.params['steps'] // self.params['batch_size'])
    elif scheduler_ == 'linear':
        # BUG FIX: get_linear_schedule_with_warmup requires num_training_steps;
        # the original call omitted it and raised TypeError on this branch.
        # Use the same total-step estimate as the cosine branch.
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer, self.params['warmup_steps'],
            self.params['steps'] // self.params['batch_size'])
    self.params['gradient_accumulation_steps'] = self.params[
        'batch_size'] // self.params['forward_batch_size']
def create_optimizer(models, exp_config):
    """Build an AdamW optimizer (weight-decay grouping) and a constant-warmup
    schedule over the named parameters of one model or a list of models.

    Args:
        models: a single torch.nn.Module or a list of modules.
        exp_config: dict with "weight_decay", "learning_rate", "adam_epsilon"
            and "warmup_steps".

    Returns:
        (optimizer, scheduler) tuple.
    """
    # Prepare optimizer and schedule (linear warmup and decay)
    if isinstance(models, torch.nn.Module):
        # if it is just one model, put it into a list
        models = [models]
    # BUG FIX: itertools.chain returns a one-shot iterator. The original code
    # consumed it entirely while building the first parameter group, so the
    # second (no-decay) group was always empty and biases/LayerNorm weights
    # silently received weight decay. Materialize the pairs once instead.
    parameters = list(
        itertools.chain(*[model.named_parameters() for model in models]))
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in parameters
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": exp_config["weight_decay"],
        },
        {
            "params": [p for n, p in parameters
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=exp_config["learning_rate"],
                      eps=exp_config["adam_epsilon"])
    scheduler = get_constant_schedule_with_warmup(
        optimizer, num_warmup_steps=exp_config["warmup_steps"])
    return optimizer, scheduler
def _get_scheduler(self, optimizer, scheduler: str, warmup_steps: int,
                   t_total: int):
    """Return the transformers LR scheduler named by ``scheduler``
    (case-insensitive); raise ValueError for an unknown name."""
    name = scheduler.lower()
    if name == 'constantlr':
        return transformers.get_constant_schedule(optimizer)
    if name == 'warmupconstant':
        return transformers.get_constant_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps)
    if name == 'warmuplinear':
        return transformers.get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps,
            num_training_steps=t_total)
    if name == 'warmupcosine':
        return transformers.get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps,
            num_training_steps=t_total)
    if name == 'warmupcosinewithhardrestarts':
        return transformers.get_cosine_with_hard_restarts_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps,
            num_training_steps=t_total)
    raise ValueError("Unknown scheduler {}".format(name))
def load_model(self, checkpoint=None):
    """Instantiate model/tokenizer from self.config, set up optimizer,
    constant-warmup scheduler and AMP scaler; optionally restore a checkpoint
    and the best smatch / last epoch recorded in the model directory's config.

    Args:
        checkpoint: optional path to a checkpoint to restore weights,
            optimizer and scheduler state from.
    """
    print('Loading model from', self.config['model'])
    self.model, self.tokenizer = instantiate_model_and_tokenizer(
        self.config['model'],
        additional_tokens_smart_init=self.config['smart_init'],
        dropout=self.config['dropout'],
        attention_dropout=self.config['attention_dropout'],
        penman_linearization=self.config['penman_linearization'],
        collapse_name_ops=self.config['collapse_name_ops'],
        use_pointer_tokens=self.config['use_pointer_tokens'],
        raw_graph=self.config['raw_graph'])
    self.model.to(self.device)
    # Load optimization components
    self.optimizer = AdamW(self.model.parameters(),
                           lr=self.config['learning_rate'],
                           weight_decay=self.config['weight_decay'])
    self.scheduler = transformers.get_constant_schedule_with_warmup(
        self.optimizer, num_warmup_steps=self.config['warmup_steps'])
    self.scaler = GradScaler(enabled=self.config['fp16'])
    # Reload checkpoint model weights and optimizer params if loading from
    # a checkpoint
    if checkpoint is not None:
        print('Checkpoint %s restored' % checkpoint)
        load_state_dict_from_checkpoint(checkpoint, self.model,
                                        self.optimizer, self.scheduler)
    # Try to load the smatch score and last_epoch from the config in the
    # model directory; this is best-effort, so failures are only logged.
    try:
        with open(os.path.join(self.model_dir, 'config.json')) as f:
            model_config = json.load(f)
        self.best_smatch = model_config['smatch_dev']
        self.start_epoch = model_config['last_epoch'] + 1
    except Exception:
        # BUG FIX: the original bare `except:` also swallowed SystemExit and
        # KeyboardInterrupt; Exception keeps the best-effort behavior while
        # letting process-control exceptions propagate.
        logger.exception('Unable to load config file in model directory')
def init_opt(args, model, logger):
    """Create the optimizer and LR scheduler selected by ``args``.

    Supported optimizers: adam, adamw, radam, sgd (asserted otherwise).
    Supported schedules: transformer, constant, linear, cosine, sgd;
    any other value raises ValueError.

    Returns:
        (opt, scheduler) tuple.
    """
    if args.optimizer == 'adam':
        # Adam with transformer schedule has a different set of default
        # hyperparameters (betas/eps from the "Attention is all you need" recipe):
        if args.lr_schedule == 'transformer':
            opt = torch.optim.Adam(model.params,
                                   lr=args.lr_multiply,
                                   betas=(0.9, 0.98),
                                   eps=1e-9,
                                   weight_decay=args.weight_decay)
        else:
            opt = torch.optim.Adam(model.params,
                                   lr=args.lr_multiply,
                                   betas=(args.beta0, 0.999),
                                   weight_decay=args.weight_decay)
    elif args.optimizer == 'adamw':
        opt = AdamW(model.params,
                    lr=args.lr_multiply,
                    weight_decay=args.weight_decay)
    elif args.optimizer == 'radam':
        # Imported lazily so the dependency is only needed when selected.
        import radam
        if args.warmup > 1:
            # RAdam performs its own variance rectification, so an external
            # warmup setting is ignored by this optimizer.
            logger.warning('With RAdam optimizer, warmup is never applied')
        opt = radam.RAdam(model.params,
                          lr=args.lr_multiply,
                          betas=(args.beta0, 0.999),
                          weight_decay=args.weight_decay)
    else:
        assert args.optimizer == 'sgd'
        opt = torch.optim.SGD(model.params,
                              lr=args.lr_multiply,
                              weight_decay=args.weight_decay)
    if args.lr_schedule == 'transformer':
        # Inverse-sqrt schedule parameterized by model dimension and warmup.
        lr_lambda = partial(get_transformer_learning_rate,
                            dimension=args.dimension,
                            warmup=args.warmup)
        scheduler = torch.optim.lr_scheduler.LambdaLR(opt, lr_lambda)
    elif args.lr_schedule == 'constant':
        scheduler = get_constant_schedule_with_warmup(
            opt, num_warmup_steps=args.warmup)
    elif args.lr_schedule == 'linear':
        # Total optimizer steps = iterations / gradient accumulation.
        scheduler = get_linear_schedule_with_warmup(
            opt,
            num_training_steps=sum(args.train_iterations) //
            args.gradient_accumulation_steps,
            num_warmup_steps=args.warmup)
    elif args.lr_schedule == 'cosine':
        scheduler = get_cosine_schedule_with_warmup(
            opt,
            num_training_steps=sum(args.train_iterations) //
            args.gradient_accumulation_steps,
            num_warmup_steps=args.warmup,
            num_cycles=0.5)
    elif args.lr_schedule == 'sgd':
        lr_lambda = partial(get_sgd_learning_rate, warmup=args.warmup)
        scheduler = torch.optim.lr_scheduler.LambdaLR(opt, lr_lambda)
    else:
        raise ValueError('Invalid learning rate scheduler.')
    return opt, scheduler
def init_scheduler(self, optimizer, total_steps, warmup_steps):
    """
    Initialization of lr scheduler.

    :param optimizer: The optimizer that is used for the training.
    :type optimizer: Optimizer
    :return: Created scheduler (None for an unrecognized scheduler name).
    :rtype: LambdaLR
    """
    kind = self.config["scheduler"]
    if kind == "linear":
        return transformers.get_linear_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=total_steps,
            last_epoch=-1)
    if kind == "cosine":
        return transformers.get_cosine_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=total_steps,
            num_cycles=0.5,
            last_epoch=-1)
    if kind == "constant":
        return transformers.get_constant_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=warmup_steps,
            last_epoch=-1)
    return None
def _create_scheduler(self, optimizer):
    """Build the LR scheduler for ``optimizer``; warmup length is a fixed
    proportion of the total training steps. Raises RuntimeError for an
    unsupported schedule name."""
    warmup = int(self.num_train_steps * self.args.warmup_proportion)
    schedule = self.args.lr_schedule
    if schedule == "warmup_linear":
        return get_linear_schedule_with_warmup(optimizer, warmup,
                                               self.num_train_steps)
    if schedule == "warmup_constant":
        return get_constant_schedule_with_warmup(optimizer, warmup)
    raise RuntimeError("Unsupported scheduler: " + schedule)
def initialize_optimizer_scheduler(self):
    """Pick optimizer + LR schedule based on the learner type: AdamW with a
    100-step constant warmup for BERT learners, plain Adam with StepLR otherwise."""
    trainable = [p for p in self.learner.parameters() if p.requires_grad]
    if isinstance(self.learner, BERTSequenceModel):
        self.optimizer = AdamW(trainable, lr=self.lr,
                               weight_decay=self.weight_decay)
        self.lr_scheduler = get_constant_schedule_with_warmup(
            self.optimizer, num_warmup_steps=100)
    else:
        self.optimizer = optim.Adam(trainable, lr=self.lr,
                                    weight_decay=self.weight_decay)
        self.lr_scheduler = optim.lr_scheduler.StepLR(self.optimizer,
                                                      step_size=500,
                                                      gamma=0.5)
def build_optimizer(self, trn, lr, epochs, gradient_accumulation,
                    warmup_steps, weight_decay, **kwargs):
    """Create a RAdam optimizer plus a constant-warmup schedule.

    A float ``warmup_steps`` is treated as a proportion of the total number
    of optimizer steps (len(trn) * epochs / gradient_accumulation).
    """
    total_steps = len(trn) * epochs // gradient_accumulation
    if isinstance(warmup_steps, float):
        warmup_steps = int(total_steps * warmup_steps)
    optimizer = RAdam(self.model.parameters(), lr=lr,
                      weight_decay=weight_decay)
    scheduler = get_constant_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps)
    return optimizer, scheduler
def prepare_optimizer(self):
    """Create self.optimizer and self.scheduler from self.config, then advance
    the scheduler to the current checkpoint position (self.step or self.epoch).

    Sets self.lr_scheduler_each_iter so the training loop knows whether to
    step the scheduler per iteration or per epoch.
    Raises NotImplementedError for unknown optimizer/scheduler names.
    """
    # differential lr for each sub module first
    self.differential_lr()
    # optimizer
    if self.config.optimizer_name == "Adam":
        self.optimizer = torch.optim.Adam(self.optimizer_grouped_parameters,
                                          eps=self.config.adam_epsilon)
    elif self.config.optimizer_name == "Ranger":
        self.optimizer = Ranger(self.optimizer_grouped_parameters)
    elif self.config.optimizer_name == "AdamW":
        self.optimizer = AdamW(self.optimizer_grouped_parameters,
                               eps=self.config.adam_epsilon,
                               betas=(0.9, 0.999))
    elif self.config.optimizer_name == "FusedAdam":
        self.optimizer = FusedAdam(self.optimizer_grouped_parameters,
                                   bias_correction=False)
    else:
        raise NotImplementedError
    # lr scheduler
    if self.config.lr_scheduler_name == "WarmupCosineAnealing":
        # total optimizer steps = epochs * batches / accumulation steps
        num_train_optimization_steps = self.config.num_epoch * len(self.train_data_loader) \
            // self.config.accumulation_steps
        self.scheduler = get_cosine_schedule_with_warmup(self.optimizer,
                                                         num_warmup_steps=self.config.warmup_steps,
                                                         num_training_steps=num_train_optimization_steps)
        self.lr_scheduler_each_iter = True
    elif self.config.lr_scheduler_name == "WarmRestart":
        self.scheduler = WarmRestart(self.optimizer, T_max=5, T_mult=1, eta_min=1e-6)
        self.lr_scheduler_each_iter = False
    elif self.config.lr_scheduler_name == "WarmupLinear":
        num_train_optimization_steps = self.config.num_epoch * len(self.train_data_loader) \
            // self.config.accumulation_steps
        self.scheduler = get_linear_schedule_with_warmup(self.optimizer,
                                                         num_warmup_steps=self.config.warmup_steps,
                                                         num_training_steps=num_train_optimization_steps)
        self.lr_scheduler_each_iter = True
    elif self.config.lr_scheduler_name == "ReduceLROnPlateau":
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, mode='max', factor=0.6, patience=1, min_lr=1e-7)
        self.lr_scheduler_each_iter = False
    elif self.config.lr_scheduler_name == "WarmupConstant":
        self.scheduler = get_constant_schedule_with_warmup(self.optimizer,
                                                           num_warmup_steps=self.config.warmup_steps)
        self.lr_scheduler_each_iter = True
    else:
        raise NotImplementedError
    # lr scheduler step for checkpoints
    # NOTE(review): passing an explicit position to scheduler.step() is
    # deprecated in recent PyTorch; also ReduceLROnPlateau.step() expects a
    # metric, not an epoch index — verify against the pinned torch version.
    if self.lr_scheduler_each_iter:
        self.scheduler.step(self.step)
    else:
        self.scheduler.step(self.epoch)
def configure_optimizers(self):
    """Adam over all parameters with a 200-step warmup-to-constant schedule,
    stepped once per optimizer step."""
    optimizer = torch.optim.Adam(self.parameters(), self.hparams.learning_rate)
    warmup_schedule = {
        "scheduler": get_constant_schedule_with_warmup(optimizer, 200),
        "interval": "step",
    }
    return [optimizer], [warmup_schedule]
def _initialise_lr_scheduler(self, optimizer):
    """Build the LR scheduler named by hparams.learning_rate_scheduler.

    Training-step count is derived from the train dataset size, batch size,
    gradient accumulation and max_epochs; warmup is a proportion of that.
    Returns a Lightning scheduler dict stepped per optimizer step and
    monitoring 'valid_loss'. Raises ValueError for an unknown name.
    """
    num_batches = len(self.datasets['train']) // self.hparams.batch_size
    num_training_steps = num_batches // self.hparams.accumulate_grad_batches * self.hparams.max_epochs
    warmup_steps = int(num_training_steps * self.hparams.warmup_proportion)
    if self.hparams.learning_rate_scheduler == 'linear_with_warmup':
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=num_training_steps)
    elif self.hparams.learning_rate_scheduler == 'cosine_with_hard_restarts_warmup':
        scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=num_training_steps,
            num_cycles=1)
    elif self.hparams.learning_rate_scheduler == 'cosine_schedule_with_warmup':
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=num_training_steps)
    elif self.hparams.learning_rate_scheduler == 'constant_schedule_with_warmup':
        scheduler = get_constant_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps)
    elif self.hparams.learning_rate_scheduler == 'cosine_annealing_warm_restarts':
        # NOTE(review): warmup_steps is used here as T_0 (first restart
        # period), not as a warmup length — confirm this is intended.
        scheduler = CosineAnnealingWarmRestarts(optimizer, warmup_steps)
    elif self.hparams.learning_rate_scheduler == 'reduce_on_plateau':
        scheduler = ReduceLROnPlateau(optimizer)
    elif self.hparams.learning_rate_scheduler == 'constant':
        # gamma=1.0 makes StepLR a no-op, i.e. a truly constant LR.
        scheduler = StepLR(optimizer, 10, gamma=1.0)
    else:
        raise ValueError(
            f'learning_rate_scheduler needs to be one of '
            f'linear_with_warmup, cosine_with_hard_restarts_warmup, cosine_schedule_with_warmup, '
            f'constant_schedule_with_warmup, cosine_annealing_warm_restarts, reduce_on_plateau, '
            f'step_lr. '
            f'Given: {self.hparams.learning_rate_scheduler}')
    logger.info(f'SCHEDULER: {self.hparams.learning_rate_scheduler} '
                f'num_batches={num_batches} '
                f'num_training_steps={num_training_steps} '
                f'warmup_steps={warmup_steps}')
    return {
        'scheduler': scheduler,
        'monitor': 'valid_loss',
        'interval': 'step',
        'frequency': 1
    }
def configure_optimizers(self):
    """Single Lamb optimizer over generator + discriminator parameters with
    weight-decay grouping, plus the LR schedule named by config.lr_schedule
    ('linear', 'cosine' or 'constant'), stepped per optimizer step.

    Raises:
        ValueError: if config.lr_schedule is not a recognized name.
    """
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            # decayed: everything except biases and LayerNorm weights
            "params": [
                p for n, p in self.generator.named_parameters()
                if not any(nd in n for nd in no_decay)
            ] + [
                p for n, p in self.discriminator.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": self.config.weight_decay,
        },
        {
            "params": [
                p for n, p in self.generator.named_parameters()
                if any(nd in n for nd in no_decay)
            ] + [
                p for n, p in self.discriminator.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0
        },
    ]
    t_total = self.config.num_steps
    optimizer = Lamb(optimizer_grouped_parameters,
                     lr=self.config.learning_rate,
                     eps=self.config.epsilon)
    if self.config.lr_schedule == 'linear':
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.config.warmup_steps,
            num_training_steps=t_total)
    elif self.config.lr_schedule == 'cosine':
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.config.warmup_steps,
            num_training_steps=t_total)
    elif self.config.lr_schedule == 'constant':
        scheduler = get_constant_schedule_with_warmup(
            optimizer, num_warmup_steps=self.config.warmup_steps)
    else:
        # BUG FIX: an unrecognized lr_schedule previously left `scheduler`
        # unbound and crashed with a confusing NameError below.
        raise ValueError(
            "Unknown lr_schedule: {}".format(self.config.lr_schedule))
    scheduler_config = {'scheduler': scheduler, 'interval': 'step'}
    return [optimizer], [scheduler_config]
def configure_optimizers(self):
    """
    Sets different Learning rates for different parameter groups
    (classification head at the base LR, encoder at its own LR) and attaches
    a constant-warmup schedule stepped per optimizer step.
    """
    parameters = [
        {"params": self.classification_head.parameters()},
        {
            "params": self.bert.parameters(),
            "lr": self.hparams.encoder_learning_rate,
        },
    ]
    optimizer = optim.Adam(parameters, lr=self.hparams.learning_rate)
    scheduler = get_constant_schedule_with_warmup(
        optimizer, self.hparams.warmup_steps
    )
    # BUG FIX: the scheduler was built but the original returned an empty
    # scheduler list, so the warmup never took effect. Return it with a
    # per-step interval so warmup_steps counts optimizer steps, not epochs.
    return [optimizer], [{"scheduler": scheduler, "interval": "step"}]
def set_optimizer(self, model_0):
    """Build the optimizer (and, for AdamW, a 2000-step constant-warmup
    scheduler) over ``model_0``'s trainable parameters.

    Parameters are split into four groups: BERT-like vs. other modules, each
    with biases excluded from weight decay; BERT groups use the 'bert_lr'
    learning rate.

    Returns:
        (optimizer, scheduler) — scheduler is None for plain Adam.

    Raises:
        ValueError: if self.o_name is not 'adam' or 'adamw'.
    """
    # OPTIMIZER
    # do not regularize biases
    paramsbert = []
    paramsbert0reg = []
    paramsothers = []
    paramsothers0reg = []
    for p_name, p_value in model_0.named_parameters():
        if not p_value.requires_grad:
            continue
        if 'bert' in p_name or 'pretrain_lm' in p_name or 'word_embed' in p_name:
            if '.bias' in p_name:
                paramsbert0reg += [p_value]
            else:
                paramsbert += [p_value]
        else:
            if '.bias' in p_name:
                paramsothers0reg += [p_value]
            else:
                paramsothers += [p_value]
    groups = [
        dict(params=paramsbert, lr=self.params['bert_lr']),
        dict(params=paramsothers),
        dict(params=paramsbert0reg, lr=self.params['bert_lr'],
             weight_decay=0.0),
        dict(params=paramsothers0reg, weight_decay=0.0)
    ]
    scheduler = None
    if self.o_name == 'adam':
        optimizer = optim.Adam(groups,
                               lr=self.params['lr'],
                               weight_decay=float(self.params['reg']),
                               amsgrad=True)
    elif self.o_name == 'adamw':
        optimizer = AdamW(groups,
                          lr=self.params['lr'],
                          weight_decay=float(self.params['reg']),
                          correct_bias=False)
        scheduler = get_constant_schedule_with_warmup(
            optimizer, num_warmup_steps=2000)
    else:
        # BUG FIX: an unknown optimizer name previously fell through and
        # crashed with UnboundLocalError at the return; fail clearly instead.
        raise ValueError('Unknown optimizer name: {}'.format(self.o_name))
    # Train Model
    print_options(self.params)
    for p_name, p_value in model_0.named_parameters():
        if p_value.requires_grad:
            print(p_name)
    return optimizer, scheduler
def configure_optimizers(self):
    """Adam optimizer; linear-decay warmup when the total step count is known,
    otherwise a constant schedule with warmup. Stepped per optimizer step."""
    optimizer = torch.optim.Adam(self.model.parameters(), lr=self._lr)
    if self._num_training_steps:
        scheduler = transformers.get_linear_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=self._num_warmup_steps,
            num_training_steps=self._num_training_steps)
    else:
        scheduler = transformers.get_constant_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=self._num_warmup_steps,
        )
    return [optimizer], [{"scheduler": scheduler, "interval": "step"}]
def get_model():
    """Build (model, optimizer, scheduler, early-stopping) from global `args`:
    a BERT classifier with AdamW + warmup for 'trans', otherwise a CNN/LSTM
    with AdamW + ReduceLROnPlateau.
    """
    if args.model == 'trans':
        transformer_config = BertConfig.from_pretrained('bert-base-uncased',
                                                        num_labels=args.labels)
        if args.init_only:
            # Random init with the pretrained architecture/config only.
            model = BertForSequenceClassification(
                config=transformer_config).to(device)
        else:
            model = BertForSequenceClassification.from_pretrained(
                'bert-base-uncased', config=transformer_config).to(device)
        param_optimizer = list(model.named_parameters())
        # Biases and LayerNorm parameters are excluded from weight decay.
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr)
        es = EarlyStopping(patience=args.patience,
                           percentage=False,
                           mode='max',
                           min_delta=0.0)
        # NOTE(review): num_warmup_steps=0.05 is a float — warmup steps are
        # an integer step count, so this effectively zeroes the LR only at
        # step 0. If 5% of total steps was intended, it must be multiplied
        # by the total step count — verify intent.
        scheduler = get_constant_schedule_with_warmup(optimizer,
                                                      num_warmup_steps=0.05)
    else:
        if args.model == 'cnn':
            model = CNN_MODEL(tokenizer, args,
                              n_labels=args.labels).to(device)
        elif args.model == 'lstm':
            model = LSTM_MODEL(tokenizer, args,
                               n_labels=args.labels).to(device)
        optimizer = AdamW(model.parameters(), lr=args.lr)
        scheduler = ReduceLROnPlateau(optimizer, verbose=True)
        es = EarlyStopping(patience=args.patience,
                           percentage=False,
                           mode='max',
                           min_delta=0.0)
    return model, optimizer, scheduler, es
def configure_optimizer(m, h):
    """Group m's parameters via add_to_group per h["opt_config"], drop empty
    groups, and return an AdamW optimizer with a constant-warmup schedule."""
    opt_config = h["opt_config"]
    prelim_groups = {}
    for name, param in m.named_parameters():
        add_to_group(opt_config, prelim_groups, name, param)
    # Keep only non-empty parameter groups.
    groups = [
        group for group_list in prelim_groups.values()
        for group in group_list if len(group["params"]) > 0
    ]
    optimizer = transformers.AdamW(params=groups)
    lr_scheduler = transformers.get_constant_schedule_with_warmup(
        optimizer, num_warmup_steps=h["num_warmup_steps"])
    return {"optimizer": optimizer, "lr_scheduler": lr_scheduler}
def build_scheduler(optimizer, cfg, num_training_steps):
    """Return the scheduler named by cfg['scheduler'] (constant- or
    cosine-with-warmup); raise for anything else."""
    name = cfg['scheduler']
    warmup_steps = cfg['scheduler_warmup_steps']
    if name == 'constant_schedule_with_warmup':
        return transformers.get_constant_schedule_with_warmup(
            optimizer, warmup_steps)
    if name == 'cosine_schedule_with_warmup':
        return transformers.get_cosine_schedule_with_warmup(
            optimizer, warmup_steps, num_training_steps)
    raise Exception(
        'Scheduler name invalid, choices are: "constant_schedule_with_warmup"'
        + '\n' + 'or "cosine_schedule_with_warmup"')
def get_model(model_args, device, embeddings=None):
    """Build (model, optimizer, scheduler): a wrapped BERT classifier with
    AdamW + warmup for 'transformer', otherwise a CNN/LSTM over the supplied
    embeddings with AdamW + ReduceLROnPlateau.
    """
    if model_args.model == 'transformer':
        transformer_config = BertConfig.from_pretrained('bert-base-uncased',
                                                        num_labels=model_args.labels)
        if model_args.init_only:
            # Random init with the pretrained architecture/config only.
            transformer_model = BertForSequenceClassification(
                config=transformer_config).to(
                device)
            model = BertWrapper(transformer_model)
        else:
            transformer_model = BertForSequenceClassification.from_pretrained(
                'bert-base-uncased', config=transformer_config).to(device)
            model = BertWrapper(transformer_model)
        param_optimizer = list(model.named_parameters())
        # Biases and LayerNorm parameters are excluded from weight decay.
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in param_optimizer
                           if not any(nd in n for nd in no_decay)],
                'weight_decay': 0.01
            },
            {
                'params': [p for n, p in param_optimizer
                           if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0
            }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=model_args.lr)
        # NOTE(review): num_warmup_steps=0.05 is a float — warmup steps are
        # an integer step count, so this effectively zeroes the LR only at
        # step 0. If 5% of total steps was intended, it must be multiplied
        # by the total step count — verify intent.
        scheduler = get_constant_schedule_with_warmup(optimizer,
                                                      num_warmup_steps=0.05)
    else:
        if model_args.model == 'cnn':
            model = CNN_MODEL(embeddings, model_args,
                              n_labels=model_args.labels).to(device)
        elif model_args.model == 'lstm':
            model = LSTM_MODEL(embeddings, model_args,
                               n_labels=model_args.labels).to(device)
        optimizer = AdamW(model.parameters(), lr=model_args.lr)
        scheduler = ReduceLROnPlateau(optimizer, verbose=True)
    return model, optimizer, scheduler
def get_schedule(args, optimizer, num_training_steps):
    """Return the LR schedule named by args.schedule ('linear', 'cosine',
    anything else → constant-with-warmup).

    args.num_warmup_steps in (0, 1) is interpreted as a proportion of
    ``num_training_steps``; otherwise it is an absolute step count.
    """
    num_warmup_steps = args.num_warmup_steps
    # BUG FIX: the original condition was
    #   `0 < num_warmup_steps and num_training_steps < 1`
    # which compared num_training_steps (not the warmup value) against 1,
    # so a fractional warmup ratio was never scaled into an absolute count.
    if 0 < num_warmup_steps < 1:
        num_warmup_steps = int(num_warmup_steps * num_training_steps)
    if args.schedule == 'linear':
        return get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps)
    elif args.schedule == 'cosine':
        return get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps,
            num_cycles=0.5)
    else:  # constant
        return get_constant_schedule_with_warmup(
            optimizer, num_warmup_steps=num_warmup_steps)
def configure_optimizers(self):
    """Two optimizers (gate + placeholder, and alpha); the gate optimizer gets
    a 200-step constant warmup stepped per batch, alpha stays constant."""
    gate_optimizer = LookaheadRMSprop(
        params=list(self.gate.parameters()) + [self.placeholder],
        lr=self.hparams.learning_rate,
        centered=True,
    )
    alpha_optimizer = LookaheadRMSprop(
        params=[self.alpha],
        lr=self.hparams.learning_rate_alpha,
    )
    warmup_schedule = {
        "scheduler": get_constant_schedule_with_warmup(gate_optimizer, 200),
        "interval": "step",
    }
    return ([gate_optimizer, alpha_optimizer],
            [warmup_schedule, get_constant_schedule(alpha_optimizer)])
def configure_optimizers(self):
    """AdamW (or plain Adam when use_bert_adam is false) with weight-decay
    grouping and a constant-warmup schedule stepped every batch."""
    # Prepare optimizer; drop pooler params, which are unused and therefore
    # produce None grads that break apex.
    named_params = [(n, p) for n, p in self.named_parameters()
                    if 'pooler' not in n]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    decayed, exempt = [], []
    for name, param in named_params:
        if any(nd in name for nd in no_decay):
            exempt.append(param)
        else:
            decayed.append(param)
    grouped = [
        {'params': decayed, 'weight_decay': 0.01},
        {'params': exempt, 'weight_decay': 0.0},
    ]
    if self.use_bert_adam:
        optimizer = AdamW(grouped, lr=self.learning_rate)
    else:
        optimizer = torch.optim.Adam(grouped, lr=self.learning_rate)
    scheduler = get_constant_schedule_with_warmup(
        optimizer, num_warmup_steps=self.warmup_steps)
    return [optimizer], [{
        'scheduler': scheduler,
        'interval': 'step',
        'frequency': 1
    }]
def get_optimizer_scheduler(model, args):
    """AdamW (optionally wrapped in Lookahead) plus two schedulers: a
    constant-with-warmup ramp and a ReduceLROnPlateau on a max-metric."""
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=args.lr_base,
        betas=args.betas,
        eps=args.eps,
        weight_decay=args.wd,
    )
    if args.opt_lookahead:
        optimizer = Lookahead(optimizer)
    warmup_scheduler = get_constant_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup)
    plateau_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode="max",
        factor=args.lr_drop_rate,
        patience=args.lr_patience,
        threshold=0,
        verbose=True,
    )
    return optimizer, warmup_scheduler, plateau_scheduler
def init_scheduler(self,
                   optimizer: Optimizer,
                   num_warmup_steps: int,
                   num_training_steps: int,
                   last_step: int = -1) -> LambdaLR:
    """
    Initialization of lr scheduler.

    :param optimizer: The optimizer that is used for the training.
    :type optimizer: Optimizer
    :return: Created scheduler (None for an unrecognized scheduler name).
    :rtype: LambdaLR
    """
    if last_step > 0:
        # Resuming mid-run: schedulers demand 'initial_lr' on every group.
        for group in optimizer.param_groups:
            group.setdefault('initial_lr', group['lr'])
    kind = self.config["scheduler"]
    if kind == "linear":
        return transformers.get_linear_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps,
            last_epoch=last_step)
    if kind == "cosine":
        return transformers.get_cosine_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps,
            num_cycles=0.5,
            last_epoch=last_step)
    if kind == "constant":
        return transformers.get_constant_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=num_warmup_steps,
            last_epoch=last_step)
    return None
def train(args):
    """Full training entry point: seeds RNGs, loads concept embeddings and the
    GconAttn data loaders, builds an LM+GconAttn model with grouped parameters
    and the configured LR schedule, then trains with per-epoch dev/test
    evaluation, best-model checkpointing and early stopping.
    """
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available() and args.cuda:
        torch.cuda.manual_seed(args.seed)
    print('configuration:')
    print('\n'.join('\t{:15} {}'.format(k + ':', str(v))
                    for k, v in sorted(dict(vars(args)).items())))
    print()
    config_path = os.path.join(args.save_dir, 'config.json')
    model_path = os.path.join(args.save_dir, 'model.pt')
    log_path = os.path.join(args.save_dir, 'log.csv')
    if args.save:
        export_config(args, config_path)
        check_path(model_path)
        with open(log_path, 'w') as fout:
            fout.write('step,train_acc,dev_acc\n')

    ###################################################################################################
    #                                            Load data                                            #
    ###################################################################################################
    cp_emb = [np.load(path) for path in args.ent_emb_paths]
    cp_emb = torch.tensor(np.concatenate(cp_emb, 1))
    concept_num, concept_dim = cp_emb.size(0), cp_emb.size(1)
    print('num_concepts: {}, concept_dim: {}'.format(concept_num, concept_dim))
    device = torch.device(
        "cuda:0" if torch.cuda.is_available() and args.cuda else "cpu")
    dataset = GconAttnDataLoader(
        train_statement_path=args.train_statements,
        train_concept_jsonl=args.train_concepts,
        dev_statement_path=args.dev_statements,
        dev_concept_jsonl=args.dev_concepts,
        test_statement_path=args.test_statements,
        test_concept_jsonl=args.test_concepts,
        concept2id_path=args.cpnet_vocab_path,
        batch_size=args.batch_size,
        eval_batch_size=args.eval_batch_size,
        device=device,
        model_name=args.encoder,
        max_cpt_num=max_cpt_num[args.dataset],
        max_seq_length=args.max_seq_len,
        is_inhouse=args.inhouse,
        inhouse_train_qids_path=args.inhouse_train_qids,
        subsample=args.subsample,
        format=args.format)
    print('len(train_set): {} len(dev_set): {} len(test_set): {}'.format(
        dataset.train_size(), dataset.dev_size(), dataset.test_size()))
    print()

    ###################################################################################################
    #                                           Build model                                           #
    ###################################################################################################
    lstm_config = get_lstm_config_from_args(args)
    model = LMGconAttn(model_name=args.encoder,
                       concept_num=concept_num,
                       concept_dim=args.cpt_out_dim,
                       concept_in_dim=concept_dim,
                       freeze_ent_emb=args.freeze_ent_emb,
                       pretrained_concept_emb=cp_emb,
                       hidden_dim=args.decoder_hidden_dim,
                       dropout=args.dropoutm,
                       encoder_config=lstm_config)
    if args.freeze_ent_emb:
        freeze_net(model.decoder.concept_emb)
    try:
        model.to(device)
    except RuntimeError as e:
        # Typically CUDA OOM — report a zero-result run and bail out.
        print(e)
        print('best dev acc: 0.0 (at epoch 0)')
        print('final test acc: 0.0')
        print()
        return
    # Four parameter groups: encoder/decoder, each split into decayed vs.
    # decay-exempt (biases and LayerNorm), with separate learning rates.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    grouped_parameters = [
        {
            'params': [
                p for n, p in model.encoder.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': args.weight_decay,
            'lr': args.encoder_lr
        },
        {
            'params': [
                p for n, p in model.encoder.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0,
            'lr': args.encoder_lr
        },
        {
            'params': [
                p for n, p in model.decoder.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': args.weight_decay,
            'lr': args.decoder_lr
        },
        {
            'params': [
                p for n, p in model.decoder.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0,
            'lr': args.decoder_lr
        },
    ]
    optimizer = OPTIMIZER_CLASSES[args.optim](grouped_parameters)
    if args.lr_schedule == 'fixed':
        scheduler = get_constant_schedule(optimizer)
    elif args.lr_schedule == 'warmup_constant':
        # NOTE(review): `warmup_steps=`/`t_total=` are the pre-renaming kwargs;
        # current transformers uses num_warmup_steps/num_training_steps —
        # verify against the pinned transformers version.
        scheduler = get_constant_schedule_with_warmup(
            optimizer, warmup_steps=args.warmup_steps)
    elif args.lr_schedule == 'warmup_linear':
        max_steps = int(args.n_epochs *
                        (dataset.train_size() / args.batch_size))
        scheduler = get_linear_schedule_with_warmup(
            optimizer, warmup_steps=args.warmup_steps, t_total=max_steps)
    print('parameters:')
    for name, param in model.decoder.named_parameters():
        if param.requires_grad:
            print('\t{:45}\ttrainable\t{}'.format(name, param.size()))
        else:
            print('\t{:45}\tfixed\t{}'.format(name, param.size()))
    num_params = sum(p.numel() for p in model.decoder.parameters()
                     if p.requires_grad)
    print('\ttotal:', num_params)
    if args.loss == 'margin_rank':
        loss_func = nn.MarginRankingLoss(margin=0.1, reduction='mean')
    elif args.loss == 'cross_entropy':
        loss_func = nn.CrossEntropyLoss(reduction='mean')

    ###################################################################################################
    #                                            Training                                             #
    ###################################################################################################
    print('-' * 71)
    global_step, best_dev_epoch = 0, 0
    best_dev_acc, final_test_acc, total_loss = 0.0, 0.0, 0.0
    start_time = time.time()
    model.train()
    freeze_net(model.encoder)
    try:
        for epoch_id in range(args.n_epochs):
            # Encoder is frozen until unfreeze_epoch, re-frozen at refreeze_epoch.
            if epoch_id == args.unfreeze_epoch:
                unfreeze_net(model.encoder)
            if epoch_id == args.refreeze_epoch:
                freeze_net(model.encoder)
            model.train()
            for qids, labels, *input_data in dataset.train():
                optimizer.zero_grad()
                bs = labels.size(0)
                # Gradient accumulation over mini-batches of the full batch;
                # each partial loss is scaled by (b - a) / bs so the summed
                # gradient equals the full-batch mean.
                for a in range(0, bs, args.mini_batch_size):
                    b = min(a + args.mini_batch_size, bs)
                    logits, _ = model(*[x[a:b] for x in input_data],
                                      layer_id=args.encoder_layer)
                    if args.loss == 'margin_rank':
                        num_choice = logits.size(1)
                        flat_logits = logits.view(-1)
                        # NOTE(review): this path uses the unsliced `labels`
                        # while cross_entropy uses labels[a:b]; with
                        # mini_batch_size < bs the mask length won't match
                        # flat_logits — looks like a bug, verify.
                        correct_mask = F.one_hot(
                            labels, num_classes=num_choice).view(
                                -1)  # of length batch_size*num_choice
                        correct_logits = flat_logits[
                            correct_mask == 1].contiguous().view(-1, 1).expand(
                                -1, num_choice - 1).contiguous().view(
                                    -1)  # of length batch_size*(num_choice-1)
                        wrong_logits = flat_logits[
                            correct_mask ==
                            0]  # of length batch_size*(num_choice-1)
                        y = wrong_logits.new_ones((wrong_logits.size(0), ))
                        loss = loss_func(correct_logits, wrong_logits,
                                         y)  # margin ranking loss
                    elif args.loss == 'cross_entropy':
                        loss = loss_func(logits, labels[a:b])
                    loss = loss * (b - a) / bs
                    loss.backward()
                    total_loss += loss.item()
                if args.max_grad_norm > 0:
                    nn.utils.clip_grad_norm_(model.parameters(),
                                             args.max_grad_norm)
                # NOTE(review): scheduler.step() is called before
                # optimizer.step(); PyTorch >= 1.1 expects the opposite order
                # (this skips the first LR value and triggers a warning).
                scheduler.step()
                optimizer.step()
                if (global_step + 1) % args.log_interval == 0:
                    total_loss /= args.log_interval
                    ms_per_batch = 1000 * (time.time() -
                                           start_time) / args.log_interval
                    print(
                        '| step {:5} | lr: {:9.7f} | loss {:7.4f} | ms/batch {:7.2f} |'
                        .format(global_step,
                                scheduler.get_lr()[0], total_loss,
                                ms_per_batch))
                    total_loss = 0
                    start_time = time.time()
                global_step += 1
            # End-of-epoch evaluation and checkpointing.
            model.eval()
            dev_acc = evaluate_accuracy(dataset.dev(), model)
            test_acc = evaluate_accuracy(
                dataset.test(), model) if args.test_statements else 0.0
            print('-' * 71)
            print('| step {:5} | dev_acc {:7.4f} | test_acc {:7.4f} |'.format(
                global_step, dev_acc, test_acc))
            print('-' * 71)
            if args.save:
                with open(log_path, 'a') as fout:
                    fout.write('{},{},{}\n'.format(global_step, dev_acc,
                                                   test_acc))
            if dev_acc >= best_dev_acc:
                best_dev_acc = dev_acc
                final_test_acc = test_acc
                best_dev_epoch = epoch_id
                if args.save:
                    torch.save([model, args], model_path)
                    print(f'model saved to {model_path}')
            model.train()
            start_time = time.time()
            # Early stop once dev accuracy has stalled after unfreezing.
            if epoch_id > args.unfreeze_epoch and epoch_id - best_dev_epoch >= args.max_epochs_before_stop:
                break
    except (KeyboardInterrupt, RuntimeError) as e:
        # Interrupt/CUDA failure: fall through and report whatever was reached.
        print(e)
    print()
    print('training ends in {} steps'.format(global_step))
    print('best dev acc: {:.4f} (at epoch {})'.format(best_dev_acc,
                                                      best_dev_epoch))
    print('final test acc: {:.4f}'.format(final_test_acc))
    print()
def train(args, train_dataset, model, tokenizer):
    """Run the full fine-tuning loop over ``train_dataset``.

    Builds the dataloader, an AdamW optimizer with weight-decay parameter
    grouping, and a warmup LR scheduler; optionally wraps the model for
    apex fp16, multi-GPU (DataParallel) or distributed (DDP) training; then
    iterates epochs with gradient accumulation, periodic evaluation,
    TensorBoard logging and checkpoint saving.

    Args:
        args: namespace of hyper-parameters and runtime flags (batch sizes,
            steps, fp16/distributed settings, paths, ...).
        train_dataset: torch ``Dataset`` of pre-tokenized training examples.
        model: the language model being fine-tuned.
        tokenizer: tokenizer, used for MLM masking and sample printing.

    Returns:
        Tuple ``(global_step, mean training loss per optimization step)``.

    Raises:
        ImportError: if ``args.fp16`` is set but apex is not installed.
        KeyboardInterrupt / SystemExit: re-raised after saving a checkpoint.
    """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriterP(args.output_dir)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    # Total number of optimization steps; an explicit max_steps wins and the
    # epoch count is recomputed from it.
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay).
    # Biases and LayerNorm weights are conventionally exempt from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if p.requires_grad and not any(nd in n for nd in no_decay)
        ],
        'weight_decay': args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if p.requires_grad and any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    warmup_steps = args.warmup_samples // args.train_batch_size
    if args.lr_decay:
        # FIX: use the transformers keyword names (num_warmup_steps /
        # num_training_steps). The previous warmup_steps=/t_total= spelling
        # belongs to the old pytorch-transformers API and raises TypeError
        # under the transformers versions whose num_warmup_steps API this
        # file already uses elsewhere (see get_scheduler).
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=t_total)
    else:
        scheduler = get_constant_schedule_with_warmup(optimizer, warmup_steps)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        # NOTE(review): hardcodes 'cuda' while the rest of the loop moves
        # tensors to args.device — confirm the two always agree when fp16
        # is enabled.
        model.to('cuda')
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    # Resume the global step counter from a previous run when available.
    try:
        with open(os.path.join(args.model_name_or_path, 'step.txt'), 'r') as c:
            global_step = int(c.readline())
    except OSError:  # no saved step counter -> start from scratch
        global_step = 0

    tr_loss, logging_loss = 0.0, 0.0
    moving_loss = MovingLoss(10000 // args.logging_steps)
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(
        args)  # Added here for reproducibility (even between python 2 and 3)
    try:
        for _ in train_iterator:
            epoch_iterator = tqdm(train_dataloader,
                                  desc="Iteration",
                                  disable=args.local_rank not in [-1, 0])
            for step, batch in enumerate(epoch_iterator):
                inputs, labels = mask_tokens(
                    batch, tokenizer, args) if args.mlm else (batch, batch)
                inputs = inputs.to(args.device)
                labels = labels.to(args.device)
                model.train()
                outputs = model(
                    inputs, masked_lm_labels=labels) if args.mlm else model(
                        inputs, labels=labels)
                loss = outputs[
                    0]  # model outputs are always tuple in pytorch-transformers (see doc)

                if args.n_gpu > 1:
                    loss = loss.mean(
                    )  # mean() to average on multi-gpu parallel training
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                tr_loss += loss.item()
                moving_loss.add(loss.item())
                # Only step the optimizer once per accumulation window.
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), args.max_grad_norm)
                    else:
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       args.max_grad_norm)
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training and global_step % args.eval_steps == 0:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer,
                                           f"checkpoint-{global_step}")
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value,
                                                 global_step)
                    if args.local_rank in [
                            -1, 0
                    ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                        tb_writer.add_scalar('lr',
                                             scheduler.get_lr()[0],
                                             global_step)
                        tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                             args.logging_steps, global_step)
                        logging_loss = tr_loss
                        epoch_iterator.set_postfix(
                            MovingLoss=f'{moving_loss.loss:.2f}',
                            Perplexity=
                            f'{torch.exp(torch.tensor(moving_loss.loss)):.2f}')

                    if args.local_rank in [
                            -1, 0
                    ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                        # Save model checkpoint
                        save_state(args, model, tokenizer, global_step)

                if args.max_steps > 0 and global_step > args.max_steps:
                    epoch_iterator.close()
                    break

            print_sample(model, tokenizer, args.device, args)
            if args.max_steps > 0 and global_step > args.max_steps:
                train_iterator.close()
                break
    except (KeyboardInterrupt, SystemExit):
        # Persist state so an interrupted run can be resumed, then re-raise.
        save_state(args, model, tokenizer, global_step)
        raise

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    # max(..., 1) guards against ZeroDivisionError when no optimization step
    # ran (e.g. a resumed run already past max_steps).
    return global_step, tr_loss / max(global_step, 1)