def __init__(self, emb_dim=768, hid_size=32, layers=1, weights_mat=None, tr_labs=None,
             b_size=24, cp_dir='models/checkpoints/cim', lr=0.001, start_epoch=0,
             patience=3, step=1, gamma=0.75, n_eps=10, cim_type='cim', context='art'):
    self.start_epoch = start_epoch
    self.cp_dir = cp_dir
    self.device, self.use_cuda = get_torch_device()

    self.emb_dim = emb_dim
    self.hidden_size = hid_size
    self.batch_size = b_size

    # Class weights could be made to depend on a class-weight argument set on input.
    if cim_type == 'cim':
        self.criterion = CrossEntropyLoss(weight=torch.tensor([.20, .80], device=self.device),
                                          reduction='sum')
    else:
        self.criterion = CrossEntropyLoss(weight=torch.tensor([.25, .75], device=self.device),
                                          reduction='sum')
    # Alternative criteria kept for reference (set criterion on input):
    # self.criterion = NLLLoss(weight=torch.tensor([.15, .85], device=self.device))
    # n_pos = len([l for l in tr_labs if l == 1])
    # class_weight = 1 - (n_pos / len(tr_labs))
    # self.criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([.85], dtype=torch.float,
    #                                                               device=self.device),
    #                                       reduction='sum')

    if start_epoch > 0:
        self.model = self.load_model()
    else:
        self.model = ContextAwareModel(input_size=self.emb_dim,
                                       hidden_size=self.hidden_size,
                                       bilstm_layers=layers,
                                       weights_matrix=weights_mat,
                                       device=self.device,
                                       cam_type=cim_type,
                                       context=context)
    self.model = self.model.to(self.device)
    if self.use_cuda:
        self.model.cuda()

    # Empty now; set during or after training.
    self.train_time = 0
    self.prev_val_f1 = 0
    self.cp_name = None  # depends on split type and current fold
    self.full_patience = patience
    self.current_patience = self.full_patience
    self.test_perf = []
    self.test_perf_string = ''

    # Set optimizer.
    nr_train_instances = len(tr_labs)
    nr_train_batches = int(nr_train_instances / b_size)
    half_tr_bs = int(nr_train_instances / 2)
    self.optimizer = AdamW(self.model.parameters(), lr=lr, eps=1e-8)

    # Set scheduler if desired:
    # self.scheduler = lr_scheduler.CyclicLR(self.optimizer, base_lr=lr, step_size_up=half_tr_bs,
    #                                        cycle_momentum=False, max_lr=lr * 30)
    num_train_warmup_steps = int(0.1 * (nr_train_batches * n_eps))  # warmup_proportion
def set_parameters(self, params):
    self.params = list(params)  # careful: params may be a generator
    if self.method == 'sgd':
        self.optimizer = optim.SGD(self.params, lr=self.lr)
    elif self.method == 'adagrad':
        self.optimizer = optim.Adagrad(self.params, lr=self.lr)
    elif self.method == 'adadelta':
        self.optimizer = optim.Adadelta(self.params, lr=self.lr)
    elif self.method == 'adam':
        self.optimizer = optim.Adam(self.params, lr=self.lr)
    elif self.method == 'bertadam':
        self.optimizer = AdamW(self.params, lr=self.lr)
    else:
        raise RuntimeError("Invalid optim method: " + self.method)
def get_optimizer(named_parameters, learning_rate, weight_decay, train_dataloader, n_epoch):
    """
    Get the optimizer and the learning rate scheduler.
    :param named_parameters: iterable of (name, parameter) pairs from the model
    :param learning_rate: base learning rate for AdamW
    :param weight_decay: weight decay applied to non-excluded parameters
    :param train_dataloader: training dataloader, used to count steps per epoch
    :param n_epoch: number of training epochs
    :return: (optimizer, scheduler) tuple
    """
    # Prepare optimizer and schedule (linear warmup and decay).
    # Materialize first: named_parameters may be a generator, and it is iterated twice below.
    named_parameters = list(named_parameters)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in named_parameters
                    if not any(nd in n for nd in no_decay) and p.requires_grad],
         'weight_decay': weight_decay},
        {'params': [p for n, p in named_parameters
                    if any(nd in n for nd in no_decay) and p.requires_grad],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

    # Get a linear scheduler.
    num_steps_epoch = len(train_dataloader)
    # ReduceLROnPlateau(self.optimizer, 'min')
    num_train_optimization_steps = int(num_steps_epoch * n_epoch) + 1
    warmup_steps = 100
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_steps,
                                                num_training_steps=num_train_optimization_steps)
    return optimizer, scheduler
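# Hedged usage sketch (not from the original source): how a helper like get_optimizer above
# is typically wired into a training loop, stepping the linear-warmup scheduler once per
# optimizer step. `model`, `train_dataloader`, the epoch count and the hyperparameter values
# are assumptions for illustration only.
import torch

optimizer, scheduler = get_optimizer(model.named_parameters(), learning_rate=2e-5,
                                     weight_decay=0.01, train_dataloader=train_dataloader,
                                     n_epoch=3)
model.train()
for epoch in range(3):
    for batch in train_dataloader:
        optimizer.zero_grad()
        loss = model(**batch).loss  # assumes a HF-style model whose output exposes .loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()  # the linear warmup/decay schedule advances per optimizer step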
def configure_optimizers(self):
    # lr, num_warmup_steps and num_training_steps are expected to be defined in the
    # enclosing module / notebook scope.
    optimizer = AdamW(self.parameters(), lr=lr, correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)
    return {'optimizer': optimizer, 'lr_scheduler': scheduler}
def get_optimizer(self, num_training_steps: int):
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in self.model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": self.args.weight_decay,
        },
        {
            "params": [
                p for n, p in self.model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=self.args.learning_rate,
                      eps=self.args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=self.args.warmup_steps,
        num_training_steps=num_training_steps)
    return optimizer, scheduler
def create_optimizer(self, model):
    args = self.args
    optimizer_grouped_parameters = self.patch_coordinator.create_optimizer_groups(
        model, self.args, self.sparse_args)
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    return optimizer
def _get_optimizer(self):
    """Get optimizer for different models.

    Returns:
        optimizer
    """
    if self.config.model_type == 'bert':
        no_decay = ['bias', 'gamma', 'beta']
        # Note: 'weight_decay_rate' is the key used by the older BertAdam API; transformers'
        # AdamW reads 'weight_decay' from parameter groups, so these per-group values may be
        # ignored in favour of the optimizer-level weight_decay below.
        optimizer_parameters = [{
            'params': [
                p for n, p in self.model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate': 0.01
        }, {
            'params': [
                p for n, p in self.model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate': 0.0
        }]
        optimizer = AdamW(optimizer_parameters,
                          lr=self.config.lr,
                          betas=(0.9, 0.999),
                          weight_decay=1e-8,
                          correct_bias=False)
    else:  # rnn
        optimizer = Adam(self.model.parameters(), lr=self.config.lr)
    return optimizer
def get_optimizers(
    self, num_training_steps: int
) -> Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]:
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in self.model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": self.args.weight_decay,
        },
        {
            "params": [p for n, p in self.model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=self.args.learning_rate, eps=self.args.adam_epsilon)
    # scheduler = get_linear_schedule_with_warmup(
    #     optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=num_training_steps
    # )
    if self.args.warmup_steps > 0:
        logger.info("*****Linear warmup over %d warmup_steps *****" % self.args.warmup_steps)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=self.args.warmup_steps,
            num_training_steps=num_training_steps
        )
    else:
        logger.info("*****Linear warmup over %.1f%% of training.*****" % (self.args.warmup_proportion * 100))
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(self.args.warmup_proportion * num_training_steps),
            num_training_steps=num_training_steps
        )
    return optimizer, scheduler
def get_optimizer_scheduler(params, num_training_steps, learning_rate=1e-5):
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in params if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.01,
        },
        {
            "params": [p for n, p in params if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        # args is expected to be defined in the enclosing module scope.
        num_warmup_steps=int(num_training_steps * args.warmup_ratio),
        num_training_steps=num_training_steps,
    )
    return optimizer, scheduler
def initialize_optimizer(model, train_dataloader, args):
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=1e-8)
    # Mixed-precision training via NVIDIA apex; WarmupLinearSchedule comes from the older
    # pytorch-transformers API (get_linear_schedule_with_warmup in newer transformers releases).
    model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=0,
                                     t_total=len(train_dataloader) * args.num_train_epochs)
    return optimizer, scheduler, model
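# Hedged usage sketch (not from the original source): how the apex-amp setup returned by
# initialize_optimizer above is typically consumed. With opt_level 'O1', the backward pass
# goes through amp.scale_loss so gradients are unscaled before optimizer.step(); the
# warmup-linear scheduler then advances once per step. `model`, `train_dataloader`, `args`
# and the clipping value are assumptions for illustration.
import torch
from apex import amp

optimizer, scheduler, model = initialize_optimizer(model, train_dataloader, args)
for batch in train_dataloader:
    optimizer.zero_grad()
    loss = model(**batch)[0]  # assumes a HF-style model that returns the loss first
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), 1.0)
    optimizer.step()
    scheduler.step()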
def configure_optimizers(self):
    param_optimizer = list(self.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=self.hparams.lr,
                      correct_bias=False)
    num_train_steps = len(self.train_dataloader()) * self.hparams.max_epochs
    num_warmup_steps = int(num_train_steps * self.hparams.warmup_ratio)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_steps)
    lr_scheduler = {
        'scheduler': scheduler,
        'name': 'cosine_schedule_with_warmup',
        'monitor': 'loss',
        'interval': 'step',
        'frequency': 1
    }
    return [optimizer], [lr_scheduler]
def configure_optimizers(self):
    # Prepare optimizer
    param_optimizer = list(self.model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=self.hparams.lr,
                      correct_bias=False)

    # Warm up lr
    num_workers = ((self.hparams.gpus if self.hparams.gpus is not None else 1)
                   * (self.hparams.num_nodes if self.hparams.num_nodes is not None else 1))
    data_len = len(self.train_dataloader().dataset)
    logging.info(f'number of workers {num_workers}, data length {data_len}')
    num_train_steps = int(data_len / (self.hparams.batch_size * num_workers) * self.hparams.max_epochs)
    logging.info(f'num_train_steps : {num_train_steps}')
    num_warmup_steps = int(num_train_steps * self.hparams.warmup_ratio)
    logging.info(f'num_warmup_steps : {num_warmup_steps}')
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_steps)
    lr_scheduler = {'scheduler': scheduler,
                    'monitor': 'loss',
                    'interval': 'step',
                    'frequency': 1}
    return [optimizer], [lr_scheduler]
def get_optimizers(model, learning_rate, adam_epsilon, weight_decay, num_training_steps):
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": weight_decay
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
    return optimizer, scheduler
def get_optimizer(
    model: nn.Module,
    learning_rate: float = 1e-5,
    adam_eps: float = 1e-8,
    weight_decay: float = 0.0,
) -> torch.optim.Optimizer:
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_eps)
    return optimizer
def configure_optimizers(self):
    optimizer = AdamW(self.parameters(),
                      self.hparams.learning_rate,
                      betas=(self.hparams.adam_beta1, self.hparams.adam_beta2),
                      eps=self.hparams.adam_epsilon)
    return optimizer
def get_optimizers(self):
    # Setup the optimizer and the learning rate scheduler.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in self.model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": cfg.weight_decay,
        },
        {
            "params": [
                p for n, p in self.model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=cfg.lr)
    num_training_steps = (self.reader.set_stats['train']['num_dials'] * cfg.epoch_num
                          // (cfg.gradient_accumulation_steps * cfg.batch_size))
    num_warmup_steps = cfg.warmup_steps if cfg.warmup_steps >= 0 else int(
        num_training_steps * 0.2)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps)
    return optimizer, scheduler
def configure_optimizers(self):
    param_optimizer = list(self.named_parameters())
    no_decay = ['bias', 'ln_']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizers = [
        AdamW(optimizer_grouped_parameters, lr=config.lr, eps=1e-8)
    ]
    total_steps = len(self.train_dataloader()) * 10

    def lr_lambda(current_step):
        # Linear warmup, then a cosine decay floored at 30% of the base learning rate.
        if current_step < config.warmup_steps:
            return float(current_step) / float(max(1, config.warmup_steps))
        current_step = min(total_steps, current_step)
        progress = float(current_step - config.warmup_steps) / float(
            max(1, total_steps - config.warmup_steps))
        return max(0.3, 0.5 * (1.0 + math.cos(math.pi * progress)))

    schedulers = [{
        'scheduler': LambdaLR(optimizers[0], lr_lambda),
        'interval': 'step'
    }]
    return optimizers, schedulers
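# Hedged illustration (not from the original source): the lr_lambda above scales the base
# learning rate linearly from 0 to 1 over the warmup steps, then follows a cosine decay
# floored at 0.3 of the base rate. The standalone sketch below reproduces that shape with
# hypothetical warmup_steps / total_steps values so the multiplier can be inspected without
# a LightningModule.
import math

warmup_steps, total_steps = 100, 1000  # assumed values for illustration

def lr_multiplier(current_step):
    if current_step < warmup_steps:
        return current_step / max(1, warmup_steps)
    current_step = min(total_steps, current_step)
    progress = (current_step - warmup_steps) / max(1, total_steps - warmup_steps)
    return max(0.3, 0.5 * (1.0 + math.cos(math.pi * progress)))

for step in (0, 50, 100, 500, 1000):
    print(step, round(lr_multiplier(step), 2))  # 0.0, 0.5, 1.0, ~0.59, 0.3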
def get_optimizer(
    model: nn.Module,
    learning_rate: float = 1e-5,
    adam_eps: float = 1e-8,
    weight_decay: float = 0.0,
) -> torch.optim.Optimizer:
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_eps)
    return optimizer
def get_optimizers(
    self, num_training_steps: int
) -> Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]:
    """
    Setup the optimizer and the learning rate scheduler.

    We provide a reasonable default that works well. If you want to use something else,
    you can pass a tuple in the Trainer's init, or override this method in a subclass.
    """
    if self.optimizers is not None:
        return self.optimizers
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in self.model.named_parameters()
                       if "relational_transformer" not in n and not any(nd in n for nd in no_decay)],
            "weight_decay": self.args.weight_decay,
        },
        {
            "params": [p for n, p in self.model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
        {
            # relational_transformer parameters get their own fixed learning rate.
            "params": [p for n, p in self.model.named_parameters()
                       if "relational_transformer" in n and not any(nd in n for nd in no_decay)],
            "weight_decay": self.args.weight_decay,
            "lr": 7e-5
        }
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=self.args.learning_rate, eps=self.args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=num_training_steps
    )
    return optimizer, scheduler
def configure_optimizers(self):
    optimizer = AdamW(self.parameters(), lr=self.args.learning_rate)
    scheduler = ExponentialLR(optimizer, gamma=0.9)
    # PyTorch Lightning expects the scheduler under the 'lr_scheduler' key;
    # a bare 'scheduler' key is not picked up.
    return {
        'optimizer': optimizer,
        'lr_scheduler': scheduler,
    }
def get_optimizers(
    self, num_training_steps: int
) -> Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]:
    if self.optimizers is not None:
        return self.optimizers
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [{
        "params": [
            p for n, p in self.model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": self.args.weight_decay,
    }, {
        "params": [
            p for n, p in self.model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0,
    }, {
        "params": self.fc.parameters(),
        "weight_decay": self.args.weight_decay
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=self.args.learning_rate, eps=self.args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=self.args.warmup_steps,
        num_training_steps=num_training_steps)
    return optimizer, scheduler
def configure_optimizers(self):
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in self.named_parameters()
                if not any(nd in n for nd in no_decay) and p.requires_grad
            ],
            "weight_decay": self.args.weight_decay,
        },
        {
            "params": [
                p for n, p in self.named_parameters()
                if any(nd in n for nd in no_decay) and p.requires_grad
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=self.args.lr, eps=self.args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=self.args.warmup_steps,
        num_training_steps=self.args.train_steps)
    return [optimizer], [{"scheduler": scheduler, "interval": "step"}]
def build_default_model(args):
    """
    Custom model builder.

    The contract is to return a (model, optimizer, scheduler) triple.
    """
    # -------- model --------
    model = load_pretrained_model(args)
    model.to(args.device)

    # -------- optimizer --------
    from transformers.optimization import AdamW
    optimizer_parameters = get_default_optimizer_parameters(model, args.weight_decay)
    optimizer = AdamW(optimizer_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon,
                      correct_bias=False)

    # -------- scheduler --------
    from transformers.optimization import get_linear_schedule_with_warmup
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.total_steps * args.warmup_rate,
        num_training_steps=args.total_steps)

    return model, optimizer, scheduler
def build_optimizer_scheduler(model, num_train_steps, learning_rate):
    optimizer = AdamW(model.parameters(), lr=learning_rate, correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=100, num_training_steps=num_train_steps)
    return optimizer, scheduler
def _get_optimizer(model, train_size, config):
    num_total_steps = int(train_size / config.train_batch_size * config.epoch_count)
    num_warmup_steps = int(num_total_steps * config.warm_up)
    optimizer = AdamW(model.parameters(), lr=config.learning_rate, correct_bias=False)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=num_warmup_steps,
                                     t_total=num_total_steps)
    return optimizer, scheduler
def __init__(self, model, state_key="last_hidden_state", lr=1e-5, clip=1.0, accumulate_gradients=1):
    super(BertClassifier, self).__init__()
    self.state_key = state_key
    self.model = model
    # self.hidden_size is expected to be provided elsewhere (e.g. as a class attribute
    # or set before this projection layer is built).
    self.proj_layer = torch.nn.Linear(self.hidden_size, 1)
    self.sigmoid = torch.nn.Sigmoid()
    # self.criterion = torch.nn.BCELoss(reduction="sum")
    self.criterion = torch.nn.BCELoss(reduction="mean")
    self.optimizer = AdamW(self.parameters(), lr=lr, weight_decay=0.01)
    self.clip = clip
    self.accumulate_gradients = accumulate_gradients
    self._batches_accumulated = 0
def train(self, epochs):
    """
    Runs the training.
    """
    pretrained_model = self.config.get("pretrained_mtb_model", None)
    pretrained_model = ("pretrained" if pretrained_model else "no_pretraining")
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in self.model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.01,
        },
        {
            "params": [
                p for n, p in self.model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.config.get("lr"))

    ovr_steps = (epochs * len(self.data_loader.train_generator)
                 * self.config.get("mini_batch_size") / self.config.get("batch_size"))
    scheduler = get_linear_schedule_with_warmup(optimizer, ovr_steps // 10, ovr_steps)

    results_path = os.path.join("results", "sem_eval", pretrained_model, str(epochs))
    best_model_path = os.path.join(self.checkpoint_dir, "best_model.pth.tar")
    resume = self.config.get("resume", False)
    if resume and os.path.exists(best_model_path):
        (
            self._start_epoch,
            self._best_test_f1,
            self._train_loss,
            self._train_acc,
            self._test_f1,
            self._train_acc,
        ) = self.load_best_model(self.checkpoint_dir)

    logger.info("Starting training process")
    pad_id = self.tokenizer.pad_token_id
    for epoch in range(self._start_epoch, epochs):
        self._train_epoch(epoch, pad_id, optimizer, scheduler)
        data = self._write_kpis(results_path)
        self._plot_results(data, results_path)
    logger.info("Finished Training.")
    return self.model
def create_optimizer_and_scheduler(self, num_training_steps: int):
    """
    Based on Transformers' default implementation, with an added layer-fixing option:
    the bottom n layers' parameters are fixed and only the top layers are further fine-tuned.
    """
    if self.optimizer is None:
        params = {}
        for n, p in self.model.named_parameters():
            if self.args.fix_layers > 0:
                if 'encoder.layer' in n:
                    try:
                        layer_num = int(n[n.find('encoder.layer') + 14:].split('.')[0])
                    except Exception:
                        print(n)
                        raise Exception("Could not parse layer number from parameter name")
                    if layer_num >= self.args.fix_layers:
                        print('yes', n)
                        params[n] = p
                    else:
                        print('no ', n)
                elif 'embeddings' in n:
                    print('no ', n)
                else:
                    print('yes', n)
                    params[n] = p
            else:
                params[n] = p
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in params.items()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": self.args.weight_decay,
            },
            {
                "params": [
                    p for n, p in params.items()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]
        self.optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=self.args.learning_rate,
            betas=(self.args.adam_beta1, self.args.adam_beta2),
            eps=self.args.adam_epsilon,
        )
    if self.lr_scheduler is None:
        self.lr_scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=self.args.warmup_steps,
            num_training_steps=num_training_steps)
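# Hedged illustration (not from the original source): how the fix_layers filter in
# create_optimizer_and_scheduler above behaves for typical Hugging Face BERT parameter
# names. fix_layers=6 and the names below are hypothetical values for illustration only;
# "excluded" means the parameter is left out of the optimizer groups and therefore never
# updated.
fix_layers = 6
for name in ["bert.embeddings.word_embeddings.weight",
             "bert.encoder.layer.3.attention.self.query.weight",
             "bert.encoder.layer.9.output.dense.weight",
             "classifier.weight"]:
    if "encoder.layer" in name:
        layer_num = int(name[name.find("encoder.layer") + 14:].split(".")[0])
        kept = layer_num >= fix_layers   # layers 0..5 excluded, 6 and up stay optimized
    elif "embeddings" in name:
        kept = False                     # embeddings are always excluded when fix_layers > 0
    else:
        kept = True                      # classifier head, pooler, etc. stay optimized
    print(name, "-> optimized" if kept else "-> excluded from optimizer")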
def train(model, train_iter, dev_iter, test_iter):
    starttime = time.time()
    model.train()
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=decay)
    total_batch = 0
    dev_best_loss = float("inf")
    last_improve = 0
    no_improve_flag = False
    model.train()
    for epoch in range(num_epochs):
        print("Epoch {}/{}".format(epoch + 1, num_epochs))
        for i, (X, y) in enumerate(train_iter):
            outputs = model(X)  # batch_size * num_classes
            model.zero_grad()
            loss = F.binary_cross_entropy(outputs, y)
            loss.backward()
            optimizer.step()
            if total_batch % 100 == 0:
                truelabels = torch.max(y.data, 1)[1].cpu()
                pred = torch.max(outputs, 1)[1].cpu()
                train_acc = metrics.accuracy_score(truelabels, pred)
                dev_acc, dev_loss = evaluate(model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ' '
                time_dif = get_time_dif(starttime)
                # Print training info; in the format spec, '>' right-aligns, the number sets
                # the field width, and '.n' the precision.
                msg = 'Iter:{0:>6}, Train Loss:{1:>5.2}, Train Acc:{2:>6.2}, Val Loss:{3:>5.2}, val Acc :{4:>6.2%}, Time:{5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss,
                                 dev_acc, time_dif, improve))
                model.train()
            total_batch += 1
            if total_batch - last_improve > early_stop_time:
                print("no improve after {} times, stop!".format(early_stop_time))
                no_improve_flag = True
                break
        if no_improve_flag:
            break
    test(model, test_iter)
def run():
    """Train the model."""
    # Set the logger.
    utils.set_logger(config.log_dir)
    logging.info("device: {}".format(config.device))
    # Process the data: separate texts and labels.
    processor = Processor(config)
    processor.process()
    logging.info("--------Process Done!--------")
    # Split off the dev set.
    word_train, word_dev, label_train, label_dev = load_dev('train')
    # Build datasets.
    train_dataset = NERDataset(word_train, label_train, config)
    dev_dataset = NERDataset(word_dev, label_dev, config)
    logging.info("--------Dataset Build!--------")
    # Get dataset size.
    train_size = len(train_dataset)
    # Build data loaders.
    train_loader = DataLoader(train_dataset, batch_size=config.batch_size,
                              shuffle=True, collate_fn=train_dataset.collate_fn)
    dev_loader = DataLoader(dev_dataset, batch_size=config.batch_size,
                            shuffle=True, collate_fn=dev_dataset.collate_fn)
    logging.info("--------Get Dataloader!--------")
    # Prepare model.
    device = config.device
    model = BertNER.from_pretrained(config.roberta_model, num_labels=len(config.label2id))
    model.to(device)
    # Prepare optimizer.
    if config.full_fine_tuning:
        # model.named_parameters(): [bert, classifier, crf]
        bert_optimizer = list(model.bert.named_parameters())
        classifier_optimizer = list(model.classifier.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in bert_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': config.weight_decay},
            {'params': [p for n, p in bert_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
            {'params': [p for n, p in classifier_optimizer if not any(nd in n for nd in no_decay)],
             'lr': config.learning_rate * 5, 'weight_decay': config.weight_decay},
            {'params': [p for n, p in classifier_optimizer if any(nd in n for nd in no_decay)],
             'lr': config.learning_rate * 5, 'weight_decay': 0.0},
            {'params': model.crf.parameters(), 'lr': config.learning_rate * 5}
        ]
    else:
        # Only fine-tune the head classifier.
        param_optimizer = list(model.classifier.named_parameters())
        optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer]}]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate, correct_bias=False)
    train_steps_per_epoch = train_size // config.batch_size
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=(config.epoch_num // 10) * train_steps_per_epoch,
        num_training_steps=config.epoch_num * train_steps_per_epoch)

    # Train the model.
    logging.info("--------Start Training!--------")
    train(train_loader, dev_loader, model, optimizer, scheduler, config.model_dir)