def setup_default_optimizer(self,
                            weight_decay: float = 0.0,
                            learning_rate: float = 5e-5,
                            adam_epsilon: float = 1e-8,
                            warmup_steps: int = 0,
                            total_steps: int = 0):
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in self.model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in self.model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    self.optimizer = AdamW(optimizer_grouped_parameters,
                           lr=learning_rate, eps=adam_epsilon)
    self.scheduler = WarmupLinearSchedule(self.optimizer,
                                          warmup_steps=warmup_steps,
                                          t_total=total_steps)
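# A minimal usage sketch for the setup above (illustration, not from the
# original source): `MyTrainer` and `dataloader` are hypothetical stand-ins
# for whatever class defines setup_default_optimizer. With
# WarmupLinearSchedule, scheduler.step() is called once per optimizer
# update, after optimizer.step().
trainer = MyTrainer()  # hypothetical owner of self.model / self.optimizer
trainer.setup_default_optimizer(weight_decay=0.01, learning_rate=5e-5,
                                warmup_steps=100, total_steps=1000)
for batch in dataloader:  # hypothetical DataLoader yielding dict batches
    loss = trainer.model(**batch)[0]
    loss.backward()
    trainer.optimizer.step()
    trainer.scheduler.step()
    trainer.optimizer.zero_grad()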
def train(conf: GPT2ChatbotConf):
    logger.info("make dirs")
    conf.save_model_dir = join(conf.output_dir, conf.save_model_dir)
    if not os.path.exists(conf.save_model_dir):
        os.makedirs(conf.save_model_dir)
    ###
    logger.info("get train data")
    tokenizer = BertTokenizer.from_pretrained(
        join(conf.data_model_dir, conf.pretrained_model_dir))
    train_data = LCCCDataGenerator(conf, tokenizer)
    ###
    logger.info("get pretrained model")
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = conf.device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = GPT2LMHeadModel.from_pretrained(
        join(conf.data_model_dir, conf.pretrained_model_dir)).to(device)
    model.train()
    # Set up the optimizer and use a warmup schedule at the start of training
    optimizer = AdamW(model.parameters(), lr=conf.lr, correct_bias=True)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=2000,
                                     t_total=train_data.steps * conf.epoch)
    ###
    logger.info("start train")
    for epoch in range(conf.epoch):
        for step, batch in enumerate(train_data):
            batch = {k: v.to(device) for k, v in batch.items()}
            batch["labels"] = batch["input_ids"]
            loss = model(**batch)[0]
            loss.backward()
            # Gradient clipping caps the gradient norm at a threshold to
            # guard against exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # Update parameters
            optimizer.step()
            # Clear accumulated gradients
            optimizer.zero_grad()
            # Advance the warmup/decay schedule
            scheduler.step()
            if step % conf.log_step == 0:
                logger.info("{}/{}:{}".format(step, train_data.steps, loss.data))
            if conf.save_step > 0 and step % conf.save_step == 0 and step > 0:
                save_dir = os.path.join(conf.save_model_dir,
                                        "{}-{}".format(epoch + 1, step))
                os.makedirs(save_dir)
                logger.info("save model to: {}".format(save_dir))
                model.save_pretrained(save_dir)
                tokenizer.save_pretrained(save_dir)
        if conf.save_step < 0:
            save_dir = os.path.join(conf.save_model_dir,
                                    "epoch-{}".format(epoch + 1))
            os.makedirs(save_dir)
            logger.info("save model to: {}".format(save_dir))
            model.save_pretrained(save_dir)
            tokenizer.save_pretrained(save_dir)
def train(args, train_dataset, model, tokenizer):
    """Train the model"""
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=args.train_batch_size)
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps \
            * args.num_train_epochs
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps,
                                     t_total=t_total)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    set_seed(args)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2],
                      'labels': batch[3]}
            outputs = model(**inputs)
            loss = outputs[0]
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule (after optimizer.step())
                model.zero_grad()
                global_step += 1
            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break
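# Worked example of the t_total arithmetic above (illustrative numbers only):
# with 1000 batches per epoch, gradient_accumulation_steps=4, and
# num_train_epochs=3, an optimizer update happens every 4 batches, so the
# schedule should decay to zero after 750 optimizer updates, not 3000 batches.
batches_per_epoch = 1000         # assumed value
gradient_accumulation_steps = 4  # assumed value
num_train_epochs = 3             # assumed value
t_total = batches_per_epoch // gradient_accumulation_steps * num_train_epochs
assert t_total == 750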
def train_bert(train_dataset, model, tokenizer, device, n_epochs=1,
               batch_size=10, learning_rate=4e-5, warmup_ratio=0.05, **kwargs):
    """
    Trains a BERT model on a train dataset.
    """
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=batch_size)
    t_total = len(train_dataloader) * n_epochs
    # A single parameter group; weight decay is disabled for all parameters
    optimizer_params = [{
        "params": [p for n, p in model.named_parameters()],
        "weight_decay": 0.0
    }]
    optimizer = AdamW(optimizer_params, lr=learning_rate, eps=1e-8)
    warmup_steps = int(warmup_ratio * t_total)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
                                     t_total=t_total)
    model.zero_grad()
    model.train()
    for _ in tqdm(range(n_epochs), desc="BERT Training"):
        for step, batch in enumerate(tqdm(train_dataloader, desc="Current Epoch")):
            batch = tuple(t.to(device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "labels": batch[3]
            }
            outputs = model(**inputs)
            loss = outputs[0]
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
def configure_optimizers(self):
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in self.model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': self.hparams.weight_decay},
        {'params': [p for n, p in self.model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=self.hparams.learning_rate,
                      eps=self.hparams.adam_eps)
    # Note: the positional -1 here is t_total; a real total step count is
    # needed for the post-warmup linear decay to be meaningful.
    scheduler = WarmupLinearSchedule(optimizer, self.hparams.warmup_steps, -1)
    return [optimizer], [scheduler]
def test_warmup_linear_scheduler(self):
    scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=2, t_total=10)
    lrs = unwrap_schedule(scheduler, self.num_steps)
    expected_learning_rates = [5.0, 10.0, 8.75, 7.5, 6.25, 5.0, 3.75, 2.5, 1.25, 0.0]
    self.assertEqual(len(lrs[0]), 1)
    self.assertListEqual([l[0] for l in lrs], expected_learning_rates)

    scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=2, t_total=10)
    lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
    self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])
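# The expected values in the test above follow from the schedule's linear
# ramp and decay. A self-contained sketch of the multiplier (assuming a base
# learning rate of 10.0, matching the test fixture; this mirrors the
# pytorch-transformers lr_lambda but is written here for illustration):
def linear_warmup_factor(step, warmup_steps=2, t_total=10):
    if step < warmup_steps:
        return float(step) / float(max(1, warmup_steps))
    return max(0.0, float(t_total - step) / float(max(1.0, t_total - warmup_steps)))

base_lr = 10.0  # assumed optimizer lr in the fixture
print([base_lr * linear_warmup_factor(s) for s in range(1, 11)])
# [5.0, 10.0, 8.75, 7.5, 6.25, 5.0, 3.75, 2.5, 1.25, 0.0]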
def get_optimizer(self, train, lr=5e-5, warmup=0.1, weight_decay=0.01):
    num_total_steps = self.get_num_train_steps(len(train), self.args.batch,
                                               self.args.epoch)
    # remove pooler
    param_optimizer = list(self.named_parameters())
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    # prepare optimizer and schedule (linear warmup and decay)
    optimizer = AdamW(optimizer_grouped_parameters, lr=lr, correct_bias=False)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=num_total_steps * warmup,
                                     t_total=num_total_steps)
    return optimizer, scheduler
def init_optimizer(self, num_total_steps):
    num_warmup_steps = int(self.args.warmup_proportion * num_total_steps)
    logger.info('warmup steps : %d' % num_warmup_steps)
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    # no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    param_optimizer = list(self.network.named_parameters())
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': self.args.weight_decay},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    self.optimizer = AdamW(optimizer_grouped_parameters,
                           lr=self.args.learning_rate,
                           correct_bias=False)
    self.scheduler = WarmupLinearSchedule(self.optimizer,
                                          warmup_steps=num_warmup_steps,
                                          t_total=num_total_steps)
def run(epoch, model, batch_size, trainData, valData, testData, id2label, w_padding):
    valResult = []
    testResult = []
    # LabelSmoothing(size=len(id2label), padding_idx=len(id2label), smoothing=0.0)
    t_total = (len(trainData[0]) // BATCH_SIZE + 1) * EPOCH
    criterion = LabelSmoothing(size=len(id2label), padding_idx=len(id2label),
                               smoothing=0.0)
    optimizer = AdamW(model.parameters(), lr=INIT_LEARNING_RATE, eps=ADAM_EPSILON)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=WARM_UP_STEPS,
                                     t_total=t_total)
    # model_opt = NoamOpt(HIDDEN_DIM, 1, WARM_UP_STEPS,
    #     torch.optim.Adam(model.parameters(), lr=INIT_LEARNING_RATE,
    #                      betas=(0.9, 0.98), eps=1e-9))
    for i in range(epoch):
        model.train()
        run_epoch(batchIter(trainData, batch_size, w_tag_pad=w_padding,
                            t_tag_pad=len(id2label)),
                  model,
                  SimpleLossCompute(criterion, optimizer, scheduler),
                  train=True)
        model.eval()
        print('Evaluation_val: epoch: %d' % (i))
        loss, f = run_epoch(batchIter(valData, batch_size, w_tag_pad=w_padding,
                                      t_tag_pad=len(id2label)),
                            model,
                            SimpleLossCompute(criterion, optimizer, scheduler),
                            train=False, id2label=id2label)
        print('Loss:', loss)
        valResult.append(f)
        print('Evaluation_test: epoch: %d' % (i))
        loss, f = run_epoch(batchIter(testData, batch_size, w_tag_pad=w_padding,
                                      t_tag_pad=len(id2label)),
                            model,
                            SimpleLossCompute(criterion, optimizer, scheduler),
                            train=False, id2label=id2label)
        print('Loss:', loss)
        testResult.append(f)
    valBest = max(valResult)
    print('ValBest epoch:', [i for i, j in enumerate(valResult) if j == valBest])
    testBest = max(testResult)
    print('TestBest epoch:', [i for i, j in enumerate(testResult) if j == testBest])
def get_Adam_optim_v2(config, model):
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}]
    # The per-group weight_decay values above override AdamW's default,
    # so no global weight_decay kwarg is needed here.
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=config.TRAIN.lr_base,
                      correct_bias=False)
    # WarmupLinearSchedule takes warmup_steps/t_total; the original call used
    # the num_warmup_steps/num_training_steps names of the newer transformers API.
    num_training_steps = config.TRAIN.num_train_optimization_steps
    scheduler = WarmupLinearSchedule(
        optimizer,
        warmup_steps=int(config.TRAIN.warmup_proportion * num_training_steps),
        t_total=num_training_steps)
    return optimizer, scheduler
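# If the newer API was actually intended, a sketch of the equivalent setup on
# transformers >= 2.x, assuming the same `config` object and `optimizer`:
from transformers import get_linear_schedule_with_warmup

num_training_steps = config.TRAIN.num_train_optimization_steps
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(config.TRAIN.warmup_proportion * num_training_steps),
    num_training_steps=num_training_steps)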
def _get_scheduler(self, optimizer):
    """Get scheduler for adjusting learning rate. """
    if self.args.scheduler == 'warmup':
        # Note: t_total should be the total number of optimizer steps;
        # passing num_epochs only makes sense if the scheduler is stepped
        # once per epoch rather than once per batch.
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=self.args.warmup_steps,
                                         t_total=self.args.num_epochs)
    elif self.args.scheduler == 'exponential':
        scheduler = ExponentialLR(optimizer, 0.95)
    return scheduler
def setup_opt(args, model):
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    if args.adam_betas is not None:
        adam_betas = tuple(float(_f) for _f in args.adam_betas.split(","))
        assert len(adam_betas) == 2
    else:
        adam_betas = (0.9, 0.999)
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate,
                      betas=adam_betas, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps,
                                     t_total=args.t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from "
                              "https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=args.fp16_opt_level)
    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank,
            find_unused_parameters=True)
    return model, optimizer, scheduler
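# setup_opt above only initializes apex amp for fp16; the matching backward
# pass (the same pattern appears verbatim later in this file) scales the loss
# and clips the fp32 master params instead of model.parameters(). A minimal
# per-step sketch, assuming `loss` plus the objects returned by setup_opt and
# `amp` imported from apex when args.fp16 is set:
if args.fp16:
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
else:
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
optimizer.step()
scheduler.step()
model.zero_grad()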
def init_optimizer(self, num_total_steps):
    """
    `args.warmup_proportion`: linear warmup over the first
        warmup_proportion * num_total_steps optimizer steps.
    `named_parameters()`: yields both the name of each parameter and the
        parameter itself.
    """
    num_warmup_steps = int(self.args.warmup_proportion * num_total_steps)
    logger.info('warmup steps : %d' % num_warmup_steps)
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    # no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    param_optimizer = list(self.network.named_parameters())
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': self.args.weight_decay},  # weight_decay default=0.01
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    # `learning_rate` default=5e-5
    self.optimizer = AdamW(optimizer_grouped_parameters,
                           lr=self.args.learning_rate,
                           correct_bias=False)
    self.scheduler = WarmupLinearSchedule(self.optimizer,
                                          warmup_steps=num_warmup_steps,
                                          t_total=num_total_steps)
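# Worked example of the warmup arithmetic above (illustrative numbers only):
# warmup_proportion=0.1 over num_total_steps=2000 ramps the learning rate
# from 0 to args.learning_rate during the first 200 optimizer updates, then
# decays it linearly to 0 at step 2000.
num_warmup_steps = int(0.1 * 2000)
assert num_warmup_steps == 200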
def init_optimizer(self, model, lr, t_total, fixed=None):
    args = self.args
    no_decay = ['bias', 'LayerNorm.weight']
    if fixed is None:
        fixed = []
    optimizer_grouped_parameters = [
        {"params": [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                    and not any(f in n for f in fixed)],
         "weight_decay": args.weight_decay},
        {"params": [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)
                    and not any(f in n for f in fixed)],
         "weight_decay": 0.0}
    ]
    # TODO calculate t_total
    optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=args.adam_epsilon)
    if args.scheduler == "linear":
        warmup_steps = t_total * args.warmup_ratio if args.warmup_steps == -1 else args.warmup_steps
        logger.info("Setting scheduler, warmups=%d, lr=%.7f, total_updates=%d"
                    % (warmup_steps, lr, t_total))
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
                                         t_total=t_total)
    elif args.scheduler == "constant":
        logger.info("Setting scheduler, ConstantLRSchedule")
        scheduler = ConstantLRSchedule(optimizer)
    else:
        raise ValueError
    return optimizer_grouped_parameters, optimizer, scheduler
def run(epoch, model, batch_size, trainData, valData, testData, tokenizer):
    valResult = []
    testResult = []
    t_total = (((len(trainData[0]) - 1) // batch_size) + 1) * epoch
    optimizer = AdamW(model.parameters(), lr=INIT_LEARNING_RATE, eps=ADAM_EPSILON)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=WARMUP, t_total=t_total)
    lc = SimpleLossCompute(optimizer, scheduler)
    for i in range(epoch):
        model.train()
        run_epoch(batchIter(trainData, batch_size, tokenizer), model, lc, train=True)
        model.eval()
        print('Evaluation_val: epoch: %d' % (i))
        loss, f = run_epoch(batchIter(valData, batch_size, tokenizer), model, lc,
                            train=False)
        print('Loss:', loss)
        valResult.append(f)
        print('Evaluation_test: epoch: %d' % (i))
        loss, f = run_epoch(batchIter(testData, batch_size, tokenizer), model, lc,
                            train=False)
        print('Loss:', loss)
        testResult.append(f)
    valBest = max(valResult)
    print('ValBest epoch:', [i for i, j in enumerate(valResult) if j == valBest])
    testBest = max(testResult)
    print('TestBest epoch:', [i for i, j in enumerate(testResult) if j == testBest])
def run_train(args):
    # --------- data
    processor = BertProcessor(vocab_path=config['bert_vocab_path'],
                              do_lower_case=args.do_lower_case)
    label_list = processor.get_labels()
    label2id = {label: i for i, label in enumerate(label_list)}
    id2label = {i: label for i, label in enumerate(label_list)}
    train_data = processor.get_train(config['data_dir'] / f"{args.data_name}.label_train.pkl")
    print("Train data is:")
    print(train_data)
    train_examples = processor.create_examples(
        lines=train_data,
        example_type='train',
        cached_examples_file=config['data_cache'] /
        f"cached_train_label_examples_finetune{args.arch}")
    # print("Training examples are:")
    # print(train_examples)
    train_features = processor.create_features(
        examples=train_examples,
        max_seq_len=args.train_max_seq_len,
        cached_features_file=config['data_cache'] /
        "cached_train_label_features_finetune{}_{}".format(
            args.train_max_seq_len, args.arch))
    train_dataset = processor.create_dataset(train_features, is_sorted=args.sorted)
    if args.sorted:
        train_sampler = SequentialSampler(train_dataset)
    else:
        train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=args.train_batch_size)
    valid_data = processor.get_dev(config['data_dir'] / f"{args.data_name}.label_valid.pkl")
    valid_examples = processor.create_examples(
        lines=valid_data,
        example_type='valid',
        cached_examples_file=config['data_cache'] /
        f"cached_valid_examples_label_finetune{args.arch}")
    valid_features = processor.create_features(
        examples=valid_examples,
        max_seq_len=args.eval_max_seq_len,
        cached_features_file=config['data_cache'] /
        "cached_valid_features_label_finetune{}_{}".format(
            args.eval_max_seq_len, args.arch))
    valid_dataset = processor.create_dataset(valid_features)
    valid_sampler = SequentialSampler(valid_dataset)
    valid_dataloader = DataLoader(valid_dataset, sampler=valid_sampler,
                                  batch_size=args.eval_batch_size)
    # ------- model
    logger.info("initializing model")
    if args.resume_path:
        args.resume_path = Path(args.resume_path)
        model = BertForMultiLable.from_pretrained(args.resume_path,
                                                  num_labels=len(label_list))
    else:
        print("Labels are:")
        print(label_list)
        # model = BertForMultiLable.from_pretrained(config['bert_model_dir'],
        #                                           num_labels=len(label_list))
        model = BertForMultiLable.from_pretrained("bert-base-uncased",
                                                  num_labels=len(label_list))
    t_total = int(len(train_dataloader) / args.gradient_accumulation_steps * args.epochs)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    warmup_steps = int(t_total * args.warmup_proportion)
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate,
                      eps=args.adam_epsilon)
    lr_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
                                        t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from "
                              "https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=args.fp16_opt_level)
    # ---- callbacks
    logger.info("initializing callbacks")
    train_monitor = TrainingMonitor(file_dir=config['figure_dir'], arch=args.arch)
    model_checkpoint = ModelCheckpoint(checkpoint_dir=config['checkpoint_dir'],
                                       mode=args.mode,
                                       monitor=args.monitor,
                                       arch=args.arch,
                                       save_best_only=args.save_best)
    # **************************** training model ***********************
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Num Epochs = %d", args.epochs)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    trainer = Trainer(
        n_gpu=args.n_gpu,
        model=model,
        epochs=args.epochs,
        logger=logger,
        criterion=BCEWithLogLoss(),
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        early_stopping=None,
        training_monitor=train_monitor,
        fp16=args.fp16,
        resume_path=args.resume_path,
        grad_clip=args.grad_clip,
        model_checkpoint=model_checkpoint,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        batch_metrics=[AccuracyThresh(thresh=0.5)],
        epoch_metrics=[AUC(average='micro', task_type='binary'),
                       MultiLabelReport(id2label=id2label)])
    # embeddings_dict = pickle.load(open("/home/rgaonkar/context_home/rgaonkar/label_embeddings/code/Bert_Masked_LM/label_embeddings_dict.p", "rb"))
    # label_similarity_matrix = get_label_similarity_matrix(embeddings_dict, label_list)
    trainer.train(train_data=train_dataloader, valid_data=valid_dataloader,
                  seed=args.seed)
        for j in range(len(tmp)):
            if not isinstance(tmp[j], float):
                tmp[j] = 0.
        outputs.append(ss.rankdata(tmp))
        idxs.append(i)
        col_mask[i] = 1
    return outputs, idxs, col_mask


if args.do_train:
    with open('data/train_examples.json') as f:
        examples = json.load(f)
    optimizer = AdamW(model.parameters(), lr=args.lr_default, eps=1e-8)
    t_total = len(examples) * args.epochs
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=1000, t_total=t_total)
    cross_entropy = torch.nn.CrossEntropyLoss()
    files = list(examples.keys())
    for epoch_ in range(args.epochs):
        random.shuffle(files)
        for f_idx, f in enumerate(files):
            table = pandas.read_csv('all_csv/{}'.format(f), '#')
            cols = table.columns
            model.zero_grad()
            optimizer.zero_grad()
            texts = paralell_table(table)
# %%time
take = 6
# init model
print("initializing model...")
model = MutiLabelModel(encoder, 768, take)
pos_weight = torch.FloatTensor([2., 2.25, 2., 2.87, 4., 8.7]).to(device)
num_total_steps = int(np.ceil(len(train_x) / batch_size) * epochs)
# Warm up over the first half of training (an unusually large ratio)
num_warmup_steps = int(num_total_steps * 0.5)
# To reproduce BertAdam-specific behavior, set correct_bias=False
optim = AdamW(model.parameters(), lr=lr, correct_bias=False)
scheduler = WarmupLinearSchedule(optim, warmup_steps=num_warmup_steps,
                                 t_total=num_total_steps)
model = model.to(device)
# model.load_state_dict(torch.load("/tmp2/r08922010/aicup/model/task1/model_{}_state".format(seed), map_location=device))
# torch.save(model.encoder.state_dict(), "/tmp2/r08922010/aicup/model/task1/encoder_{}_state".format(seed))
thd = 1 / take if softmax else 0.7
thd2 = 0.6
thrld = np.ones((1, take)) * thd
outs = [[] for _ in range(take)]
dev_micro_f1 = -1
# train and validation
def train(train_dataset, model):
    """ Train the model """
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, collate_fn=collate_fn,
                                  sampler=train_sampler,
                                  batch_size=train_batch_size)
    t_total = len(train_dataloader) // 1 * 3
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        # Note: weight_decay is 0.0 for both groups here, so the
        # decay/no-decay split is effectively a no-op as written
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate,
                      eps=adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=0, t_total=t_total)
    # Train!
    print("***** Running training *****")
    print("  Num examples = %d" % len(train_dataset))
    print("  Num Epochs = %d" % 5)
    print("  Instantaneous batch size per GPU = %d" % 8)
    print("  Gradient Accumulation steps = %d" % 1)
    print("  Total optimization steps = %d" % t_total)
    model.zero_grad()
    for e in range(5):
        global_step = 0
        tr_loss, logging_loss = 0.0, 0.0
        for batch in tqdm(train_dataloader):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'labels': batch[3]
            }
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
            loss.backward()
            tr_loss += loss.item()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            global_step += 1
            print(loss.item())
        print(e, global_step, tr_loss / global_step)
    return global_step, tr_loss / global_step
        config['weight_decay']
}, {
    'params': [p for n, p in model.named_parameters()
               if any(nd in n for nd in no_decay)],
    'weight_decay': 0.0
}]
t_total = len(data_loader) // config['gradient_accumulation_steps'] \
    * config['num_train_epochs']
optimizer = AdamW(optimizer_grouped_parameters, lr=config['learning_rate'],
                  eps=config['adam_epsilon'])
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=config['warmup_steps'],
                                 t_total=t_total)
model.train()
# batch_size = config['train_batch_size']
last_training_loss = 10000000000000000
for epoch in range(config['num_train_epochs']):
    training_loss = 0
    for step, batch in tqdm(enumerate(data_loader), total=len(data_loader),
                            desc='training'):
        # for item in batch:
        token_id, token_type, mask, label = batch
        size = token_id[0].shape[0]
        # token_ids.append(token_id)
        # token_types.append(token_type)
def train(args):
    print(args)
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available() and args.cuda:
        torch.cuda.manual_seed(args.seed)
    model_path = os.path.join(args.save_dir, 'model.pt')
    check_path(model_path)

    ###################################################################################################
    #   Load data                                                                                     #
    ###################################################################################################
    device = torch.device("cuda:0" if torch.cuda.is_available() and args.cuda else "cpu")
    dataset = LMDataLoader(args.train_statements, args.dev_statements,
                           args.test_statements,
                           batch_size=args.batch_size,
                           eval_batch_size=args.eval_batch_size,
                           device=device,
                           model_name=args.encoder,
                           max_seq_length=args.max_seq_len,
                           is_inhouse=args.inhouse,
                           inhouse_train_qids_path=args.inhouse_train_qids,
                           subsample=args.subsample)

    ###################################################################################################
    #   Build model                                                                                   #
    ###################################################################################################
    lstm_config = get_lstm_config_from_args(args)
    model = LMForMultipleChoice(args.encoder,
                                from_checkpoint=args.from_checkpoint,
                                encoder_config=lstm_config)
    try:
        model.to(device)
    except RuntimeError as e:
        print(e)
        print('best dev acc: 0.0 (at epoch 0)')
        print('final test acc: 0.0')
        print()
        return

    no_decay = ['bias', 'LayerNorm.weight']
    grouped_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'lr': args.encoder_lr, 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'lr': args.encoder_lr, 'weight_decay': 0.0}
    ]
    optimizer = OPTIMIZER_CLASSES[args.optim](grouped_parameters)
    if args.lr_schedule == 'fixed':
        scheduler = ConstantLRSchedule(optimizer)
    elif args.lr_schedule == 'warmup_constant':
        scheduler = WarmupConstantSchedule(optimizer, warmup_steps=args.warmup_steps)
    elif args.lr_schedule == 'warmup_linear':
        max_steps = int(args.n_epochs * (dataset.train_size() / args.batch_size))
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps,
                                         t_total=max_steps)
    if args.loss == 'margin_rank':
        loss_func = nn.MarginRankingLoss(margin=0.1, reduction='mean')
    elif args.loss == 'cross_entropy':
        loss_func = nn.CrossEntropyLoss(reduction='mean')

    ###################################################################################################
    #   Training                                                                                      #
    ###################################################################################################
    print()
    print('***** running training *****')
    print(f'| batch_size: {args.batch_size} | num_epochs: {args.n_epochs} | num_train: {dataset.train_size()} |'
          f' num_dev: {dataset.dev_size()} | num_test: {dataset.test_size()}')
    global_step = 0
    best_dev_acc = 0
    best_dev_epoch = 0
    final_test_acc = 0
    try:
        for epoch in range(int(args.n_epochs)):
            model.train()
            tqdm_bar = tqdm(dataset.train(), desc="Training")
            for qids, labels, *input_data in tqdm_bar:
                optimizer.zero_grad()
                batch_loss = 0
                bs = labels.size(0)
                for a in range(0, bs, args.mini_batch_size):
                    b = min(a + args.mini_batch_size, bs)
                    logits = model(*[x[a:b] for x in input_data],
                                   layer_id=args.encoder_layer)
                    if args.loss == 'margin_rank':
                        num_choice = logits.size(1)
                        flat_logits = logits.view(-1)
                        # of length batch_size*num_choice
                        # (note: built from the full labels tensor, not labels[a:b])
                        correct_mask = F.one_hot(labels, num_classes=num_choice).view(-1)
                        # of length batch_size*(num_choice-1)
                        correct_logits = flat_logits[correct_mask == 1].contiguous() \
                            .view(-1, 1).expand(-1, num_choice - 1).contiguous().view(-1)
                        # of length batch_size*(num_choice-1)
                        wrong_logits = flat_logits[correct_mask == 0]
                        y = wrong_logits.new_ones((wrong_logits.size(0),))
                        loss = loss_func(correct_logits, wrong_logits, y)  # margin ranking loss
                    elif args.loss == 'cross_entropy':
                        loss = loss_func(logits, labels[a:b])
                    loss = loss * (b - a) / bs
                    loss.backward()
                    batch_loss += loss.item()
                if args.max_grad_norm > 0:
                    nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()
                tqdm_bar.desc = "loss: {:.2e}  lr: {:.2e}".format(batch_loss, scheduler.get_lr()[0])
                global_step += 1

            model.eval()
            dev_acc = evaluate_accuracy(dataset.dev(), model)
            test_acc = evaluate_accuracy(dataset.test(), model) if dataset.test_size() > 0 else 0.0
            if dev_acc > best_dev_acc:
                final_test_acc = test_acc
                best_dev_acc = dev_acc
                best_dev_epoch = epoch
                torch.save([model, args], model_path)
            print('| epoch {:5} | dev_acc {:7.4f} | test_acc {:7.4f} |'.format(epoch, dev_acc, test_acc))
            if epoch - best_dev_epoch >= args.max_epochs_before_stop:
                break
    except (KeyboardInterrupt, RuntimeError) as e:
        print(e)

    print('***** training ends *****')
    print()
    print('training ends in {} steps'.format(global_step))
    print('best dev acc: {:.4f} (at epoch {})'.format(best_dev_acc, best_dev_epoch))
    print('final test acc: {:.4f}'.format(final_test_acc))
    print()
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay': 0.01
}, {
    'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
    'weight_decay': 0.0
}]
num_train_optimization_steps = int(n_epochs * train_size / batch_size / accumulation_steps)
num_warmup_steps = int(num_train_optimization_steps * warmup)
optimizer = AdamW(optimizer_grouped_parameters, lr=lr, correct_bias=False)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps,
                                 t_total=num_train_optimization_steps)
model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)
model.zero_grad()
model = model.train()
tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)
convert_func = functools.partial(convert_data,
                                 tokenizer=tokenizer,
                                 max_seq_len=max_seq_len,
                                 max_question_len=max_question_len,
def main():
    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, "
                             "bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    ## Other parameters
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization.\n"
                             "Sequences longer than this will be truncated, and sequences shorter\n"
                             "than this will be padded.")
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval or not.")
    parser.add_argument("--eval_on", default="dev",
                        help="Whether to run eval on the dev set or test set.")
    parser.add_argument("--do_lower_case", action="store_true",
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--weight_decay", default=0.01, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument("--no_cuda", action="store_true",
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--fp16", action="store_true",
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument("--fp16_opt_level", type=str, default="O1",
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--loss_scale", type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--server_ip", type=str, default="",
                        help="Can be used for distant debugging.")
    parser.add_argument("--server_port", type=str, default="",
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {"ner": NerProcessor}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend="nccl")
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    label_list = processor.get_labels()
    num_labels = len(label_list) + 1

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = 0
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps
        ) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    # Prepare model
    config = BertConfig.from_pretrained(args.bert_model, num_labels=num_labels,
                                        finetuning_task=args.task_name)
    model = Ner.from_pretrained(args.bert_model, from_tf=False, config=config)

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         "weight_decay": args.weight_decay},
        {"params": [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
    ]
    warmup_steps = int(args.warmup_proportion * num_train_optimization_steps)
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
                                     t_total=num_train_optimization_steps)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from "
                              "https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank,
            find_unused_parameters=True)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    label_map = {i: label for i, label in enumerate(label_list, 1)}
    if args.do_train:
        train_features = convert_examples_to_features(train_examples, label_list,
                                                      args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in train_features], dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                   all_label_ids, all_valid_ids, all_lmask_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids,
                             valid_ids, l_mask)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                                   args.max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, "module") else model  # Only save the model it-self
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        model_config = {
            "bert_model": args.bert_model,
            "do_lower": args.do_lower_case,
            "max_seq_length": args.max_seq_length,
            "num_labels": len(label_list) + 1,
            "label_map": label_map,
        }
        json.dump(model_config,
                  open(os.path.join(args.output_dir, "model_config.json"), "w"))
        # Load a trained model and config that you have fine-tuned
    else:
        # Load a trained model and vocabulary that you have fine-tuned
        model = Ner.from_pretrained(args.output_dir)
        tokenizer = BertTokenizer.from_pretrained(args.output_dir,
                                                  do_lower_case=args.do_lower_case)
    model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        if args.eval_on == "dev":
            eval_examples = processor.get_dev_examples(args.data_dir)
        elif args.eval_on == "test":
            eval_examples = processor.get_test_examples(args.data_dir)
        else:
            raise ValueError("eval on dev or test set only")
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in eval_features], dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                  all_label_ids, all_valid_ids, all_lmask_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        y_true = []
        y_pred = []
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        for input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            valid_ids = valid_ids.to(device)
            label_ids = label_ids.to(device)
            l_mask = l_mask.to(device)
            with torch.no_grad():
                logits = model(input_ids, segment_ids, input_mask,
                               valid_ids=valid_ids, attention_mask_label=l_mask)
            logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to("cpu").numpy()
            input_mask = input_mask.to("cpu").numpy()
            for i, label in enumerate(label_ids):
                temp_1 = []
                temp_2 = []
                for j, m in enumerate(label):
                    if j == 0:
                        continue
                    elif label_ids[i][j] == len(label_map):
                        y_true.append(temp_1)
                        y_pred.append(temp_2)
                        break
                    else:
                        temp_1.append(label_map[label_ids[i][j]])
                        temp_2.append(label_map[logits[i][j]])
        report = classification_report(y_true, y_pred, digits=4)
        logger.info("\n%s", report)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            logger.info("\n%s", report)
            writer.write(report)
def trains(args, train_dataset, eval_dataset, model):
    # Randomly sample the training data
    train_sampler = RandomSampler(train_dataset)
    # Wrap the training data in a DataLoader
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=args.train_batch_size)
    # gradient_accumulation_steps trades extra steps for lower GPU memory use
    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    no_decay = ['bias', 'LayerNorm.weight', 'transitions']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps,
                                     t_total=t_total)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    set_seed(args)
    best_acc = 0.
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2],
                      'labels': batch[3]}
            # code.interact(local=locals())
            outputs = model(**inputs)
            # loss = [1], logits = [batch_size, 2]
            loss, logits = outputs[0], outputs[1]
            # code.interact(local=locals())
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            logging_loss += loss.item()
            tr_loss += loss.item()
            if 0 == (step + 1) % args.gradient_accumulation_steps:
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1
                logger.info("EPOCH = [%d/%d] global_step = %d loss = %f",
                            _ + 1, args.num_train_epochs, global_step, logging_loss)
                logging_loss = 0.0
                # if (global_step < 100 and global_step % 10 == 0) or (global_step % 50 == 0):
                # Evaluate frequently during early steps, then every 100/200 steps
                if (global_step % 5 == 0 and global_step <= 100) \
                        or (global_step % 100 == 0 and global_step < 1000) \
                        or (global_step % 200 == 0):
                    best_acc = evaluate_and_save_model(args, model, eval_dataset,
                                                       _, global_step, best_acc)
    best_acc = evaluate_and_save_model(args, model, eval_dataset, _, global_step, best_acc)
def trains(args, train_dataset, eval_dataset, model):
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=args.train_batch_size)
    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    no_decay = ['bias', 'LayerNorm.weight', 'transitions']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps,
                                     t_total=t_total)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    set_seed(args)
    best_f1 = 0.
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2],
                      'tags': batch[3],
                      'decode': True}
            outputs = model(**inputs)
            loss, pre_tag = outputs[0], outputs[1]
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            logging_loss += loss.item()
            tr_loss += loss.item()
            if 0 == (step + 1) % args.gradient_accumulation_steps:
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1
                logger.info("EPOCH = [%d/%d] global_step = %d loss = %f",
                            _ + 1, args.num_train_epochs, global_step, logging_loss)
                logging_loss = 0.0
                # Evaluate F1 every 100 steps
                if global_step % 100 == 0:
                    best_f1 = evaluate_and_save_model(args, model, eval_dataset,
                                                      _, global_step, best_f1)
    # Evaluate F1 one final time
    best_f1 = evaluate_and_save_model(args, model, eval_dataset, _, global_step, best_f1)
def train(args): random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if torch.cuda.is_available() and args.cuda: torch.cuda.manual_seed(args.seed) print('configuration:') print('\n'.join('\t{:15} {}'.format(k + ':', str(v)) for k, v in sorted(dict(vars(args)).items()))) print() config_path = os.path.join(args.save_dir, 'config.json') model_path = os.path.join(args.save_dir, 'model.pt') log_path = os.path.join(args.save_dir, 'log.csv') if args.save: export_config(args, config_path) check_path(model_path) with open(log_path, 'w') as fout: fout.write('step,train_acc,dev_acc\n') ################################################################################################### # Load data # ################################################################################################### cp_emb = [np.load(path) for path in args.ent_emb_paths] cp_emb = torch.tensor(np.concatenate(cp_emb, 1)) concept_num, concept_dim = cp_emb.size(0), cp_emb.size(1) print('num_concepts: {}, concept_dim: {}'.format(concept_num, concept_dim)) device = torch.device( "cuda:0" if torch.cuda.is_available() and args.cuda else "cpu") dataset = GconAttnDataLoader( train_statement_path=args.train_statements, train_concept_jsonl=args.train_concepts, dev_statement_path=args.dev_statements, dev_concept_jsonl=args.dev_concepts, test_statement_path=args.test_statements, test_concept_jsonl=args.test_concepts, concept2id_path=args.cpnet_vocab_path, batch_size=args.batch_size, eval_batch_size=args.eval_batch_size, device=device, model_name=args.encoder, max_cpt_num=max_cpt_num[args.dataset], max_seq_length=args.max_seq_len, is_inhouse=args.inhouse, inhouse_train_qids_path=args.inhouse_train_qids, subsample=args.subsample, format=args.format) print('len(train_set): {} len(dev_set): {} len(test_set): {}'.format( dataset.train_size(), dataset.dev_size(), dataset.test_size())) print() ################################################################################################### # Build model # ################################################################################################### lstm_config = get_lstm_config_from_args(args) model = LMGconAttn(model_name=args.encoder, concept_num=concept_num, concept_dim=args.cpt_out_dim, concept_in_dim=concept_dim, freeze_ent_emb=args.freeze_ent_emb, pretrained_concept_emb=cp_emb, hidden_dim=args.decoder_hidden_dim, dropout=args.dropoutm, encoder_config=lstm_config) if args.freeze_ent_emb: freeze_net(model.decoder.concept_emb) try: model.to(device) except RuntimeError as e: print(e) print('best dev acc: 0.0 (at epoch 0)') print('final test acc: 0.0') print() return no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] grouped_parameters = [ { 'params': [ p for n, p in model.encoder.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay, 'lr': args.encoder_lr }, { 'params': [ p for n, p in model.encoder.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0, 'lr': args.encoder_lr }, { 'params': [ p for n, p in model.decoder.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay, 'lr': args.decoder_lr }, { 'params': [ p for n, p in model.decoder.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0, 'lr': args.decoder_lr }, ] optimizer = OPTIMIZER_CLASSES[args.optim](grouped_parameters) if args.lr_schedule == 'fixed': scheduler = ConstantLRSchedule(optimizer) elif args.lr_schedule == 'warmup_constant': scheduler = 
WarmupConstantSchedule(optimizer, warmup_steps=args.warmup_steps)
elif args.lr_schedule == 'warmup_linear':
    max_steps = int(args.n_epochs * (dataset.train_size() / args.batch_size))
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=max_steps)

print('parameters:')
for name, param in model.decoder.named_parameters():
    if param.requires_grad:
        print('\t{:45}\ttrainable\t{}'.format(name, param.size()))
    else:
        print('\t{:45}\tfixed\t{}'.format(name, param.size()))
num_params = sum(p.numel() for p in model.decoder.parameters() if p.requires_grad)
print('\ttotal:', num_params)

if args.loss == 'margin_rank':
    loss_func = nn.MarginRankingLoss(margin=0.1, reduction='mean')
elif args.loss == 'cross_entropy':
    loss_func = nn.CrossEntropyLoss(reduction='mean')

###################################################################################################
#                                            Training                                             #
###################################################################################################

print('-' * 71)
global_step, best_dev_epoch = 0, 0
best_dev_acc, final_test_acc, total_loss = 0.0, 0.0, 0.0
start_time = time.time()
model.train()
freeze_net(model.encoder)
try:
    for epoch_id in range(args.n_epochs):
        if epoch_id == args.unfreeze_epoch:
            unfreeze_net(model.encoder)
        if epoch_id == args.refreeze_epoch:
            freeze_net(model.encoder)
        model.train()
        for qids, labels, *input_data in dataset.train():
            optimizer.zero_grad()
            bs = labels.size(0)
            # process the batch in mini-batches to fit in memory
            for a in range(0, bs, args.mini_batch_size):
                b = min(a + args.mini_batch_size, bs)
                logits, _ = model(*[x[a:b] for x in input_data], layer_id=args.encoder_layer)

                if args.loss == 'margin_rank':
                    num_choice = logits.size(1)
                    flat_logits = logits.view(-1)
                    # one-hot mask over the flattened mini-batch logits; labels must be sliced
                    # to [a:b] to match logits (the cross_entropy branch already does this)
                    correct_mask = F.one_hot(labels[a:b], num_classes=num_choice).view(-1)
                    # repeat each correct logit (num_choice - 1) times so it is paired
                    # against every wrong logit of the same question
                    correct_logits = flat_logits[correct_mask == 1].contiguous().view(-1, 1).expand(-1, num_choice - 1).contiguous().view(-1)
                    wrong_logits = flat_logits[correct_mask == 0]  # length (b - a) * (num_choice - 1)
                    y = wrong_logits.new_ones((wrong_logits.size(0),))
                    loss = loss_func(correct_logits, wrong_logits, y)  # margin ranking loss
                elif args.loss == 'cross_entropy':
                    loss = loss_func(logits, labels[a:b])
                loss = loss * (b - a) / bs  # re-weight so mini-batch losses sum to the full-batch mean
                loss.backward()
                total_loss += loss.item()
            if args.max_grad_norm > 0:
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            # step the optimizer before the scheduler (required ordering since PyTorch 1.1)
            optimizer.step()
            scheduler.step()

            if (global_step + 1) % args.log_interval == 0:
                total_loss /= args.log_interval
                ms_per_batch = 1000 * (time.time() - start_time) / args.log_interval
                print('| step {:5} | lr: {:9.7f} | loss {:7.4f} | ms/batch {:7.2f} |'.format(
                    global_step, scheduler.get_lr()[0], total_loss, ms_per_batch))
                total_loss = 0
                start_time = time.time()
            global_step += 1

        model.eval()
        dev_acc = evaluate_accuracy(dataset.dev(), model)
        test_acc = evaluate_accuracy(dataset.test(), model) if args.test_statements else 0.0
        print('-' * 71)
        print('| step {:5} | dev_acc {:7.4f} | test_acc {:7.4f} |'.format(global_step, dev_acc, test_acc))
        print('-' * 71)
        if args.save:
            with open(log_path, 'a') as fout:
                fout.write('{},{},{}\n'.format(global_step, dev_acc, test_acc))
        if dev_acc >= best_dev_acc:
            best_dev_acc = dev_acc
            final_test_acc = test_acc
            best_dev_epoch = epoch_id
            if args.save:
                torch.save([model, args], model_path)
                print(f'model saved to {model_path}')
        model.train()
        start_time = time.time()
        # early stopping once dev accuracy stops improving after the encoder is unfrozen
        if epoch_id > args.unfreeze_epoch and epoch_id - best_dev_epoch >= args.max_epochs_before_stop:
            break
except (KeyboardInterrupt, RuntimeError) as e:
    print(e)

print()
print('training ends in {} steps'.format(global_step))
print('best dev acc: {:.4f} (at epoch {})'.format(best_dev_acc, best_dev_epoch))
print('final test acc: {:.4f}'.format(final_test_acc))
print()
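# ---------------------------------------------------------------------------------
# A minimal, self-contained sketch of the margin-ranking pairing used above, with
# made-up shapes (2 questions, num_choice=3); all names and values here are
# illustrative. nn.MarginRankingLoss(x1, x2, y) with y=1 penalizes
# max(0, -(x1 - x2) + margin), i.e. it pushes each correct logit above every wrong
# logit of the same question by at least the margin.
# ---------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F

num_choice = 3
logits = torch.tensor([[2.0, 0.5, 0.1],
                       [0.3, 0.2, 1.5]])   # (batch_size, num_choice)
labels = torch.tensor([0, 2])              # index of the correct choice per question

flat_logits = logits.view(-1)                                      # (batch_size * num_choice,)
correct_mask = F.one_hot(labels, num_classes=num_choice).view(-1)
correct = flat_logits[correct_mask == 1]                           # (batch_size,)
correct = correct.view(-1, 1).expand(-1, num_choice - 1).reshape(-1)
wrong = flat_logits[correct_mask == 0]                             # (batch_size * (num_choice - 1),)
y = wrong.new_ones(wrong.size(0))                                  # "x1 should rank higher than x2"

loss = nn.MarginRankingLoss(margin=0.1)(correct, wrong, y)
print(loss)  # tensor(0.) -- both correct logits already beat every wrong logit by > 0.1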
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    args.warmup_steps = t_total // 100  # warm up over the first 1% of optimization steps

    # Prepare optimizer and schedule (linear warmup and decay)
    optimizer_grouped_parameters = get_param_groups(args, model)
    optimizer = RAdam(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        # model = torch.nn.DataParallel(model)
        model = DataParallelModel(model)

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    args.logging_steps = len(train_dataloader)  # log and save once per epoch
    args.save_steps = args.logging_steps

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    set_seed(args)
    for _ in train_iterator:
        args.current_epoch = _
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,
                # XLM and RoBERTa don't use segment_ids
                # 'labels': batch[3]
            }
            outputs = model(**inputs)
            # DataParallelModel returns one output tuple per GPU; keep only the logits
            outputs = [outputs[i][0] for i in range(len(outputs))]
            loss_fct = CrossEntropyLoss()
            loss_fct = DataParallelCriterion(loss_fct)
            loss = loss_fct(outputs, batch[3])

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics; only evaluate on a single GPU, otherwise metrics may not average well
                    if args.local_rank == -1 and args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(model, 'module') else model
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()
    return global_step, tr_loss / global_step
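# ---------------------------------------------------------------------------------
# WarmupLinearSchedule (from the old pytorch-transformers package, used throughout
# this file) is essentially a LambdaLR: the learning rate ramps linearly from 0 to
# the base LR over warmup_steps, then decays linearly back to 0 at t_total. A
# minimal equivalent sketch for readers on newer library versions; the model and
# the step counts below are illustrative only.
# ---------------------------------------------------------------------------------
import torch

model = torch.nn.Linear(4, 2)  # stand-in model
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

warmup_steps, t_total = 100, 1000  # illustrative values

def lr_lambda(step):
    # multiplicative factor applied to the base LR at each optimization step
    if step < warmup_steps:
        return float(step) / float(max(1, warmup_steps))
    return max(0.0, float(t_total - step) / float(max(1.0, t_total - warmup_steps)))

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
# usage: call scheduler.step() once after each optimizer.step()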
def main():
    args = init_args()
    device, n_gpu = U.get_device(logger)
    output_dir = U.create_save_path(args, __file__)
    run_details_file = os.path.join(output_dir, "run_details.txt")
    # tb_dir = os.path.join(output_dir, "all_scalars.json")
    tb_writer = SummaryWriter(output_dir)
    special_tokens_dict = {
        "additional_special_tokens": [
            '[s:genre]', '[s:artist]', '[s:year]', '[s:album]', '[s:song_name]', '[s:lyrics]',
            '[e:genre]', '[e:artist]', '[e:year]', '[e:album]', '[e:song_name]', '[e:lyrics]'
        ]
    }
    U.log_arguments(run_details_file, args, special_tokens_dict["additional_special_tokens"])

    # Initialise model & tokenizer
    enc = GPT2Tokenizer.from_pretrained(args.model_size)
    enc.add_special_tokens(special_tokens_dict)
    model = GPT2LMHeadModel.from_pretrained(args.model_size)
    model.resize_token_embeddings(len(enc))  # make room for the added special tokens

    # Prepare training data
    train_data_loader = prepare_train_data(args, enc, device)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        'weight_decay': args.weight_decay
    }, {
        'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimization_steps = ((len(train_data_loader) * args.num_train_epochs) //
                          (args.train_batch_size * args.gradient_accumulation_steps)) + 1000
    # TODO: Could use NVIDIA Apex for lower precision calculations.
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=optimization_steps)

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # @                           FINE-TUNE GPT2                              @
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    if args.train_model:
        logger.info("\nFine-tuning GPT2")
        print("To visualise data using TensorBoardX -> type in console:\n"
              "tensorboard --logdir={}".format(output_dir))
        model.to(device)
        model.train()
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            past = None
            if epoch > 0:
                # Re-process dataset since the features dropout is random.
                train_data_loader = prepare_train_data(args, enc, device)
            for step, batch in enumerate(tqdm(train_data_loader, desc="Training")):
                tok_ids, tok_type_ids, pos_ids, att_mask, lm_labels = batch
                outputs = model(input_ids=tok_ids,
                                past=past,
                                attention_mask=att_mask,
                                token_type_ids=tok_type_ids,
                                position_ids=pos_ids,
                                labels=lm_labels)
                loss = outputs[0]
                # predicted_scores = outputs[1]
                # past = outputs[2]

                # Log the loss to TensorBoardX
                global_step = (epoch * len(train_data_loader)) + (step + 1)
                tb_writer.add_scalar('loss', loss.item(), global_step)

                # Normalise the loss (simulates averaging over a larger batch)
                loss = loss / args.gradient_accumulation_steps
                # `past` is never carried across steps here, so the graph need not be
                # retained; a plain backward() avoids holding extra memory.
                loss.backward()

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()

            if (epoch + 1) % args.save_every_n_epoch == 0:
                save_model_dir = U.make_dir(os.path.join(output_dir, "model_epoch_" + str(epoch + 1)))
                model.save_pretrained(save_model_dir)
                enc.save_pretrained(save_model_dir)

        tb_dir = os.path.join(output_dir, "all_scalars.json")
        tb_writer.export_scalars_to_json(tb_dir)
        tb_writer.close()

        # Save model and tokenizer to a directory
        save_model_dir = U.make_dir(os.path.join(output_dir, "model_epoch_" + str(epoch + 1)))
        model.save_pretrained(save_model_dir)
        enc.save_pretrained(save_model_dir)
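# ---------------------------------------------------------------------------------
# Why the loss is divided by gradient_accumulation_steps above: accumulating k
# scaled micro-batch gradients before optimizer.step() reproduces the gradient of
# one k-times-larger batch (for a mean-reduced loss). A self-contained check with
# made-up data and an illustrative linear layer:
# ---------------------------------------------------------------------------------
import torch

torch.manual_seed(0)
lin = torch.nn.Linear(3, 1)
x, y = torch.randn(8, 3), torch.randn(8, 1)

# one big batch of 8
lin.zero_grad()
torch.nn.functional.mse_loss(lin(x), y).backward()
big_batch_grad = lin.weight.grad.clone()

# two accumulated micro-batches of 4, each loss divided by k = 2
lin.zero_grad()
for xb, yb in ((x[:4], y[:4]), (x[4:], y[4:])):
    (torch.nn.functional.mse_loss(lin(xb), yb) / 2).backward()

print(torch.allclose(big_batch_grad, lin.weight.grad))  # True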
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        'weight_decay': args.weight_decay
    }, {
        'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3]}
            if args.model_type != 'distilbert':
                # XLM, DistilBERT and RoBERTa don't use segment_ids
                inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0 and not args.tpu:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Log metrics; only evaluate on a single GPU, otherwise metrics may not average well
                    if args.local_rank == -1 and args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(model, 'module') else model
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.tpu:
                args.xla_model.optimizer_step(optimizer, barrier=True)
                model.zero_grad()
                global_step += 1

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()
    return global_step, tr_loss / global_step
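# ---------------------------------------------------------------------------------
# Resuming from a checkpoint written by the save block above: save_pretrained()
# makes the directory loadable with from_pretrained(), and training_args.bin holds
# the pickled args namespace. A sketch assuming the pytorch-transformers era API
# used throughout this file; the checkpoint path, model class, and tokenizer name
# are illustrative assumptions, not values from the code above.
# ---------------------------------------------------------------------------------
import os
import torch
from pytorch_transformers import BertForSequenceClassification, BertTokenizer

checkpoint_dir = 'output/checkpoint-500'  # illustrative path
model = BertForSequenceClassification.from_pretrained(checkpoint_dir)
train_args = torch.load(os.path.join(checkpoint_dir, 'training_args.bin'))
# The loop above saves only the model and args per checkpoint, so the tokenizer is
# reloaded from the original pretrained weights ('bert-base-uncased' is a placeholder).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')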
def train(self, train_dataset, output_dir, show_running_loss=True):
    """
    Trains the model on train_dataset.

    Utility function to be used by the train_model() method. Not intended to be used directly.
    """
    tokenizer = self.tokenizer
    device = self.device
    model = self.model
    args = self.args

    tb_writer = SummaryWriter()
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"])

    t_total = len(train_dataloader) // args["gradient_accumulation_steps"] * args["num_train_epochs"]

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         "weight_decay": args["weight_decay"]},
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0}
    ]

    # warmup_ratio is only used when warmup_steps is not set explicitly
    warmup_steps = math.ceil(t_total * args["warmup_ratio"])
    args["warmup_steps"] = warmup_steps if args["warmup_steps"] == 0 else args["warmup_steps"]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args["learning_rate"], eps=args["adam_epsilon"])
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args["warmup_steps"], t_total=t_total)

    if args["fp16"]:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args["fp16_opt_level"])

    if args["n_gpu"] > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args["num_train_epochs"]), desc="Epoch", disable=args["silent"])
    model.train()
    for _ in train_iterator:
        # epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args["silent"])
        for step, batch in enumerate(tqdm(train_dataloader, desc="Current iteration", disable=args["silent"])):
            batch = tuple(t.to(device) for t in batch)
            inputs = self._get_inputs_dict(batch)
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)

            if show_running_loss and not args["silent"]:
                print("\rRunning loss: %f" % loss, end="")

            if args["gradient_accumulation_steps"] > 1:
                loss = loss / args["gradient_accumulation_steps"]

            if args["fp16"]:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args["max_grad_norm"])
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args["max_grad_norm"])

            tr_loss += loss.item()
            if (step + 1) % args["gradient_accumulation_steps"] == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args["logging_steps"] > 0 and global_step % args["logging_steps"] == 0:
                    # Log metrics; only evaluate on a single GPU, otherwise metrics may not average well
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args["logging_steps"], global_step)
                    logging_loss = tr_loss

                if args["save_steps"] > 0 and global_step % args["save_steps"] == 0:
                    # Save model checkpoint
                    output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir_current):
                        os.makedirs(output_dir_current)
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(model, "module") else model
                    model_to_save.save_pretrained(output_dir_current)
                    self.tokenizer.save_pretrained(output_dir_current)

    return global_step, tr_loss / global_step
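# ---------------------------------------------------------------------------------
# _get_inputs_dict is referenced above but not defined in this excerpt. A
# hypothetical sketch of what it likely does, following the batch layout used by
# the other training loops in this file (input_ids, attention_mask,
# token_type_ids, labels). This is an assumption for illustration, not the
# library's actual implementation.
# ---------------------------------------------------------------------------------
def _get_inputs_dict(self, batch):
    # Map the positional batch tuple onto the keyword arguments expected by
    # pytorch-transformers sequence classification models.
    return {
        "input_ids": batch[0],
        "attention_mask": batch[1],
        "token_type_ids": batch[2],
        "labels": batch[3],
    }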