def predict(args, model, tokenizer, prefix=""):
    pred_output_dir = args.output_dir
    if not os.path.exists(pred_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(pred_output_dir)
    test_dataset = load_and_cache_examples(args, args.task_name, tokenizer, data_type='test')
    # Note that DistributedSampler samples randomly
    test_sampler = SequentialSampler(test_dataset) if args.local_rank == -1 else DistributedSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset,
                                 sampler=test_sampler,
                                 batch_size=1,
                                 collate_fn=collate_fn)
    # Eval!
    logger.info("***** Running prediction %s *****", prefix)
    logger.info("  Num examples = %d", len(test_dataset))
    logger.info("  Batch size = %d", 1)
    results = []
    output_predict_file = os.path.join(pred_output_dir, prefix, "test_prediction.json")
    pbar = ProgressBar(n_total=len(test_dataloader), desc="Predicting")
    if isinstance(model, nn.DataParallel):
        model = model.module
    for step, batch in enumerate(test_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": None
            }
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
        logits = outputs[0]
        if args.use_crf:
            tags = model.crf.decode(logits, inputs['attention_mask'])
            tags = tags.squeeze(0).cpu().numpy().tolist()
        else:
            tags = np.argmax(logits.cpu().numpy(), axis=2).tolist()
        preds = tags[0][1:-1]  # drop [CLS] and [SEP]
        label_entities = get_entities(preds, args.id2label, args.markup)
        json_d = {}
        json_d['id'] = step
        json_d['tag_seq'] = " ".join([args.id2label[x] for x in preds])
        json_d['entities'] = label_entities
        results.append(json_d)
        pbar(step)
    logger.info("\n")
    with open(output_predict_file, "w") as writer:
        for record in results:
            writer.write(json.dumps(record) + '\n')
    if args.task_name == 'cluener':
        output_submit_file = os.path.join(pred_output_dir, prefix, "test_submit.json")
        test_text = []
        with open(os.path.join(args.data_dir, "test.json"), 'r') as fr:
            for line in fr:
                test_text.append(json.loads(line))
        test_submit = []
        for x, y in zip(test_text, results):
            json_d = {}
            json_d['id'] = x['id']
            json_d['label'] = {}
            entities = y['entities']
            words = list(x['text'])
            if len(entities) != 0:
                for subject in entities:
                    tag = subject[0]
                    start = subject[1]
                    end = subject[2]
                    word = "".join(words[start:end + 1])
                    if tag in json_d['label']:
                        if word in json_d['label'][tag]:
                            json_d['label'][tag][word].append([start, end])
                        else:
                            json_d['label'][tag][word] = [[start, end]]
                    else:
                        json_d['label'][tag] = {}
                        json_d['label'][tag][word] = [[start, end]]
            test_submit.append(json_d)
        json_to_text(output_submit_file, test_submit)
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate_fn)
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    # Prepare optimizer and schedule (linear warmup and decay)
    if "bert" in args.model_type:
        no_decay = ["bias", "LayerNorm.weight"]
        bert_param_optimizer = list(model.bert.named_parameters())
        linear_param_optimizer = list(model.classifier.named_parameters())
        optimizer_grouped_parameters = [
            {'params': [p for n, p in bert_param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': args.weight_decay, 'lr': args.learning_rate},
            {'params': [p for n, p in bert_param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0, 'lr': args.learning_rate},
            {'params': [p for n, p in linear_param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': args.weight_decay, 'lr': args.crf_learning_rate},
            {'params': [p for n, p in linear_param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0, 'lr': args.crf_learning_rate},
        ]
        if args.use_crf:
            crf_param_optimizer = list(model.crf.named_parameters())
            optimizer_grouped_parameters.extend([
                {'params': [p for n, p in crf_param_optimizer if not any(nd in n for nd in no_decay)],
                 'weight_decay': args.weight_decay, 'lr': args.crf_learning_rate},
                {'params': [p for n, p in crf_param_optimizer if any(nd in n for nd in no_decay)],
                 'weight_decay': 0.0, 'lr': args.crf_learning_rate},
            ])
        if args.use_lstm:
            lstm_param_optimizer = list(model.bilstm.named_parameters())
            optimizer_grouped_parameters.extend([
                {'params': [p for n, p in lstm_param_optimizer if not any(nd in n for nd in no_decay)],
                 'weight_decay': args.weight_decay, 'lr': args.crf_learning_rate},
                {'params': [p for n, p in lstm_param_optimizer if any(nd in n for nd in no_decay)],
                 'weight_decay': 0.0, 'lr': args.crf_learning_rate},
            ])
    else:
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
             "weight_decay": args.weight_decay},
            {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
             "weight_decay": 0.0},
        ]
    args.warmup_steps = int(t_total * args.warmup_proportion)
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=args.warmup_steps,
                                                num_training_steps=t_total)
    # scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=3,
    #                               verbose=1, epsilon=1e-4, cooldown=0, min_lr=0, eps=1e-8)
    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
            os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    logger.info("  model type = %s", args.model_type)
    global_step = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path) and "checkpoint" in args.model_name_or_path:
        # set global_step to the global_step of the last saved checkpoint from the model path
        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
        logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    seed_everything(args.seed)  # Added here for reproducibility (even between python 2 and 3)
    for _ in range(int(args.num_train_epochs)):
        pbar = ProgressBar(n_total=len(train_dataloader), desc='Training')
        for step, batch in enumerate(train_dataloader):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3],
                'input_lens': batch[4]
            }
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            pbar(step, {'loss': loss.item()})
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    print(" ")
                    if args.local_rank == -1:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        evaluate(args, model, tokenizer)
                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    # model_to_save.save_pretrained(output_dir)
                    save_model(model_to_save, output_dir)
                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)
                    tokenizer.save_vocabulary(output_dir)
                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)
        logger.info("\n")
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()
    return global_step, tr_loss / global_step
def evaluate(args, model, tokenizer, prefix=""):
    metric = SeqEntityScore(args.id2label, markup=args.markup)
    eval_output_dir = args.output_dir
    if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir)
    eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, data_type='dev')
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size,
                                 collate_fn=collate_fn)
    # Eval!
    logger.info("***** Running evaluation %s *****", prefix)
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    pbar = ProgressBar(n_total=len(eval_dataloader), desc="Evaluating")
    if isinstance(model, nn.DataParallel):
        model = model.module
    for step, batch in enumerate(eval_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3],
                'input_lens': batch[4]
            }
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
        tmp_eval_loss, logits = outputs[:2]
        if args.use_crf:
            preds = model.crf.decode(logits, inputs['attention_mask'])
            preds = preds.squeeze(0).cpu().numpy().tolist()
        else:
            preds = np.argmax(logits.cpu().numpy(), axis=2).tolist()
        if args.n_gpu > 1:
            tmp_eval_loss = tmp_eval_loss.mean()  # mean() to average on multi-gpu parallel evaluating
        eval_loss += tmp_eval_loss.item()
        nb_eval_steps += 1
        out_label_ids = inputs['labels'].cpu().numpy().tolist()
        input_lens = inputs['input_lens'].cpu().numpy().tolist()
        for i, label in enumerate(out_label_ids):
            temp_1 = []
            temp_2 = []
            for j, m in enumerate(label):
                if j == 0:
                    # skip [CLS]
                    continue
                elif j == input_lens[i] - 1:
                    # stop at [SEP]
                    metric.update(pred_paths=[temp_2], label_paths=[temp_1])
                    break
                else:
                    temp_1.append(args.id2label[out_label_ids[i][j]])
                    temp_2.append(args.id2label[preds[i][j]])
        pbar(step)
    logger.info("\n")
    eval_loss = eval_loss / nb_eval_steps
    eval_info, entity_info = metric.result()
    results = {f'{key}': value for key, value in eval_info.items()}
    results['loss'] = eval_loss
    logger.info("***** Eval results %s *****", prefix)
    info = "-".join([f' {key}: {value:.4f} ' for key, value in results.items()])
    logger.info(info)
    logger.info("***** Entity results %s *****", prefix)
    for key in sorted(entity_info.keys()):
        logger.info("******* %s results ********" % key)
        info = "-".join([f' {key}: {value:.4f} ' for key, value in entity_info[key].items()])
        logger.info(info)
    return results
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    writer = SummaryWriter(args.summary_dir)
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate_fn)
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    bert_param_optimizer = list(model.bert.named_parameters())
    crf_param_optimizer = list(model.crf.named_parameters())
    linear_param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [
        {'params': [p for n, p in bert_param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay, 'lr': args.learning_rate},
        {'params': [p for n, p in bert_param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0, 'lr': args.learning_rate},
        {'params': [p for n, p in crf_param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay, 'lr': args.crf_learning_rate},
        {'params': [p for n, p in crf_param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0, 'lr': args.crf_learning_rate},
        {'params': [p for n, p in linear_param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay, 'lr': args.crf_learning_rate},
        {'params': [p for n, p in linear_param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0, 'lr': args.crf_learning_rate}
    ]
    if args.warmup_steps == -1:
        args.warmup_steps = int(t_total * args.warmup_proportion)
    # args.warmup_steps = int(442)
    # optimizer = RAdam(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=args.warmup_steps,
                                                num_training_steps=t_total)
    # scheduler = lr_scheduler.MultiStepLR(optimizer, [1 * len(train_dataloader), 4 * len(train_dataloader)], 0.5)
    # scheduler = ReduceLROnPlateau(optimizer, 'min')
    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
            os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    global_step = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path) and "checkpoint" in args.model_name_or_path:
        # set global_step to the global_step of the last saved checkpoint from the model path
        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
        logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
    best_eval_f1 = 0
    tr_loss, logging_loss = 0.0, 0.0
    if args.do_adv:
        fgm = FGM(model, emb_name=args.adv_name, epsilon=args.adv_epsilon)
    model.zero_grad()
    seed_everything(args.seed)  # Added here for reproducibility (even between python 2 and 3)
    for epoch in range(int(args.num_train_epochs)):
        pbar = ProgressBar(n_total=len(train_dataloader), desc='Training')
        for step, batch in enumerate(train_dataloader):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3],
                'input_lens': batch[4]
            }
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            if args.do_adv:
                # FGM adversarial training: perturb embeddings, backprop the adversarial loss, then restore
                fgm.attack()
                loss_adv = model(**inputs)[0]
                if args.n_gpu > 1:
                    loss_adv = loss_adv.mean()
                loss_adv.backward()
                fgm.restore()
            cur_lr = optimizer.state_dict()['param_groups'][0]['lr']
            # cur_lr = scheduler.get_lr()
            pbar(step, {'epoch': int(epoch), 'loss': loss.item(), 'lr': cur_lr})
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                writer.add_scalar('loss', loss.item(), global_step=global_step)
                writer.add_scalar('lr', cur_lr, global_step=global_step)
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    print(" ")
                    if args.local_rank == -1:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        prefix = f'epoch {epoch} global_step {global_step}'
                        results = evaluate(args, model, tokenizer, prefix=prefix)
                        if results['f1'] > best_eval_f1:
                            best_eval_f1 = results['f1']
                            output_dir = os.path.join(args.output_dir, "best_eval_checkpoint")
                            if not os.path.exists(output_dir):
                                os.makedirs(output_dir)
                            model_to_save = (
                                model.module if hasattr(model, "module") else model
                            )  # Take care of distributed/parallel training
                            model_to_save.save_pretrained(output_dir)
                            torch.save(args, os.path.join(output_dir, "training_args.bin"))
                            logger.info("Saving model checkpoint to %s", output_dir)
                            tokenizer.save_vocabulary(output_dir)
                            torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                            torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                            logger.info("Saving optimizer and scheduler states to %s", output_dir)
                            results['checkpoint'] = global_step
                            results['epoch'] = epoch
                            # results.update(vars(args))
                            with open(os.path.join(output_dir, 'eval_result.txt'), 'w') as f:
                                json.dump(results, f, ensure_ascii=False, indent=4)
                # if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                #     # Save model checkpoint
                #     output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                #     if not os.path.exists(output_dir):
                #         os.makedirs(output_dir)
                #     model_to_save = (
                #         model.module if hasattr(model, "module") else model
                #     )  # Take care of distributed/parallel training
                #     model_to_save.save_pretrained(output_dir)
                #     torch.save(args, os.path.join(output_dir, "training_args.bin"))
                #     logger.info("Saving model checkpoint to %s", output_dir)
                #     tokenizer.save_vocabulary(output_dir)
                #     torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                #     torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                #     logger.info("Saving optimizer and scheduler states to %s", output_dir)
        logger.info("\n")
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()
    return global_step, tr_loss / global_step
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    # total batch size = number of GPUs * per-GPU batch size
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    # random sampling for single-process training, distributed sampling when a local rank is set
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    # build the dataloader
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate_fn)
    # total number of training steps
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    # Prepare optimizer and schedule (linear warmup and decay)
    # Parameters matching no_decay are excluded from weight decay.
    # BatchNorm fixes a channel C and normalizes over [B, H, W] (to zero mean, unit variance), which suits CNNs.
    # LayerNorm fixes a sample N and normalizes over [C, H, W], which suits RNNs: BatchNorm assumes a
    # feed-forward network of fixed depth, whereas an RNN's effective depth varies with the input sequence
    # length, so BatchNorm is a poor fit; LayerNorm depends on neither the batch size nor the sequence
    # length, so it also works with batch size 1 and variable-length sequences.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         "weight_decay": args.weight_decay},
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
    ]
    # set up the optimizer and the linear warmup schedule
    args.warmup_steps = int(t_total * args.warmup_proportion)
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=args.warmup_steps,
                                                num_training_steps=t_total)
    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
            os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
    # enable FP16 training
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
    # With multiple GPUs, torch.nn.DataParallel automatically spreads the batch over all visible GPUs.
    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
    # Train! Log the run configuration.
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    global_step = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path) and "checkpoint" in args.model_name_or_path:
        # set global_step to the global_step of the last saved checkpoint from the model path
        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
        logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
    tr_loss, logging_loss = 0.0, 0.0
    # whether to enable adversarial training (FGM)
    if args.do_adv:
        fgm = FGM(model, emb_name=args.adv_name, epsilon=args.adv_epsilon)
    model.zero_grad()
    seed_everything(args.seed)  # Added here for reproducibility (even between python 2 and 3)
    for _ in range(int(args.num_train_epochs)):
        pbar = ProgressBar(n_total=len(train_dataloader), desc='Training')
        for step, batch in enumerate(train_dataloader):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3]
            }
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            # forward pass
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            if args.do_adv:
                fgm.attack()
                loss_adv = model(**inputs)[0]
                if args.n_gpu > 1:
                    loss_adv = loss_adv.mean()
                loss_adv.backward()
                fgm.restore()
            pbar(step, {'loss': loss.item()})
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                # gradient clipping
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()   # optimizer update
                scheduler.step()   # update learning rate schedule
                model.zero_grad()  # clear gradients so they do not accumulate across steps
                global_step += 1
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    print(" ")
                    if args.local_rank == -1:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        evaluate(args, model, tokenizer)
                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = (model.module if hasattr(model, "module") else model)
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    tokenizer.save_vocabulary(output_dir)
                    logger.info("Saving model checkpoint to %s", output_dir)
                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)
        logger.info("\n")
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()
    return global_step, tr_loss / global_step
def create_training_instances(input_file, tokenizer, max_seq_len, short_seq_prob,
                              max_ngram, masked_lm_prob, max_predictions_per_seq):
    """Create `TrainingInstance`s from raw text."""
    all_documents = [[]]
    # Input file format:
    # (1) One sentence per line. These should ideally be actual sentences, not
    #     entire paragraphs or arbitrary spans of text. (Because we use the
    #     sentence boundaries for the "next sentence prediction" task).
    # (2) Blank lines between documents. Document boundaries are needed so
    #     that the "next sentence prediction" task doesn't span between documents.
    with open(input_file, 'r') as f:
        lines = f.readlines()
    pbar = ProgressBar(n_total=len(lines), desc='read data')
    for line_cnt, line in enumerate(lines):
        line = line.strip()
        # Empty lines are used as document delimiters
        if not line:
            all_documents.append([])
        tokens = tokenizer.tokenize(line)
        if tokens:
            all_documents[-1].append(tokens)
        pbar(step=line_cnt)
    print(' ')
    # Remove empty documents
    all_documents = [x for x in all_documents if x]
    random.shuffle(all_documents)
    vocab_words = list(tokenizer.vocab.keys())
    instances = []
    pbar = ProgressBar(n_total=len(all_documents), desc='create instances')
    for document_index in range(len(all_documents)):
        instances.extend(
            create_instances_from_document(all_documents, document_index, max_seq_len,
                                           short_seq_prob, max_ngram, masked_lm_prob,
                                           max_predictions_per_seq, vocab_words))
        pbar(step=document_index)
    print(' ')
    # Log the first few instances as examples
    ex_idx = 0
    while ex_idx < 5:
        instance = instances[ex_idx]
        logger.info("-------------------------Example-----------------------")
        logger.info(f"id: {ex_idx}")
        logger.info(f"original tokens: {' '.join([str(x) for x in instance['original_tokens']])}")
        logger.info(f"tokens: {' '.join([str(x) for x in instance['tokens']])}")
        logger.info(f"masked_lm_labels: {' '.join([str(x) for x in instance['masked_lm_labels']])}")
        logger.info(f"segment_ids: {' '.join([str(x) for x in instance['segment_ids']])}")
        logger.info(f"masked_lm_positions: {' '.join([str(x) for x in instance['masked_lm_positions']])}")
        logger.info(f"is_random_next : {instance['is_random_next']}")
        ex_idx += 1
    random.shuffle(instances)
    return instances
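# The comment block in create_training_instances() describes the expected input-file
# format (one sentence per line, a blank line between documents). The helper below is
# a hedged usage sketch that makes the convention concrete; it is not part of the
# original pipeline. The file name, the sample sentences, and the hyper-parameter
# values are assumptions chosen only for illustration, and the tokenizer is assumed
# to be a slow BERT-style tokenizer exposing .tokenize() and .vocab.
def _demo_create_training_instances(tokenizer, tmp_path="demo_corpus.txt"):
    # Two documents: the blank line marks the document boundary, so masked-LM /
    # next-sentence pairs never cross it.
    sample = (
        "The cat sat on the mat.\n"
        "It purred quietly.\n"
        "\n"
        "Stock prices rose sharply on Monday.\n"
    )
    with open(tmp_path, "w") as f:
        f.write(sample)
    # Illustrative hyper-parameters only; the real values come from the training args.
    return create_training_instances(tmp_path, tokenizer,
                                      max_seq_len=128, short_seq_prob=0.1,
                                      max_ngram=3, masked_lm_prob=0.15,
                                      max_predictions_per_seq=20)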
def train(args, train_dataset, label_lists, model, tokenizer):
    """ Train the model """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate_fn_ner)
    if args.max_steps > 0:
        num_training_steps = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        num_training_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    args.warmup_steps = int(num_training_steps * args.warmup_proportion)
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    # optimizer = Lamb(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    optimizer = AdamW(params=optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=args.warmup_steps,
                                                num_training_steps=num_training_steps)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", num_training_steps)
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    seed_everything(args.seed)  # Added here for reproducibility (even between python 2 and 3)
    for epoch in range(int(args.num_train_epochs)):
        pbar = ProgressBar(n_total=len(train_dataloader), desc='Training')
        for step, batch in enumerate(train_dataloader):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[3]
            }
            inputs['token_type_ids'] = batch[2]
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        evaluate(args, model, tokenizer, label_lists)
                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir,
                                              'checkpoint-epoch{0}-step{1}'.format(epoch, global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)
            pbar(step, {'loss': loss.item()})
        print(" ")
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()
    return global_step, tr_loss / global_step
def evaluate(args, model, tokenizer, label_lists, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name, )
    eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir, )
    results = {}
    logger.info("***** Evaluate *****")
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, data_type='dev')
        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)
        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size,
                                     collate_fn=collate_fn_ner)
        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        pbar = ProgressBar(n_total=len(eval_dataloader), desc="Evaluating")
        for step, batch in enumerate(eval_dataloader):
            now = datetime.datetime.now()
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)
            with torch.no_grad():
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'labels': batch[3]
                }
                inputs['token_type_ids'] = batch[2]
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = np.argmax(logits.detach().cpu().numpy(), axis=2)
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds_argmax = np.argmax(logits.detach().cpu().numpy(), axis=2)
                preds = collate_pred(preds, preds_argmax, label_lists)
                out_label_ids = collate_pred(out_label_ids,
                                             inputs['labels'].detach().cpu().numpy(),
                                             label_lists)
            pbar(step)
            delta = (datetime.datetime.now() - now).total_seconds() * 1000  # elapsed time in milliseconds
            logger.info("*** Evaluating timecost, input length %d, timecost %d"
                        % (len(batch[0]), delta))
        print(' ')
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()
        evaluater = NerAccuracyEvaluator(label_lists, "WORD")
        result = evaluater.evaluate(preds, out_label_ids, args.label_with_bi)
        results.update(result)
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
    # Store evaluate results
    if args.do_eval and args.output_eval:
        print_eval_output(args.output_dir, preds, out_label_ids, label_lists)
    return results
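# Note on the per-step timing in evaluate(): `timedelta.microseconds` holds only the
# sub-second component of a duration, so it under-reports any batch that takes longer
# than one second, whereas `total_seconds()` returns the full elapsed time. The helper
# below is a small self-contained illustration (the duration is made up), not part of
# the original code.
def _demo_timedelta_milliseconds():
    import datetime
    start = datetime.datetime(2020, 1, 1, 0, 0, 0)
    end = start + datetime.timedelta(seconds=2, milliseconds=350)  # hypothetical 2.35 s step
    delta = end - start
    print(delta.microseconds / 1000)     # 350.0  -> drops the two whole seconds
    print(delta.total_seconds() * 1000)  # 2350.0 -> the actual elapsed milliseconds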