def main():
    args = get_argparse().parse_args()

    # Directory for saved models
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    args.output_dir = os.path.join(args.output_dir, args.model_type)
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    # CUDA, GPU
    if torch.cuda.is_available() and not args.no_cuda:
        args.device = torch.device("cuda:0")
    else:
        args.device = torch.device("cpu")

    # Log all arguments
    time_ = time.strftime("%Y-%m-%d", time.localtime())
    init_logger(log_file=os.path.join(args.output_dir, f'{args.model_type}-{args.task_name}-{time_}.log'))
    logger.info("=" * 20 + " args " + "=" * 20)
    for para in args.__dict__:
        logger.info("%s = %s", para, str(args.__dict__[para]))

    # Set seed
    seed_everything(args.seed)

    # Prepare the NER task
    processor = NerProcessor()
    label_list = processor.get_labels()
    args.id2label = {i: label for i, label in enumerate(label_list)}
    args.label2id = {label: i for i, label in enumerate(label_list)}
    num_labels = len(label_list)
    num_labels = int((num_labels + 1) / 2)  # each B-/I- pair collapses to one span label
    pad_token_label_id = CrossEntropyLoss().ignore_index  # -100

    # Load pretrained model and tokenizer
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    setattr(config, 'soft_label', args.soft_label)
    model = model_class(config=config)

    # Training
    if args.do_train:
        if args.continue_train:
            model = model_class.from_pretrained(args.continue_train_checkpoint, config=config)
            logger.info(f"Continue training from {args.continue_train_checkpoint}")
        # Otherwise initialize the base encoder from the pretrained weights
        elif args.model_type == "electra":
            model.BaseModel = ElectraModel.from_pretrained(args.model_name_or_path)
            logger.info(f"Loading Electra from {args.model_name_or_path}...")
        elif args.model_type == "bert":
            model.BaseModel = BertModel.from_pretrained(args.model_name_or_path)
            logger.info(f"Loading Bert from {args.model_name_or_path}...")
        elif args.model_type == "albert":
            model.BaseModel = AlbertModel.from_pretrained(args.model_name_or_path)
            logger.info(f"Loading Albert from {args.model_name_or_path}...")
        logger.info(model)
        model.to(args.device)
        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, label_list,
                                                pad_token_label_id, data_type='train')
        global_step, tr_loss = train(args, train_dataset, model, tokenizer, label_list,
                                     pad_token_label_id)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

        # Save the final model, tokenizer and training args
        logger.info("Saving model checkpoint to %s", args.output_dir)
        model.save_pretrained(args.output_dir)
        tokenizer.save_vocabulary(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

    # Prediction on the test set
    if args.do_predict:
        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        logger.info("Predict the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
            model = model_class.from_pretrained(checkpoint, config=config)
            model.to(args.device)
            predict(args, model, tokenizer, label_list, pad_token_label_id, prefix=prefix)
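Both `init_logger` and `seed_everything` above are repo-local helpers whose definitions are not part of this listing. As a point of reference only, a minimal `seed_everything` could look like this sketch (an assumption about its behavior, not the repo's actual code):

import random
import numpy as np
import torch

def seed_everything(seed: int) -> None:
    # Assumed behavior: seed Python, NumPy and PyTorch (CPU and all GPUs)
    # so that training runs are reproducible.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)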
def predict(args, model, tokenizer, label_list, pad_token_label_id, prefix=""):
    # Span-pointer variant: predicts start/end positions and converts them to BIO tags.
    pred_output_dir = args.output_dir
    if not os.path.exists(pred_output_dir):
        os.makedirs(pred_output_dir)
    test_dataset = load_and_cache_examples(args, args.task_name, tokenizer, label_list,
                                           pad_token_label_id, data_type='test')
    # Note that DistributedSampler samples randomly
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=1,
                                 collate_fn=collate_fn)

    # Collapse B-/I- tags into plain span labels, e.g. B-PER/I-PER -> PER
    span_labels = []
    for label in label_list:
        label = label.split('-')[-1]
        if label not in span_labels:
            span_labels.append(label)
    span_map = {i: label for i, label in enumerate(span_labels)}

    # Eval
    logger.info("***** Running prediction %s *****", prefix)
    logger.info("  Num examples = %d", len(test_dataset))
    logger.info("  Batch size = %d", 1)

    results = []         # all test predictions
    error_results = []   # examples with at least one wrong tag
    true_labels = []     # gold label sequences
    predict_labels = []  # predicted label sequences
    output_predict_file = os.path.join(pred_output_dir, prefix, "test_prediction.txt")
    error_predict_file = os.path.join(pred_output_dir, prefix, "Error_test_prediction.txt")
    pbar = ProgressBar(n_total=len(test_dataloader), desc="Predicting")
    if isinstance(model, torch.nn.DataParallel):  # unwrap DataParallel after multi-GPU training
        model = model.module
    for step, batch in enumerate(test_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0],
                      "attention_mask": batch[1],
                      "start_positions": batch[5],
                      "end_positions": batch[6]}
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
        tmp_eval_loss, start_logits, end_logits = outputs[:3]
        start_preds = start_logits.detach().cpu().numpy()
        end_preds = end_logits.detach().cpu().numpy()
        start_preds = np.argmax(start_preds, axis=2)
        end_preds = np.argmax(end_preds, axis=2)
        # batch_size is 1, so index 0 is the only example; [1:-1] strips [CLS]/[SEP]
        start_preds_list = [span_map[j] for j in start_preds[0][1:-1]]
        end_preds_list = [span_map[j] for j in end_preds[0][1:-1]]
        batch_true_labels = batch[3].squeeze(0).cpu().numpy().tolist()[1:-1]
        batch_true_labels = [args.id2label.get(i) for i in batch_true_labels]
        true_labels.append(batch_true_labels)
        batch_predict_labels = convert_span_to_bio([start_preds_list], [end_preds_list])
        predict_labels.extend(batch_predict_labels)
        input_ids = inputs["input_ids"].squeeze(0).cpu().numpy().tolist()[1:-1]

        # Write "token gold predicted" lines for this sentence
        sent = ""
        if_error = False
        for input_id, pred, gold in zip(input_ids, batch_predict_labels[0], batch_true_labels):
            sent += " ".join([tokenizer.ids_to_tokens[input_id], gold, pred]) + "\n"
            if gold != pred:
                if_error = True
        sent += "\n"
        results.append(sent)
        if if_error:
            error_results.append(sent)
        pbar(step)

    # Test-set accuracy, precision, recall, F1
    logger.info("\nTest set results:")
    logger.info("accuracy: %s", str(accuracy_score(true_labels, predict_labels)))
    logger.info("p: %s", str(precision_score(true_labels, predict_labels)))
    logger.info("r: %s", str(recall_score(true_labels, predict_labels)))
    logger.info("f1: %s", str(f1_score(true_labels, predict_labels)))
    logger.info("classification report: ")
    logger.info(str(classification_report(true_labels, predict_labels, mode='strict', scheme=IOB2)))
    logger.info("\n")

    with open(output_predict_file, "w", encoding="utf-8") as writer:
        for record in results:
            writer.write(record)
    with open(error_predict_file, "w", encoding="utf-8") as writer:
        for record in error_results:
            writer.write(record)
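`convert_span_to_bio` is imported from elsewhere in the repo and not shown here. A plausible sketch of its contract, assuming it pairs each predicted start label with the nearest end position carrying the same label and emits BIO tags (hypothetical, not the author's implementation):

def convert_span_to_bio(start_preds_list, end_preds_list):
    # Assumed contract: both arguments are lists of per-token span-label
    # sequences (e.g. "PER" or "O"); the result is a list of BIO tag sequences.
    bio_sequences = []
    for starts, ends in zip(start_preds_list, end_preds_list):
        tags = ["O"] * len(starts)
        for i, label in enumerate(starts):
            if label == "O":
                continue
            # Find the nearest end at or after the start with a matching label.
            for j in range(i, len(ends)):
                if ends[j] == label:
                    tags[i] = "B-" + label
                    for k in range(i + 1, j + 1):
                        tags[k] = "I-" + label
                    break
        bio_sequences.append(tags)
    return bio_sequences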
def predict(args, model, tokenizer, label_list, pad_token_label_id, prefix=""):
    # CRF variant: decodes whole tag sequences with model.crf instead of start/end pointers.
    pred_output_dir = args.output_dir
    if not os.path.exists(pred_output_dir):
        os.makedirs(pred_output_dir)
    test_dataset = load_and_cache_examples(args, args.task_name, tokenizer, label_list,
                                           pad_token_label_id, data_type='test')
    # Note that DistributedSampler samples randomly
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=1,
                                 collate_fn=collate_fn)  # one example per batch

    # Eval
    logger.info("***** Running prediction %s *****", prefix)
    logger.info("  Num examples = %d", len(test_dataset))
    logger.info("  Batch size = %d", 1)

    results = []         # all test predictions
    error_results = []   # examples with at least one wrong tag
    true_labels = []     # gold label sequences
    predict_labels = []  # predicted label sequences
    output_predict_file = os.path.join(pred_output_dir, prefix, "test_prediction.txt")
    error_predict_file = os.path.join(pred_output_dir, prefix, "Error_test_prediction.txt")
    pbar = ProgressBar(n_total=len(test_dataloader), desc="Predicting")
    if isinstance(model, torch.nn.DataParallel):  # unwrap DataParallel after multi-GPU training
        model = model.module
    for step, batch in enumerate(test_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0],
                      "attention_mask": batch[1],
                      "labels": None,
                      "input_lens": batch[4]}
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
            logits = outputs[0]
            batch_predict_labels = model.crf.decode(logits, inputs['attention_mask'])
        batch_predict_labels = batch_predict_labels[0][1:-1]  # strip [CLS]/[SEP]; one example per batch
        batch_true_labels = batch[3].squeeze(0).cpu().numpy().tolist()[1:-1]
        input_ids = inputs["input_ids"].squeeze(0).cpu().numpy().tolist()[1:-1]

        # Write "token gold predicted" lines for this sentence
        sent = ""
        if_error = False
        for input_id, pred, gold in zip(input_ids, batch_predict_labels, batch_true_labels):
            sent += " ".join([tokenizer.ids_to_tokens[input_id],
                              args.id2label[gold],
                              args.id2label[pred]]) + "\n"
            if args.id2label[gold] != args.id2label[pred]:
                if_error = True
        sent += "\n"
        results.append(sent)
        if if_error:
            error_results.append(sent)
        pbar(step)

        # Accumulate label sequences for test-set metrics
        batch_true = [args.id2label.get(i) for i in batch_true_labels]
        batch_predict = [args.id2label.get(i) for i in batch_predict_labels]
        assert len(batch_true) == len(batch_predict)
        true_labels.append(batch_true)
        predict_labels.append(batch_predict)

    # Test-set accuracy, precision, recall, F1
    logger.info("\nTest set results:")
    logger.info("accuracy: %s", str(accuracy_score(true_labels, predict_labels)))
    logger.info("p: %s", str(precision_score(true_labels, predict_labels)))
    logger.info("r: %s", str(recall_score(true_labels, predict_labels)))
    logger.info("f1: %s", str(f1_score(true_labels, predict_labels)))
    logger.info("classification report: ")
    logger.info(str(classification_report(true_labels, predict_labels, mode='strict', scheme=IOB2)))
    logger.info("\n")

    with open(output_predict_file, "w", encoding="utf-8") as writer:
        for record in results:
            writer.write(record)
    with open(error_predict_file, "w", encoding="utf-8") as writer:
        for record in error_results:
            writer.write(record)
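The `model.crf.decode` call above belongs to the repo's CRF layer, which returns one best tag-id sequence per sentence. Its interface matches Viterbi decoding in the `pytorch-crf` package; the following self-contained illustration uses that package, under the assumption that the two behave alike:

import torch
from torchcrf import CRF  # pip install pytorch-crf

num_tags = 7  # e.g. O, B-PER, I-PER, B-LOC, I-LOC, B-ORG, I-ORG
crf = CRF(num_tags, batch_first=True)
emissions = torch.randn(1, 10, num_tags)       # (batch, seq_len, num_tags) logits
mask = torch.ones(1, 10, dtype=torch.bool)     # attention mask
best_paths = crf.decode(emissions, mask=mask)  # list of best tag-id sequences, one per example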
def evaluate(args, model, tokenizer, label_list, pad_token_label_id):
    # Span-pointer variant: scores start/end predictions converted to BIO on the dev set.
    eval_output_dir = args.output_dir
    if not os.path.exists(eval_output_dir):
        os.makedirs(eval_output_dir)
    eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, label_list,
                                           pad_token_label_id, data_type='dev')
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Collapse B-/I- tags into plain span labels, e.g. B-PER/I-PER -> PER
    span_labels = []
    for label in label_list:
        label = label.split('-')[-1]
        if label not in span_labels:
            span_labels.append(label)
    span_map = {i: label for i, label in enumerate(span_labels)}

    # Eval
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    true_labels = []
    predict_labels = []
    model.eval()
    pbar = ProgressBar(n_total=len(eval_dataloader), desc="Evaluating")
    for step, batch in enumerate(eval_dataloader):
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0],
                      "attention_mask": batch[1],
                      "start_positions": batch[5],
                      "end_positions": batch[6]}
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
        tmp_eval_loss, start_logits, end_logits = outputs[:3]
        if args.n_gpu > 1:
            tmp_eval_loss = tmp_eval_loss.mean()  # average on multi-GPU parallel evaluation
        eval_loss += tmp_eval_loss.item()
        nb_eval_steps += 1
        start_preds = start_logits.detach().cpu().numpy()  # (batch, seq_len, num_span_labels), e.g. (64, 128, 5)
        end_preds = end_logits.detach().cpu().numpy()
        start_preds = np.argmax(start_preds, axis=2)  # (batch, seq_len), e.g. (64, 128)
        end_preds = np.argmax(end_preds, axis=2)
        start_preds_list = []
        end_preds_list = []
        batch_true_labels = batch[4].squeeze(0).cpu().numpy().tolist()
        for index, input_length in enumerate(batch[3]):  # batch[3] holds per-sentence lengths
            start_preds_list.append([span_map[j] for j in start_preds[index][:input_length]][1:-1])
            end_preds_list.append([span_map[j] for j in end_preds[index][:input_length]][1:-1])
            batch_true = [args.id2label.get(i)
                          for i in batch_true_labels[index][:input_length]][1:-1]
            true_labels.append(batch_true)
        batch_predict_labels = convert_span_to_bio(start_preds_list, end_preds_list)
        predict_labels.extend(batch_predict_labels)
        pbar(step)

    logger.info("\n")
    logger.info("average eval_loss: %s", str(eval_loss / nb_eval_steps))
    logger.info("accuracy: %s", str(accuracy_score(true_labels, predict_labels)))
    logger.info("p: %s", str(precision_score(true_labels, predict_labels)))
    logger.info("r: %s", str(recall_score(true_labels, predict_labels)))
    logger.info("f1: %s", str(f1_score(true_labels, predict_labels)))
    logger.info("classification report: ")
    logger.info(str(classification_report(true_labels, predict_labels, mode='strict', scheme=IOB2)))
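The metric calls above follow the `seqeval` API (note the `mode='strict'`/`scheme=IOB2` signature), which scores at the entity level over lists of tag sequences rather than per token. A small standalone example:

from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
from seqeval.scheme import IOB2

y_true = [["B-PER", "I-PER", "O", "B-LOC"]]
y_pred = [["B-PER", "I-PER", "O", "O"]]
print(precision_score(y_true, y_pred))  # 1.0: the one predicted entity is correct
print(recall_score(y_true, y_pred))     # 0.5: one of two gold entities was found
print(f1_score(y_true, y_pred))         # 0.666...
print(classification_report(y_true, y_pred, mode='strict', scheme=IOB2))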
def evaluate(args, model, tokenizer, label_list, pad_token_label_id):
    # CRF variant: scores Viterbi-decoded tag sequences on the dev set.
    eval_output_dir = args.output_dir
    if not os.path.exists(eval_output_dir):
        os.makedirs(eval_output_dir)
    eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, label_list,
                                           pad_token_label_id, data_type='dev')
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                                 batch_size=args.eval_batch_size, collate_fn=collate_fn)

    # Eval
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    true_labels = []
    predict_labels = []
    model.eval()
    pbar = ProgressBar(n_total=len(eval_dataloader), desc="Evaluating")
    for step, batch in enumerate(eval_dataloader):
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0],
                      "attention_mask": batch[1],
                      "labels": batch[3],
                      "input_lens": batch[4]}
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]
            batch_predict_labels = model.crf.decode(logits, inputs['attention_mask'])
        if args.n_gpu > 1:
            tmp_eval_loss = tmp_eval_loss.mean()  # average on multi-GPU parallel evaluation
        eval_loss += tmp_eval_loss.item()
        nb_eval_steps += 1
        batch_true_labels = batch[3].squeeze(0).cpu().numpy().tolist()
        for index, input_length in enumerate(batch[4]):  # batch[4] holds per-sentence lengths
            batch_true = [args.id2label.get(i)
                          for i in batch_true_labels[index][:input_length]][1:-1]
            batch_predict = [args.id2label.get(i)
                             for i in batch_predict_labels[index][:input_length]][1:-1]
            true_labels.append(batch_true)
            predict_labels.append(batch_predict)
        pbar(step)

    logger.info("\n")
    logger.info("average eval_loss: %s", str(eval_loss / nb_eval_steps))
    logger.info("accuracy: %s", str(accuracy_score(true_labels, predict_labels)))
    logger.info("p: %s", str(precision_score(true_labels, predict_labels)))
    logger.info("r: %s", str(recall_score(true_labels, predict_labels)))
    logger.info("f1: %s", str(f1_score(true_labels, predict_labels)))
    logger.info("classification report: ")
    logger.info(str(classification_report(true_labels, predict_labels, mode='strict', scheme=IOB2)))
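`ProgressBar` is another repo-local helper, constructed with `n_total`/`desc` and called once per step. A minimal stand-in with the same call pattern (hypothetical, not the repo's implementation):

import sys

class ProgressBar:
    def __init__(self, n_total, desc="", width=30):
        self.n_total = n_total
        self.desc = desc
        self.width = width

    def __call__(self, step):
        # Render a simple in-place text bar: "desc [###---] step/total".
        done = int(self.width * (step + 1) / self.n_total)
        bar = "#" * done + "-" * (self.width - done)
        sys.stdout.write(f"\r{self.desc} [{bar}] {step + 1}/{self.n_total}")
        sys.stdout.flush()
        if step + 1 == self.n_total:
            sys.stdout.write("\n")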