def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_file", default=None, type=str)
    parser.add_argument("--eval_file", default=None, type=str)
    parser.add_argument("--test_file", default=None, type=str)
    parser.add_argument("--inference_file", default=None, type=str)
    parser.add_argument("--model_name_or_path", default=None, type=str)
    parser.add_argument("--output_dir", default=None, type=str)

    ## Other parameters
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=256, type=int)
    parser.add_argument("--do_train", default=False, type=boolean_string)
    parser.add_argument("--do_eval", default=False, type=boolean_string)
    parser.add_argument("--do_test", default=False, type=boolean_string)
    parser.add_argument("--resume", default=False, type=boolean_string)
    parser.add_argument("--do_inference", default=False, type=boolean_string)
    parser.add_argument("--train_batch_size", default=8, type=int)
    parser.add_argument("--eval_batch_size", default=8, type=int)
    parser.add_argument("--learning_rate", default=3e-5, type=float)
    parser.add_argument("--num_train_epochs", default=10, type=float)
    parser.add_argument("--warmup_proprotion", default=0.1, type=float)
    parser.add_argument("--use_weight", default=1, type=int)
    parser.add_argument("--local_rank", type=int, default=-1)
    parser.add_argument("--seed", type=int, default=2019)
    parser.add_argument("--fp16", default=False)
    parser.add_argument("--loss_scale", type=float, default=0)
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
    parser.add_argument("--warmup_steps", default=0, type=int)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float)
    parser.add_argument("--max_steps", default=-1, type=int)
    parser.add_argument("--do_lower_case", action='store_true')
    parser.add_argument("--logging_steps", default=500, type=int)
    parser.add_argument("--clean", default=False, type=boolean_string, help="clean the output dir")
    parser.add_argument("--need_birnn", default=False, type=boolean_string)
    parser.add_argument("--rnn_dim", default=128, type=int)

    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_
    args.device = device
    n_gpu = torch.cuda.device_count()

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO)
    logger.info(f"device: {device} n_gpu: {n_gpu}")

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    # now_time = datetime.datetime.now().strftime('%Y-%m-%d_%H')
    # tmp_dir = args.output_dir + '/' + str(now_time) + '_ernie'
    # if not os.path.exists(tmp_dir):
    #     os.makedirs(tmp_dir)
    # args.output_dir = tmp_dir

    # Optionally wipe the output directory before a fresh (non-resumed) training run.
    if args.clean and args.do_train and not args.resume:
        if os.path.exists(args.output_dir):
            def del_file(path):
                ls = os.listdir(path)
                for i in ls:
                    c_path = os.path.join(path, i)
                    print(c_path)
                    if os.path.isdir(c_path):
                        del_file(c_path)
                        os.rmdir(c_path)
                    else:
                        os.remove(c_path)
            try:
                del_file(args.output_dir)
            except Exception as e:
                print(e)
                print('please remove the files of output dir and data.conf')
                exit(-1)

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.resume:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    if not os.path.exists(os.path.join(args.output_dir, "eval")):
        os.makedirs(os.path.join(args.output_dir, "eval"))

    writer = SummaryWriter(logdir=os.path.join(args.output_dir, "eval"), comment="Linear")

    processor = NerProcessor()
    label_list = processor.get_labels(args)
    num_labels = len(label_list)
    args.label_list = label_list

    # Reuse the label mapping from a previous run if it exists, otherwise build and persist it.
    if os.path.exists(os.path.join(args.output_dir, "label2id.pkl")):
        with open(os.path.join(args.output_dir, "label2id.pkl"), "rb") as f:
            label2id = pickle.load(f)
    else:
        label2id = {l: i for i, l in enumerate(label_list)}
        with open(os.path.join(args.output_dir, "label2id.pkl"), "wb") as f:
            pickle.dump(label2id, f)
    id2label = {value: key for key, value in label2id.items()}

    # Prepare optimizer and schedule (linear warmup and decay)
    if args.do_train:
        if args.resume:
            tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
            config = BertConfig.from_pretrained(args.output_dir, num_labels=num_labels)
            model = BERT_BiLSTM_CRF.from_pretrained(args.output_dir, config=config,
                                                    need_birnn=args.need_birnn, rnn_dim=args.rnn_dim)
        else:
            tokenizer = BertTokenizer.from_pretrained(
                args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
                do_lower_case=args.do_lower_case)
            config = BertConfig.from_pretrained(
                args.config_name if args.config_name else args.model_name_or_path,
                num_labels=num_labels)
            model = BERT_BiLSTM_CRF.from_pretrained(
                args.cache_dir if args.cache_dir else args.model_name_or_path,
                config=config, need_birnn=args.need_birnn, rnn_dim=args.rnn_dim)

        model.to(device)
        if n_gpu > 1:
            model = torch.nn.DataParallel(model)

        train_examples, train_features, train_data = get_Dataset(args, processor, tokenizer, mode="train")
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        if args.do_eval:
            eval_examples, eval_features, eval_data = get_Dataset(args, processor, tokenizer, mode="eval")

        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
                                                    num_training_steps=t_total)

        # Train!
logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_data)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Total optimization steps = %d", t_total) model.train() global_step = 0 tr_loss, logging_loss = 0.0, 0.0 best_f1 = 0.0 for ep in trange(int(args.num_train_epochs), desc="Epoch"): model.train() for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch outputs = model(input_ids, label_ids, segment_ids, input_mask) loss = outputs if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.logging_steps > 0 and global_step % args.logging_steps == 0: tr_loss_avg = (tr_loss-logging_loss)/args.logging_steps writer.add_scalar("Train/loss", tr_loss_avg, global_step) logging_loss = tr_loss if args.do_eval: all_ori_tokens_eval = [f.ori_tokens for f in eval_features] overall, by_type = evaluate(args, eval_data, model, id2label, all_ori_tokens_eval) # add eval result to tensorboard f1_score = overall.fscore writer.add_scalar("Eval/precision", overall.prec, ep) writer.add_scalar("Eval/recall", overall.rec, ep) writer.add_scalar("Eval/f1_score", overall.fscore, ep) # save the best performs model if f1_score >= best_f1: logger.info(f"----------the best f1 is {f1_score}---------") best_f1 = f1_score model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) # logger.info(f'epoch {ep}, train loss: {tr_loss}') # writer.add_graph(model) writer.close() # model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training # model_to_save.save_pretrained(args.output_dir) # tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model # torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) if args.do_test: # model = BertForTokenClassification.from_pretrained(args.output_dir) # model.to(device) label_map = {i : label for i, label in enumerate(label_list)} tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) #args = torch.load(os.path.join(args.output_dir, 'training_args.bin')) model = BERT_BiLSTM_CRF.from_pretrained(args.output_dir, need_birnn=args.need_birnn, rnn_dim=args.rnn_dim) model.to(device) test_examples, test_features, test_data = get_Pred_Dataset(args, processor, tokenizer, mode="test") logger.info("***** Running test *****") logger.info(f" Num examples = {len(test_examples)}") logger.info(f" Batch size = {args.eval_batch_size}") all_ori_tokens = [f.ori_tokens for f in test_features] all_ori_labels = [e.label.split(" ") for e in test_examples] test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size) model.eval() pred_labels = [] for b_i, (input_ids, input_mask, segment_ids, label_ids) in enumerate(tqdm(test_dataloader, 
desc="Predicting")): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model.predict(input_ids, segment_ids, input_mask) # logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2) # logits = logits.detach().cpu().numpy() for l in logits: pred_label = [] for idx in l: pred_label.append(id2label[idx]) pred_labels.append(pred_label) assert len(pred_labels) == len(all_ori_tokens) == len(all_ori_labels) print(len(pred_labels)) with open(os.path.join(args.output_dir, "token_labels_.txt"), "w", encoding="utf-8") as f: for ori_tokens, ori_labels,prel in zip(all_ori_tokens, all_ori_labels, pred_labels): for ot,ol,pl in zip(ori_tokens, ori_labels, prel): if ot in ["[CLS]", "[SEP]"]: continue else: f.write(f"{ot} {ol} {pl}\n") f.write("\n") if args.do_inference: label_map = {i : label for i, label in enumerate(label_list)} tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) #args = torch.load(os.path.join(args.output_dir, 'training_args.bin')) model = BERT_BiLSTM_CRF.from_pretrained(args.output_dir, need_birnn=args.need_birnn, rnn_dim=args.rnn_dim) model.to(device) test_examples, test_features, test_data = get_Pred_Dataset(args, processor, tokenizer, mode="inference") logger.info("***** Running test *****") logger.info(f" Num examples = {len(test_examples)}") logger.info(f" Batch size = {args.eval_batch_size}") all_ori_tokens = [f.ori_tokens for f in test_features] all_ori_labels = [e.label.split(" ") for e in test_examples] test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size) model.eval() pred_labels = [] for b_i, (input_ids, input_mask, segment_ids, label_ids) in enumerate(tqdm(test_dataloader, desc="Predicting")): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model.predict(input_ids, segment_ids, input_mask) # logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2) # logits = logits.detach().cpu().numpy() for l in logits: pred_label = [] for idx in l: pred_label.append(id2label[idx]) pred_labels.append(pred_label) assert len(pred_labels) == len(all_ori_tokens) == len(all_ori_labels) print(len(pred_labels)) with open(os.path.join(args.output_dir, "token_labels_.txt"), "w", encoding="utf-8") as f: for ori_tokens, ori_labels,prel in zip(all_ori_tokens, all_ori_labels, pred_labels): for ot,ol,pl in zip(ori_tokens, ori_labels, prel): if ot in ["[CLS]", "[SEP]"]: continue else: f.write(f"{ot} {ol} {pl}\n") f.write("\n")
def main():
    '''
    Parameters
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_file", default="./data/train.pkl", type=str)
    parser.add_argument("--eval_file", default="./data/dev.pkl", type=str)
    parser.add_argument("--test_file", default="./data/test.pkl", type=str)
    parser.add_argument("--model_type", default="roberta-large", type=str)
    parser.add_argument("--model_name_or_path", default="../language_model/roberta-large/", type=str)
    parser.add_argument("--do_lower_case", default=True, type=boolean_string)
    parser.add_argument("--output_dir", default="./state_models/roberta-large/", type=str)
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--do_train", default=False, type=boolean_string)
    parser.add_argument("--do_eval", default=False, type=boolean_string)
    parser.add_argument("--do_test", default=True, type=boolean_string)
    parser.add_argument("--seed", type=int, default=2020)
    parser.add_argument("--max_seq_length", default=64, type=int)
    parser.add_argument("--train_batch_size", default=8, type=int)
    parser.add_argument("--eval_batch_size", default=8, type=int)
    parser.add_argument("--num_train_epochs", default=5, type=float)
    parser.add_argument("--no_cuda", default=False, type=boolean_string,
                        help="Whether not to use CUDA when available")
    parser.add_argument("--save_check_point", default=4600, type=int)
    parser.add_argument("--eval_steps", default=100, type=int)
    parser.add_argument("--skip_eval_rate", default=0.30, type=float)
    parser.add_argument("--logging_steps", default=200, type=int)
    parser.add_argument("--warmup_steps", default=0, type=int)
    parser.add_argument("--warmup_proprotion", default=0.1, type=float)
    parser.add_argument("--max_steps", default=-1, type=int)
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
    parser.add_argument("--learning_rate", default=1e-5, type=float)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float)
    parser.add_argument("--overwrite_output_dir", default=True, type=boolean_string)

    args = parser.parse_args()

    # Setup CUDA, GPU
    if args.no_cuda:
        device = torch.device("cpu")
        args.n_gpu = 0
    else:
        device = torch.device("cuda")
        args.n_gpu = torch.cuda.device_count()
    args.device = device
    print("device: {0}, n_gpu: {1}".format(device, args.n_gpu))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    # Setup path to save model
    if args.overwrite_output_dir and args.do_train:
        if os.path.exists(args.output_dir):
            def del_file(path):
                ls = os.listdir(path)
                for i in ls:
                    c_path = os.path.join(path, i)
                    print("Clean the output path: {}".format(c_path))
                    if os.path.isdir(c_path):
                        del_file(c_path)
                        os.rmdir(c_path)
                    else:
                        os.remove(c_path)
            try:
                del_file(args.output_dir)
            except Exception as e:
                print(e)
                print('please remove the files of output dir and data.conf')
                exit(-1)

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Setup seed
    set_seed(args)
print("Training/evaluation parameters %s", args) ''' Data ''' processor = ATSAProcessor() label_list = processor.get_labels() args.num_labels = len(label_list) args.label_list = label_list ''' Train ''' if args.do_train: # -------------------- loading model -------------------- config = AutoConfig.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, num_labels=args.num_labels, output_hidden_states=True) tokenizer = AutoTokenizer.from_pretrained( args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case) model = RobertaForSequenceClassification_MERGE.from_pretrained( args.model_name_or_path, config=config) model.to(device) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # -------------------- loading data -------------------- train_examples, train_features, train_data = get_Dataset(args, processor, tokenizer, mode="train") train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) if args.do_eval: eval_examples, eval_features, eval_data = get_Dataset(args, processor, tokenizer, mode="eval") # t_total:模型参数更新次数,模型每个batch更新一次 # len(train_dataloader):训练集数据总batch数 # len(train-dataloader) // args.gradient_accumulation_steps:一个epoch模型参数更新次数 if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # -------------------- optimizer & schedule (linear warmup and decay) -------------------- no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # -------------------- Train -------------------- print("\n******************** Running Train ********************") print(" Num examples = {}".format(len(train_data))) print(" Num Epochs = {}".format(args.num_train_epochs)) print(" Total optimization steps = {}".format(t_total)) global_step = 0 best_f1 = 0.0 tr_loss, logging_loss = 0.0, 0.0 model.train() model.zero_grad() for ep in range(int(args.num_train_epochs)): for step, batch in enumerate(train_dataloader): model.train() batch = tuple(t.to(device) for t in batch) inputs = { 'input_ids': batch[0], 'attention_mask': batch[1], 'token_type_ids': batch[2], 'labels': batch[3] } if "roberta" in args.model_type: inputs["token_type_ids"] = None outputs = model(**inputs) loss = outputs[ 0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.logging_steps > 0 and global_step % args.logging_steps == 0: tr_loss_avg = ( tr_loss - logging_loss ) / args.logging_steps # 计算一个logging_step的平均loss logging_loss = tr_loss print( " 
epoch: {:d}, global_step: {:d}, train loss: {:.4f}". format(ep, global_step, tr_loss_avg)) # save model trained by train data & eval data if args.save_check_point == global_step: model_to_save = model.module if hasattr( model, 'module') else model model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model torch.save( args, os.path.join(args.output_dir, 'training_args.bin')) ''' Eval ''' if args.do_eval and global_step > args.skip_eval_rate * t_total and global_step % args.eval_steps == 0: eval_acc, eval_f1 = evaluate(args, model, eval_data, global_step, tr_loss_avg) # save the best performs model if eval_f1 > best_f1: print( "**************** the best f1 is {:.4f} ****************\n" .format(eval_f1)) best_f1 = eval_f1 model_to_save = model.module if hasattr( model, 'module') else model model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model torch.save( args, os.path.join(args.output_dir, 'training_args.bin')) ''' Eval at the end of the train ''' if args.do_eval and global_step > args.skip_eval_rate * t_total and global_step % args.eval_steps == 0: eval_acc, eval_f1 = evaluate(args, model, eval_data, global_step) # save the best performs model if eval_f1 > best_f1: print( "**************** the best f1 is {:.4f} ****************\n" .format(eval_f1)) best_f1 = eval_f1 model_to_save = model.module if hasattr(model, 'module') else model model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) ''' Test ''' if args.do_test: args = torch.load(os.path.join(args.output_dir, 'training_args.bin')) config = AutoConfig.from_pretrained(args.output_dir, num_labels=args.num_labels, output_hidden_states=True) tokenizer = AutoTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) model = RobertaForSequenceClassification_MERGE.from_pretrained( args.output_dir, config=config) model.to(device) test_examples, test_features, test_data = get_Dataset(args, processor, tokenizer, mode="test") test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size) print("\n******************** Running test ********************") print(" Num examples = {:d}".format(len(test_examples))) print(" Batch size = {:d}".format(args.eval_batch_size)) logits_res = None # 输出logits用于投票 pred_res = [] model.eval() for _, batch in enumerate(test_dataloader): batch = tuple(t.to(args.device) for t in batch) inputs = { 'input_ids': batch[0], 'attention_mask': batch[1], 'token_type_ids': batch[2] } if "roberta" in args.model_type: inputs["token_type_ids"] = None with torch.no_grad(): outputs = model(**inputs) logits = outputs[0] # collect logits output if logits_res is None: logits_res = logits else: logits_res = torch.cat((logits_res, logits), dim=0) # collect label output pred_label = predict(logits, args.label_list) # 测试时 logits 为outputs[0] pred_res.extend(pred_label.tolist()) # pred_res = np.array(pred_res) # ground_truth = np.array(pd.read_pickle("./data/dev.pkl")["label"].tolist()) # ans = f1_score(y_true=ground_truth, y_pred=pred_res, labels=[0, 1, 2], average="macro") # print(ans) logits_res = logits_res.detach().cpu().numpy() label_0 = 
logits_res[:, 0].tolist() label_1 = logits_res[:, 1].tolist() label_2 = logits_res[:, 2].tolist() logits_df = pd.DataFrame({ "label_0": label_0, "label_1": label_1, "label_2": label_2 }) logits_df.to_csv(args.model_type + "_logits_test.csv")
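
# Hypothetical helper (not part of the original script): a minimal sketch of the soft-voting
# step that the exported "<model_type>_logits_test.csv" files are intended to feed. It simply
# sums the per-class logits saved by several runs and takes the argmax; the column names
# label_0/label_1/label_2 match the DataFrame written above.
def soft_vote(logits_csv_paths):
    import numpy as np
    import pandas as pd

    summed = None
    for path in logits_csv_paths:
        df = pd.read_csv(path, index_col=0)
        scores = df[["label_0", "label_1", "label_2"]].to_numpy()
        summed = scores if summed is None else summed + scores
    # Summing and averaging give the same argmax; return predicted class ids per example.
    return np.argmax(summed, axis=1)

# Example (file names are placeholders):
# preds = soft_vote(["roberta-large_logits_test.csv", "bert-large_logits_test.csv"])
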
def main(): parser = argparse.ArgumentParser() parser.add_argument("--dir_paths", default=None, type=str, nargs="+") parser.add_argument("--model_name_or_path", default="biobert_v1.1_pubmed", type=str) parser.add_argument("--max_seq_length", default=128, type=int) parser.add_argument("--do_train", default=True, type=boolean_string) parser.add_argument("--do_eval", default=True, type=boolean_string) parser.add_argument("--from_tf", default=False, type=boolean_string) parser.add_argument("--train_batch_size", default=32, type=int) parser.add_argument("--eval_batch_size", default=32, type=int) parser.add_argument("--bert_learning_rate", default=5e-5, type=float) parser.add_argument("--not_bert_learning_rate", default=5e-4, type=float) parser.add_argument("--num_train_epochs", default=10, type=float) parser.add_argument("--warmup_proprotion", default=0.2, type=float) parser.add_argument("--seed", type=int, default=2020) parser.add_argument('--gradient_accumulation_steps', type=int, default=1) parser.add_argument("--adam_epsilon", default=1e-8, type=float) parser.add_argument("--max_steps", default=-1, type=int) parser.add_argument("--do_lower_case", action='store_true') # BERT后接LSTM parser.add_argument("--need_birnn", default=False, type=boolean_string) parser.add_argument("--rnn_dim", default=128, type=int) # 增加charCNN parser.add_argument("--need_charcnn", default=False, type=boolean_string) parser.add_argument("--share_cnn", default=True, type=boolean_string) parser.add_argument("--char_embed", default=50, type=int) parser.add_argument("--char_out_dim", default=300, type=int) # 增加CNN parser.add_argument("--need_cnn", default=False, type=boolean_string) parser.add_argument("--cnn_out_dim", default=300, type=int) # 增加SAC parser.add_argument("--need_sac", action="store_true") parser.add_argument("--tag_num", default=2, type=int) parser.add_argument("--sac_factor", default=100) # 调试模式 parser.add_argument("--debug", action='store_true') args = parser.parse_args() logger.info("********%s*********", "参数设置") logger.info("args info:") logger.info(args.__dict__) device = torch.device("cuda") if torch.cuda.is_available() else "cpu" # os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_ args.device = device args.n_gpu = torch.cuda.device_count() if torch.cuda.is_available() else 0 logger.info(f"device: {device} n_gpu: {args.n_gpu}") set_seed() logger.info(f"seed: {args.seed}") logger.info("********%s*********", "读取数据") data_paths = args.dir_paths task_names = list(map(lambda x: x.split(os.path.sep)[-1], data_paths)) logger.info("task names: %s", str(task_names)) logger.info("task num: %d", len(task_names)) processor = NerProcessor(args.debug) tasks = [{} for i in range(len(task_names))] for i in range(len(task_names)): tasks[i]["task_id"] = i tasks[i]["task_name"] = task_names[i] tasks[i]["train_file"] = os.path.join(data_paths[i], "train_devel.tsv") tasks[i]["eval_file"] = os.path.join(data_paths[i], "test.tsv") tasks[i]["label_list"] = processor.get_labels(data_paths[i]) tasks[i]["label2id"] = {l: i for i, l in enumerate(tasks[i]["label_list"])} tasks[i]["id2label"] = {value: key for key, value in tasks[i]["label2id"].items()} for i in range(len(tasks)): logger.info("tasks info %s", str(tasks[i])) logger.info("********%s*********", "模型加载") tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path, do_lower_case=args.do_lower_case) if args.need_charcnn: char2id = {'#': 0, 'I': 1, 'm': 2, 'u': 3, 'n': 4, 'o': 5, 'h': 6, 'i': 7, 's': 8, 't': 9, 'c': 10, 'e': 11, 'a': 12, 'l': 13, 'g': 14, 'w': 15, 'p': 
16, 'v': 17, 'f': 18, 'r': 19, 'S': 20, '-': 21, '1': 22, '0': 23, '9': 24, 'd': 25, ',': 26, 'H': 27, 'M': 28, 'B': 29, '4': 30, '5': 31, '(': 32, '%': 33, ')': 34, 'y': 35, 'k': 36, 'x': 37, 'b': 38, '.': 39, 'C': 40, 'E': 41, '8': 42, '6': 43, 'V': 44, 'j': 45, '2': 46, 'R': 47, 'N': 48, 'A': 49, 'D': 50, 'z': 51, 'O': 52, '<': 53, 'q': 54, 'X': 55, 'F': 56, '3': 57, 'G': 58, 'P': 59, ':': 60, '?': 61, 'K': 62, 'W': 63, 'T': 64, "'": 65, 'J': 66, 'L': 67, 'U': 68, '+': 69, ';': 70, '7': 71, '/': 72, 'Z': 73, '=': 74, 'Y': 75, 'Q': 76, '[': 77, '"': 78, '>': 79, '*': 80, ']': 81, '&': 82, '$': 83, '_': 84} else: char2id = None config = BertConfig.from_pretrained(args.model_name_or_path) model = BERT_BiLSTM_CRF.from_pretrained(args.model_name_or_path, config=config, char_vocab_size=len(char2id) if char2id is not None else 0, tag_num=args.tag_num, char_embedding_dim=args.char_embed, char_out_dim=args.char_out_dim, task_infos=tasks, need_cnn=args.need_cnn, cnn_out_dim=args.cnn_out_dim, need_sac=args.need_sac, sac_factor=args.sac_factor, need_birnn=args.need_birnn, need_charcnn=args.need_charcnn, share_cnn=args.share_cnn, rnn_dim=args.rnn_dim, from_tf=args.from_tf, device=device) if args.do_train: model.to(device) logger.info("********%s*********", "开始读取训练集数据") for i in range(len(tasks)): tasks[i]["train_examples"], tasks[i]["train_features"], tasks[i]["train_data"] = \ get_Dataset(args, tasks[i], processor, tokenizer, char2id, mode="train") train_sampler = RandomSampler(tasks[i]["train_data"]) tasks[i]["train_dataloader"] = DataLoader(tasks[i]["train_data"], sampler=train_sampler, batch_size=args.train_batch_size) tasks[i]["train_ori_words"] = [f.ori_words for f in tasks[i]["train_features"]] # print(tasks[i]["train_ori_words"]) if args.do_eval: logger.info("********%s*********", "开始读取验证集数据") for i in range(len(tasks)): tasks[i]["eval_examples"], tasks[i]["eval_features"], tasks[i]["eval_data"] = get_Dataset(args, tasks[i], processor, tokenizer, char2id, mode="eval") tasks[i]["eval_ori_words"] = [f.ori_words for f in tasks[i]["eval_features"]] # t_total = num_train_epochs * len(train_dataloader) / gradient_accumulation_steps # t_total表示总共需要更新的次数 batch_num = sum(list(map(lambda task: len(task["train_dataloader"]), tasks))) t_total = args.num_train_epochs * batch_num // args.gradient_accumulation_steps no_decay = ['bias', 'LayerNorm.weight'] # 最原始的设置 # optimizer_grouped_parameters = [ # {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, # {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} # ] # 为bert和非bert设置不同的学习率 optimizer_grouped_parameters = [ # in bert {'params': [p for n, p in model.named_parameters() if "bert" in n and not any(nd in n for nd in no_decay)], 'weight_decay': 0.01, "lr": args.bert_learning_rate}, {'params': [p for n, p in model.named_parameters() if "bert" in n and any(nd in n for nd in no_decay)], 'weight_decay': 0.0, "lr": args.bert_learning_rate}, {'params': [p for n, p in model.named_parameters() if "bert" not in n], 'weight_decay': 0.0, "lr": args.not_bert_learning_rate} ] # 改用BertAdam,这个优化器尽可能地模拟了原始tensorflow版bert的优化器 optimizer = BertAdam(optimizer_grouped_parameters, warmup=args.warmup_proprotion, t_total=t_total) # 开始训练 logger.info("********%s*********", "开始训练") logger.info("# of tasks: %d", len(tasks)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Total Optimization Steps = %d", t_total) for task in tasks: logger.info(" 
Task name: %s Num Examples %d", task["task_name"], len(task["train_dataloader"])) step = 0 total_loss = 0 update_step = 0 for ep in range(1, int(args.num_train_epochs) + 1): model.train() task_indexs = [i for i in range(len(tasks))] iter_train_dataloaders = list(map(lambda x: iter(x["train_dataloader"]), tasks)) while True: if len(task_indexs) == 0: break task_id = random.choice(task_indexs) task_id = torch.tensor(task_id, dtype=torch.long).to(device) batch = next(iter_train_dataloaders[task_id], None) if batch is None: task_indexs.remove(task_id) continue batch = tuple(t.to(device) for t in batch) if args.need_charcnn: input_word_ids, input_mask, label_ids, label_mask, char_ids = batch else: input_word_ids, input_mask, label_ids, label_mask = batch char_ids = None if args.need_sac: O_label = label_ids == tasks[task_id]["label2id"]["O"] pad_label = label_ids == 0 tag_mask = 1 - (O_label + pad_label).long().to(device) else: tag_mask = None loss = model(task_id, input_word_ids, input_mask, label_ids, char_ids, sac_mask=tag_mask) loss.backward() total_loss += loss.item() step += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() model.zero_grad() update_step = update_step + 1 if update_step % 100 == 0: logger.info("in ep %d, choose task: %d, loss %f", ep, task_id, loss) if args.do_eval: for task in tasks: logger.info("Evalating task %s, Train set", task["task_name"]) train_filename, test_filename = None, None if ep == args.num_train_epochs: train_filename = task["task_name"] + ".train.output.txt" test_filename = task["task_name"] + ".test.output.txt" # evaluate(args, task["task_id"], task["train_data"], model, task["id2label"], task["train_ori_words"], file_name=train_filename) logger.info("Evalating task %s, Eval set", task["task_name"]) evaluate(args, task["task_id"], task["eval_data"], model, task["id2label"], task["eval_ori_words"], file_name=test_filename)
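
# Usage sketch (not from the original repo; the script name, dataset directories, and the exact
# strings accepted by the boolean_string helper are placeholders/assumptions). Each directory
# passed to --dir_paths is treated as one task and must contain the train_devel.tsv / test.tsv
# files the loading loop above expects, e.g.:
#
#   python run_multitask_ner.py \
#       --dir_paths data/NCBI-disease data/BC5CDR-chem \
#       --model_name_or_path biobert_v1.1_pubmed \
#       --do_train True --do_eval True --need_birnn True --need_charcnn True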