def start():
    # Prefer cached data when it exists
    if not os.path.exists(args.TRAIN) or not os.path.exists(args.VALID):
        produce_data(user_define=USER_DEFINE)
    if os.path.exists(args.TRAIN_CACHE):
        train_iter, num_train_steps = torch.load(args.TRAIN_CACHE)
    else:
        train_iter, num_train_steps = create_batch_iter("train")
    if os.path.exists(args.VALID_CACHE):
        eval_iter = torch.load(args.VALID_CACHE)
    else:
        eval_iter = create_batch_iter("dev")

    epoch_size = num_train_steps * args.train_batch_size * args.gradient_accumulation_steps / args.num_train_epochs
    pbar = ProgressBar(epoch_size=epoch_size, batch_size=args.train_batch_size)

    model = Bert_CRF.from_pretrained(args.bert_model, num_tag=len(args.labels))
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name)

    fit(model=model,
        training_iter=train_iter,
        eval_iter=eval_iter,
        num_epoch=args.num_train_epochs,
        pbar=pbar,
        num_train_steps=num_train_steps,
        verbose=1)
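This branch only reads args.TRAIN_CACHE and args.VALID_CACHE; the writing side is not shown anywhere in these scripts. A minimal sketch of how the caches could be produced, assuming the objects returned by create_batch_iter are picklable (warm_caches is a hypothetical helper, not part of the project):

def warm_caches():
    # Build the iterators once and persist them so later runs can skip
    # tokenization and feature conversion. Assumes create_batch_iter's
    # return values can be serialized by torch.save.
    train_iter, num_train_steps = create_batch_iter("train")
    torch.save((train_iter, num_train_steps), args.TRAIN_CACHE)
    eval_iter = create_batch_iter("dev")
    torch.save(eval_iter, args.VALID_CACHE)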
def predict_k_fold():
    parser = HfArgumentParser(TrainingArguments)
    args: TrainingArguments = parser.parse_args_into_dataclasses()[0]
    logger.info(f"Training arguments: {args}")

    # Prepare devices
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    # args.n_gpu = 1
    logger.info(f"device: {device}, n_gpu: {args.n_gpu}")
    set_seed(args)

    if "_" not in args.output_dir:
        # Pick the most recent training run
        args.output_dir = args.output_dir + sorted(os.listdir(args.output_dir))[-1]
    print(f"model {args.output_dir} used for prediction")

    test_dataloader, examples = create_batch_iter(args, "test")
    # tokenizer = AutoTokenizer.from_pretrained(args.model)
    # bert_config = AutoConfig.from_pretrained(args.model, return_dict=True)
    model = load_model(args)
    model.to(device)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    model.eval()
    with torch.no_grad():
        test_logits = []
        for step, batch in enumerate(tqdm(test_dataloader, desc="test", ascii=True)):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids = batch
            outputs = model(input_ids, input_mask, segment_ids)
            logits = torch.max(outputs.logits, dim=1)[1]
            if device.type == "cuda":
                logits = logits.cpu().numpy().astype(int)
            else:
                logits = logits.numpy()
            test_logits.extend(logits.tolist())

    pred_dt = {}
    output_path = args.output_dir + "/test.csv"
    with open(output_path, "w", encoding="utf-8") as fw:
        for i, (exp, label) in enumerate(zip(examples, test_logits), start=1):
            print(f"write line: {i}")
            exp: InputExample = exp
            _id = exp.label  # the test set stores the example id in the label field
            question = exp.text_a
            context = exp.text_b
            pred_dt[_id] = label
            fw.write(",".join([_id, str(label)]) + "\n")
    logger.info(f"output path: {output_path}")
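Despite its name, predict_k_fold() runs a single checkpoint. If predictions from several fold checkpoints were collected (e.g. one pred_dt dict per fold), they could be merged by majority vote; a minimal sketch with hypothetical names (aggregate_folds, fold_preds):

from collections import Counter

def aggregate_folds(fold_preds):
    # fold_preds: list of {example_id: label} dicts, one per fold checkpoint.
    merged = {}
    for _id in fold_preds[0]:
        votes = Counter(preds[_id] for preds in fold_preds)
        # On ties, most_common keeps the first label encountered.
        merged[_id] = votes.most_common(1)[0][0]
    return merged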
def start():
    train_iter, num_train_steps = create_batch_iter("train", args.TRAIN_PATH)
    eval_iter = create_batch_iter("dev", args.VALID_PATH)

    epoch_size = num_train_steps * args.train_batch_size * args.gradient_accumulation_steps / args.num_train_epochs
    print(f'epoch_size = {epoch_size}')
    pbar = ProgressBar(epoch_size=epoch_size, batch_size=args.train_batch_size)

    model = Bert_CRF.from_pretrained(args.bert_model, num_tag=len(args.labels))
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name)

    fit(model=model,
        training_iter=train_iter,
        eval_iter=eval_iter,
        num_epoch=args.num_train_epochs,
        pbar=pbar,
        num_train_steps=num_train_steps,
        verbose=1)
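For intuition on the epoch_size expression used throughout these scripts: if num_train_steps was derived as len(train_examples) // (train_batch_size * gradient_accumulation_steps) * num_train_epochs (an assumption about create_batch_iter), then multiplying back by the batch size and accumulation steps and dividing by the epoch count recovers roughly the number of training examples per epoch. For example, with 80,000 examples, batch size 32, 2 accumulation steps, and 4 epochs: num_train_steps = 80000 / (32 * 2) * 4 = 5000, so epoch_size = 5000 * 32 * 2 / 4 = 80000, one epoch's worth of examples, which is what ProgressBar tracks.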
def start():
    train_iter, num_train_steps = create_batch_iter("train")
    eval_iter = create_batch_iter("dev")

    epoch_size = num_train_steps * args.train_batch_size * args.gradient_accumulation_steps / args.num_train_epochs
    pbar = ProgressBar(epoch_size=epoch_size, batch_size=args.train_batch_size)

    model = QaExtract.from_pretrained(args.bert_model)
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name)

    fit(model=model,
        training_iter=train_iter,
        eval_iter=eval_iter,
        num_epoch=args.num_train_epochs,
        pbar=pbar,
        num_train_steps=num_train_steps,
        verbose=1)
def start():
    parser = argparse.ArgumentParser()
    parser.add_argument("--do_not_train_ernie", default=False, action='store_true')
    parser.add_argument("--do_CRF", default=False, action='store_true')
    arg = parser.parse_args()
    args.do_not_train_ernie = arg.do_not_train_ernie
    args.do_CRF = arg.do_CRF

    produce_data()
    train_iter, num_train_steps = create_batch_iter("train")
    eval_iter = create_batch_iter("dev")

    epoch_size = num_train_steps * args.train_batch_size * args.gradient_accumulation_steps / args.num_train_epochs
    pbar = ProgressBar(epoch_size=epoch_size, batch_size=args.train_batch_size)

    if args.load_weight:
        model = load_model(args.output_dir)
    else:
        model = Bert_CRF.from_pretrained(args.bert_model, num_tag=len(args.labels))
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name)

    fit(model=model,
        training_iter=train_iter,
        eval_iter=eval_iter,
        num_epoch=args.num_train_epochs,
        pbar=pbar,
        num_train_steps=num_train_steps,
        verbose=1)
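load_model (from util.model_util) is used here and in the inference script below, but its body is not shown. A minimal sketch under the assumption that training saved the weights as pytorch_model.bin in output_dir; load_model_sketch is a hypothetical stand-in, not the project's actual helper:

def load_model_sketch(output_dir):
    # Rebuild the architecture, then restore the trained weights
    # from the checkpoint directory.
    model = Bert_CRF.from_pretrained(args.bert_model, num_tag=len(args.labels))
    state = torch.load(os.path.join(output_dir, "pytorch_model.bin"), map_location="cpu")
    model.load_state_dict(state)
    return model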
from Io.data_loader import create_batch_iter
from preprocessing.data_processor import produce_data
import torch
import os
import json
import config.args as args
from util.model_util import load_model

args.do_inference = True
produce_data()
test_iter = create_batch_iter("inference")

epoch_size = args.train_batch_size * args.gradient_accumulation_steps / args.num_train_epochs
model = load_model(args.output_dir)
num_epoch = args.num_train_epochs

device = torch.device(args.device if torch.cuda.is_available() and not args.no_cuda else "cpu")

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [{
    'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay': 0.01
}, {
    'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
    'weight_decay': 0.0  # value truncated in the source; 0.0 matches the identical grouping in main() below
}]
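The grouping above excludes biases and LayerNorm parameters from weight decay, a standard recipe for transformer fine-tuning. main() below feeds the same groups to AdamW, so the matrix weights train with weight_decay=0.01 while the no-decay group uses 0.0, both sharing one learning rate:

from transformers import AdamW

# Only the first parameter group is decayed; both share lr.
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)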
def main():
    parser = HfArgumentParser(TrainingArguments)
    args: TrainingArguments = parser.parse_args_into_dataclasses()[0]

    # Prepare output directory
    if not args.do_eval:
        args.output_dir = os.path.join(
            args.output_dir,
            list(filter(None, args.model.strip().split("/")))[-1] + "-" +
            datetime.now().strftime("%Y%m%d_%H%M%S"))
        os.mkdir(args.output_dir)

    logger = init_logger("souhu-text-match-2021", args.output_dir)
    logger.info(f"Output dir: {args.output_dir}")

    # Prepare devices
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    # args.n_gpu = 1
    logger.info(f"device: {device}, n_gpu: {args.n_gpu}")
    set_seed(args)
    logger.info(f"Training arguments: {args}")

    if not args.do_eval:
        train_dataloader, num_train_steps = create_batch_iter(args, "train", logger)
    eval_dataloader, _ = create_batch_iter(args, "dev", logger)

    tokenizer = AutoTokenizer.from_pretrained(args.model)
    # bert_config = AutoConfig.from_pretrained(args.model, return_dict=True)
    model = BertForSequenceClassification.from_pretrained(args.model)
    model.to(device)

    if args.do_eval:
        # model.eval()
        # result = do_eval(model, eval_dataloader, device, -1, -1)
        # logger.info("***** Eval results *****")
        # for key in sorted(result.keys()):
        #     logger.info("  %s = %s", key, str(result[key]))
        pass
    else:
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        }, {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
        # scheduler = lr_scheduler.StepLR(optimizer, 2)
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2)

        pgd = PGD(model)
        K = 3

        # Train and evaluate
        global_step = 0
        best_dev_f1, best_epoch = float("-inf"), float("-inf")
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        train_loss2plot = []
        train_acc2plot = []
        train_f1_2plot = []
        eval_loss2plot = []
        eval_acc2plot = []
        eval_f1_2plot = []

        for epoch_ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0.
            train_logits = []
            train_labels = []
            model.train()
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc=f"Epoch {epoch_ + 1} iteration", ascii=True)):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, labels = batch
                outputs = model(input_ids, input_mask, segment_ids, labels=labels, return_dict=True)
                train_logits.append(outputs.logits)
                train_labels.append(labels)
                loss = outputs.loss
                if args.n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()  # backward pass to get the ordinary gradients

                if args.do_adversarial:
                    # Adversarial training (PGD)
                    pgd.backup_grad()
                    for t in range(K):
                        # Add an adversarial perturbation to the embeddings;
                        # on the first attack, back up param.data
                        pgd.attack(is_first_attack=(t == 0))
                        if t != K - 1:
                            model.zero_grad()
                        else:
                            pgd.restore_grad()
                        adv_outputs = model(input_ids, input_mask, segment_ids,
                                            labels=labels, return_dict=True)
                        adv_loss = adv_outputs.loss
                        if args.n_gpu > 1:
                            adv_loss = adv_loss.mean()
                        # Backpropagate, accumulating the adversarial gradients
                        # on top of the ordinary ones
                        adv_loss.backward()
                    pgd.restore()  # restore the embedding parameters

                # Gradient descent: update the parameters
                optimizer.step()
                optimizer.zero_grad()

                tr_loss += loss.item()
                global_step += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    pass
                if (global_step + 1) % args.eval_step == 0:
                    logger.info("***** Running evaluation *****")
                    logger.info("  Process = {} iter {} step".format(epoch_, global_step))
                    logger.info("  Batch size = %d", args.eval_batch_size)
                    logger.info(f"next step learning rate = {optimizer.param_groups[0]['lr']:.8f}")

                    all_train_logits = torch.cat(train_logits, dim=0).cpu()
                    all_train_labels = torch.cat(train_labels, dim=0).cpu()
                    acc, prf = evaluate(all_train_logits, all_train_labels)
                    train_loss2plot.append(loss.item())
                    train_acc2plot.append(acc)
                    train_f1_2plot.append(prf[2])

                    loss = tr_loss / (step + 1)
                    result = do_eval(args, model, eval_dataloader, device, epoch_,
                                     args.num_train_epochs, "eval", logger)
                    scheduler.step(result["eval_loss"])
                    eval_loss2plot.append(result["eval_loss"])
                    eval_acc2plot.append(result["eval_f1"] if False else result["eval_acc"])
                    eval_f1_2plot.append(result["eval_f1"])
                    result['global_step'] = global_step
                    result['train_loss'] = loss
                    result_to_file(result, output_eval_file, logger)

                    save_model = False
                    if not args.do_eval and result['eval_f1'] > best_dev_f1:
                        best_dev_f1 = result['eval_f1']
                        best_epoch = epoch_ + 1
                        save_model = True

                    if save_model:
                        logger.info("***** Save model *****")
                        best_model = model
                        model_to_save = best_model.module if hasattr(best_model, 'module') else best_model
                        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
                        output_config_file = os.path.join(args.output_dir, "config.json")
                        torch.save(model_to_save.state_dict(), output_model_file)
                        model_to_save.config.to_json_file(output_config_file)
                        tokenizer.save_vocabulary(args.output_dir)

        logger.info(f"best epoch: {best_epoch}, best eval f1: {best_dev_f1:.4f}")
        loss_acc_plot([
            train_loss2plot, train_acc2plot, train_f1_2plot,
            eval_loss2plot, eval_acc2plot, eval_f1_2plot
        ], os.path.join(args.output_dir, "loss_acc_f1.png"))
        logger.info(f"output dir: {args.output_dir}")
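The loop above relies on a PGD helper exposing backup_grad / attack / restore_grad / restore, whose implementation is not shown. A minimal sketch consistent with those calls, following the widely used PGD-for-embeddings recipe (perturb the embedding matrix along its gradient, projected into an epsilon-ball around the clean weights); the emb_name, epsilon, and alpha defaults are assumptions:

class PGD:
    def __init__(self, model, emb_name='word_embeddings', epsilon=1.0, alpha=0.3):
        self.model = model
        self.emb_name = emb_name  # substring that identifies embedding params
        self.epsilon = epsilon    # radius of the perturbation ball
        self.alpha = alpha        # step size per attack
        self.emb_backup = {}
        self.grad_backup = {}

    def attack(self, is_first_attack=False):
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                if is_first_attack:
                    self.emb_backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    # Step along the gradient, then project back into the ball
                    param.data.add_(self.alpha * param.grad / norm)
                    param.data = self.project(name, param.data)

    def project(self, param_name, param_data):
        r = param_data - self.emb_backup[param_name]
        if torch.norm(r) > self.epsilon:
            r = self.epsilon * r / torch.norm(r)
        return self.emb_backup[param_name] + r

    def restore(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                param.data = self.emb_backup[name]
        self.emb_backup = {}

    def backup_grad(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None:
                self.grad_backup[name] = param.grad.clone()

    def restore_grad(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None:
                param.grad = self.grad_backup[name]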
def predict():
    parser = HfArgumentParser(TrainingArguments)
    args: TrainingArguments = parser.parse_args_into_dataclasses()[0]
    logger = init_logger("souhu-text-match-2021", "output/logs/")
    logger.info(f"Test arguments: {args}")

    # Prepare devices
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    # args.n_gpu = 1
    logger.info(f"device: {device}, n_gpu: {args.n_gpu}")
    set_seed(args)

    test_dataloader = create_batch_iter(args, "test", logger)
    # Pick the most recent training run
    args.output_dir = args.output_dir + sorted(os.listdir(args.output_dir))[-1]
    logger.info(f"model {args.output_dir} used for prediction")

    # The tokenizer was not saved with the checkpoint, so load the original one
    tokenizer = RoFormerTokenizer.from_pretrained(
        "/home/zhuminghao/work/model/pt/chinese_roformer_base")
    model = RoFormerForSequenceClassification.from_pretrained(args.output_dir)
    model.to(device)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    model.eval()
    with torch.no_grad():
        test_logits = []
        ids = []
        for step, batch in enumerate(tqdm(test_dataloader, desc="test", ascii=True)):
            sources, targets, bt_ids = batch
            inputs = list(zip(sources, targets))
            # extend, not append: keep one id per example so ids stays
            # aligned with test_logits below
            ids.extend(bt_ids)
            pt_batch = tokenizer(inputs,
                                 padding=True,
                                 truncation="longest_first",
                                 max_length=args.max_seq_length,
                                 return_tensors="pt")
            pt_batch = pt_batch.to(device)
            outputs = model(**pt_batch, return_dict=True)
            logits = torch.max(outputs.logits, dim=1)[1]
            if device.type == "cuda":
                logits = logits.cpu().numpy().astype(int)
            else:
                logits = logits.numpy()
            test_logits.extend(logits.tolist())

    output_path = args.output_dir + "/test.csv"
    with open(output_path, "w", encoding="utf-8") as fw:
        for _id, label in zip(ids, test_logits):
            fw.write(",".join([_id, str(label)]) + "\n")
    logger.info(f"output path: {output_path}")
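set_seed is called in main(), predict(), and predict_k_fold() but is not defined in these scripts. A minimal sketch, assuming the parsed arguments carry a seed field (as HfArgumentParser-style TrainingArguments conventionally do); set_seed_sketch is a hypothetical stand-in:

import random
import numpy as np

def set_seed_sketch(args):
    # Pin the Python, NumPy, and torch RNGs so runs are repeatable.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)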