def evaluate(args, model, tokenizer, labels, pad_token_label_id, best, mode, prefix="",
             verbose=True):
    """Run ``model`` over the ``mode`` split and score it at chunk level.

    Args:
        args: Run configuration. Reads ``per_gpu_eval_batch_size``, ``n_gpu``,
            ``local_rank``, ``device``, ``model_type`` and ``data_dir``;
            writes ``eval_batch_size``.
        model: Model called as ``model(**inputs)``; the first two outputs are
            taken as ``(loss, logits)``.
        tokenizer: Forwarded to ``load_and_cache_examples``.
        labels: Sequence of label strings; position ``i`` names label id ``i``.
        pad_token_label_id: Label id marking positions to ignore when scoring.
        best: ``[precision, recall, f1]`` of the best run so far; only
            ``best[-1]`` is compared against the new F1.
        mode: Dataset split name forwarded to ``load_and_cache_examples``.
        prefix: Text inserted in the log banners.
        verbose: When true, also log dataset size and batch size.

    Returns:
        ``(results, preds_list, best, is_updated)`` where ``results`` is a dict
        of loss / precision / recall / f1 plus the best-so-far values,
        ``preds_list`` holds the predicted label strings per example (ignored
        positions removed), ``best`` is the possibly replaced best triple and
        ``is_updated`` says whether this run beat ``best[-1]``.

    Raises:
        ZeroDivisionError: If the dataloader yields no batch.
    """
    eval_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode=mode)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_dataset)
    else:
        eval_sampler = DistributedSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate; skip the wrap when the caller already passed a
    # DataParallel model so it is not wrapped twice.
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    logger.info("***** Running evaluation %s *****", prefix)
    if verbose:
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)

    eval_loss = 0.0
    nb_eval_steps = 0
    # Per-batch arrays are collected and joined once after the loop; growing a
    # single array with np.append would copy everything on every step.
    pred_batches = []
    label_batches = []
    model.eval()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3],
            }
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = (
                    batch[2] if args.model_type in ["bert", "xlnet"] else None
                )
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            if args.n_gpu > 1:
                # Reduce a multi-element loss to a scalar before .item().
                tmp_eval_loss = tmp_eval_loss.mean()

            eval_loss += tmp_eval_loss.item()
        nb_eval_steps += 1

        # Taking the argmax per batch gives the same ids as taking it over the
        # concatenated logits, without keeping all logits in memory.
        pred_batches.append(np.argmax(logits.detach().cpu().numpy(), axis=2))
        label_batches.append(inputs["labels"].detach().cpu().numpy())

    eval_loss = eval_loss / nb_eval_steps
    preds = np.concatenate(pred_batches, axis=0)
    out_label_ids = np.concatenate(label_batches, axis=0)

    label_map = dict(enumerate(labels))
    preds_list = []
    out_id_list = []
    preds_id_list = []
    for gold_row, pred_row in zip(out_label_ids, preds):
        # Keep only positions whose gold label is not the ignore id.
        keep = gold_row != pad_token_label_id
        kept_pred_ids = list(pred_row[keep])
        out_id_list.append(list(gold_row[keep]))
        preds_id_list.append(kept_pred_ids)
        preds_list.append([label_map[pred_id] for pred_id in kept_pred_ids])

    # The tag mapping depends only on data_dir, so build it once.
    # NOTE(review): get_chunks / tag_to_id are defined elsewhere; presumably
    # they turn a tag-id sequence into a collection of entity chunks - confirm.
    tag_map = tag_to_id(args.data_dir)
    correct_preds, total_correct, total_preds = 0., 0., 0.
    for ground_truth_id, predicted_id in zip(out_id_list, preds_id_list):
        lab_chunks = set(get_chunks(ground_truth_id, tag_map))
        lab_pred_chunks = set(get_chunks(predicted_id, tag_map))

        correct_preds += len(lab_chunks & lab_pred_chunks)
        total_preds += len(lab_pred_chunks)
        total_correct += len(lab_chunks)

    # With no correct chunk all three scores are defined as 0, which also
    # avoids dividing by an empty prediction or gold set.
    p = correct_preds / total_preds if correct_preds > 0 else 0
    r = correct_preds / total_correct if correct_preds > 0 else 0
    new_F = 2 * p * r / (p + r) if correct_preds > 0 else 0

    is_updated = False
    if new_F > best[-1]:
        # Rebind rather than mutate so the caller's list is left untouched.
        best = [p, r, new_F]
        is_updated = True

    results = {
        "loss": eval_loss,
        "precision": p,
        "recall": r,
        "f1": new_F,
        "best_precision": best[0],
        "best_recall": best[1],
        "best_f1": best[-1],
    }

    logger.info("***** Eval results %s *****", prefix)
    for key in sorted(results.keys()):
        logger.info(" %s = %s", key, str(results[key]))

    return results, preds_list, best, is_updated
def main():
    """Parse command-line options, build the model and write dev-set predictions.

    Steps, in order: create the output and data directories, set up the
    logger and device, seed the RNGs, build ``Model(args)``, generate the
    ``dev`` data with ``make_data``, run ``model.test`` over the evaluation
    dataloader and hand the predictions to ``write_results``.
    """

    def _str2bool(value):
        """Convert a command-line string to a bool.

        ``type=bool`` must not be used with argparse: ``bool("False")`` is
        True because every non-empty string is truthy.
        """
        if isinstance(value, bool):
            return value
        text = value.strip().lower()
        if text in ("true", "t", "yes", "y", "on", "1"):
            return True
        if text in ("false", "f", "no", "n", "off", "0", ""):
            return False
        raise argparse.ArgumentTypeError(
            "Boolean value expected, got {!r}.".format(value))

    # default directory for this run's outputs (timestamped)
    output_dir = "results/{:%Y%m%d_%H%M%S}/".format(datetime.now())

    # required parameters
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model or shortcut name: " +
                        ", ".join(ALL_MODELS))
    parser.add_argument("--checkpoint", default='', type=str, required=True,
                        help="where to load pre-trained model.")
    parser.add_argument(
        "--max_seq_length", default=512, type=int, required=True,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    # NOTE(review): store_true combined with default=True means this flag is
    # always True and cannot be switched off from the command line.
    parser.add_argument("--use_pretrained", action='store_true', default=True,
                        help="If use pre-trained model weights.")

    # other parameters
    parser.add_argument(
        '--gradient_accumulation_steps', type=int, default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument(
        "--config_name", default="", type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name", default="", type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--num_epochs", default=30, type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--task_name", default='lpc', type=str,
        help="The name of the task to train selected in the list: " +
        ", ".join(processors.keys()))
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--learning_rate", default=2e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--warmup_steps", default=3, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--per_gpu_eval_batch_size", default=4, type=int,
                        help="Batch size for evaluation.")
    parser.add_argument("--no_cuda", default=False, type=_str2bool,
                        help="Do not use cuda.")
    parser.add_argument("--do_lower_case", default=True, type=_str2bool,
                        help="Do lower case.")
    parser.add_argument("--seed", default=610, type=int, help="Random seed.")
    parser.add_argument("--num_labels", default=3, type=int,
                        help="Classification label number.")
    parser.add_argument("--scheduler", default='warmup', type=str,
                        help="Which type of scheduler to use.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument(
        '--overwrite_cache', action='store_true', default=False,
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--write_summary', default=True, type=_str2bool,
                        help="If write summary into tensorboard.")
    parser.add_argument(
        '--fp16', action='store_true', default=False,
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument(
        '--fp16_opt_level', type=str, default='O1',
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")

    # data directory
    parser.add_argument("--data_dir", default='../data/TF-IDF', type=str,
                        help="data directory where pickle dataset is stored.")
    parser.add_argument(
        "--output_dir", default=output_dir, type=str,
        help="output directory for model, log file and summary.")
    parser.add_argument("--log_path", default=join(output_dir, "log.txt"), type=str,
                        help="Path to log.txt.")
    parser.add_argument("--summary_path", default=join(output_dir, "summary"), type=str,
                        help="Path to summary file.")

    args = parser.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)
    os.makedirs(args.data_dir, exist_ok=True)
    # The default log path lives under the timestamped default directory,
    # which is not created when --output_dir is overridden; make sure the
    # log file's parent exists before the logger opens it.
    log_dir = os.path.dirname(args.log_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
    args.logger = get_logger(args.log_path)

    # Setup CUDA, GPU & distributed training
    args.device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    args.logger.info("- device: {}, n_gpu: {}".format(args.device, args.n_gpu))
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # set seed
    set_seed(args.seed)

    # build model
    args.logger.info("Build model...")
    model = Model(args)

    # make data
    make_data(ARTICLES_FILEPATH, METADATA_FILEPATH, args.data_dir, file_name='dev')

    # build dataset
    args.logger.info("Loading dataset...")
    eval_dataset, guids = load_and_cache_examples(args, args.task_name, model.tokenizer,
                                                  evaluate=True)
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # testing (no training happens in this script)
    args.logger.info("Start testing:")
    preds = model.test(eval_dataloader)
    assert len(preds) == len(
        guids), "Prediction list and GUID list length do NOT equal!!!"

    # write results
    # NOTE(review): results go to the module-level OUTPUT_DIR, not to
    # args.output_dir - confirm this is intended.
    args.logger.info("Write prediction results:")
    write_results(OUTPUT_DIR, guids, preds)
    args.logger.info("Save results at: {}".format(
        os.path.join(OUTPUT_DIR, 'predictions.txt')))
def main():
    """Command-line entry point: train, evaluate and/or predict an NER model.

    Depending on ``--do_train`` / ``--do_eval`` / ``--do_predict`` this
    trains via ``train`` (which returns the trained model), saves the model
    and tokenizer to ``--output_dir``, evaluates one or all checkpoints on
    the dev split and writes test-set metrics and predictions.

    Returns:
        dict: Dev-set metrics collected over the evaluated checkpoints; empty
        when ``--do_eval`` is not set or this is not the main process.

    Raises:
        ValueError: If ``--output_dir`` exists, is non-empty, ``--do_train``
            is set and ``--overwrite_output_dir`` is not.
    """
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="输入数据目录。应该包含CoNLL-2003 NER任务的训练文件",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="列表中选择的模型类型: " + ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="列表中选择的预训练模型或快捷方式名称的路径: " + ", ".join(ALL_MODELS),
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="输出目录, 将在其中写入模型预测和checkpoint",
    )

    # Other parameters
    parser.add_argument("--config_name", default="", type=str,
                        help="预训练的配置名称或路径(如果与model_name不同)")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="预训练的tokenizer名称或路径(如果与model_name不同)",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="您想在哪里存储从s3下载的预训练模型",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="tokenization后的最大总输入序列长度。长度大于此长度的序列将被截断,较短的序列将被填充。",
    )
    parser.add_argument("--do_train", action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_predict", action="store_true",
                        help="Whether to run predictions on the test set.")
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="是否在每个日志记录step的训练期间进行评估.",
    )
    parser.add_argument("--do_lower_case", action="store_true",
                        help="如果使用的是uncased的模型,请设置此标志")
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="训练时每个GPU / CPU的批次大小。")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="评估时每个GPU / CPU的批次大小。")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="在执行向后/更新过程之前要梯度累积的更新步骤数。",
    )
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--adam_beta1", default=0.9, type=float,
                        help="BETA1 for Adam optimizer.")
    parser.add_argument("--adam_beta2", default=0.999, type=float,
                        help="BETA2 for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="要执行的训练epoch总数。")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: 设置要执行的训练步骤总数。覆盖num_train_epochs。",
    )
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--logging_steps", type=int, default=50,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda", action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_output_dir", action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed", type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--server_ip", type=str, default="",
                        help="For distant debugging.")
    parser.add_argument("--server_port", type=str, default="",
                        help="For distant debugging.")

    # mean teacher
    parser.add_argument('--mt', type=int, default=0,
                        help='mean teacher. 是否使用mean teacher')
    parser.add_argument('--mt_updatefreq', type=int, default=1,
                        help='mean teacher update frequency')
    parser.add_argument(
        '--mt_class',
        type=str,
        default="kl",
        help='mean teacher class, choices:[smart, prob, logit, kl(default), distill].'
    )
    parser.add_argument('--mt_lambda', type=float, default=1,
                        help="trade off parameter of the consistent loss.")
    parser.add_argument('--mt_rampup', type=int, default=300,
                        help="rampup iteration.")
    parser.add_argument(
        '--mt_alpha1',
        default=0.99,
        type=float,
        help="moving average parameter of mean teacher (for the exponential moving average)."
    )
    parser.add_argument(
        '--mt_alpha2',
        default=0.995,
        type=float,
        help="moving average parameter of mean teacher (for the exponential moving average)."
    )
    parser.add_argument('--mt_beta', default=10, type=float,
                        help="coefficient of mt_loss term.")
    parser.add_argument(
        '--mt_avg',
        default="exponential",
        type=str,
        help="moving average method, choices:[exponentail(default), simple, double_ema]."
    )
    parser.add_argument(
        '--mt_loss_type',
        default="logits",
        type=str,
        help="subject to 衡量模型差异, choices:[embeds, logits(default)].")

    # virtual adversarial training
    parser.add_argument('--vat', type=int, default=0,
                        help='virtual adversarial training.')
    parser.add_argument(
        '--vat_eps',
        type=float,
        default=1e-3,
        help='perturbation size for virtual adversarial training.')
    parser.add_argument(
        '--vat_lambda',
        type=float,
        default=1,
        help='trade off parameter for virtual adversarial training.')
    parser.add_argument(
        '--vat_beta',
        type=float,
        default=1,
        help='coefficient of the virtual adversarial training loss term.')
    parser.add_argument(
        '--vat_loss_type',
        default="logits",
        type=str,
        help="subject to measure model difference, choices = [embeds, logits(default)]."
    )

    # self-training
    parser.add_argument(
        '--self_training_reinit',
        type=int,
        default=0,
        help='如果teacher模型已更新,是否重新初始化student模型。0表示重启重新初始化,1表示不初始化')
    parser.add_argument(
        '--self_training_begin_step',
        type=int,
        default=900,
        help='开始步骤(通常在第一个epoch之后)开始self-training。对应论文中的第一阶段早停策略')
    parser.add_argument(
        '--self_training_label_mode',
        type=str,
        default="hard",
        help='伪标签类型. choices:[hard(default), soft]. 软标签是一个teacher模型预测出来的,类似logits的概率值,是浮点数,硬标签直接就是整数,就是对应概率最大的位置的索引,例如soft是0.82, hard就是1'
    )
    parser.add_argument(
        '--self_training_period',
        type=int,
        default=878,
        help='the self-training period., 每训练多少个step后,更新一下teacher模型')
    parser.add_argument('--self_training_hp_label', type=float, default=0,
                        help='是否使用高置信度标签重新加权软标签')
    parser.add_argument('--self_training_ensemble_label', type=int, default=0,
                        help='use ensemble label.')

    args = parser.parse_args()

    # Refuse to train into a non-empty output directory unless explicitly allowed.
    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Create the output directory if needed (main process only).
    if args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir, exist_ok=True)

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device(
            "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logging_fh = logging.FileHandler(os.path.join(args.output_dir, 'log.txt'))
    logging_fh.setLevel(logging.DEBUG)
    logger.addHandler(logging_fh)
    logger.warning(
        "处理的 rank: %s, device: %s, n_gpu: %s, 是否分布式训练: %s, 是否 16-bits 训练 : %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # All labels of this dataset, e.g. ['O', 'B-LOC', 'B-ORG', 'B-PER',
    # 'B-MISC', 'I-PER', 'I-MISC', 'I-ORG', 'I-LOC', '<START>', '<STOP>']
    labels = get_labels(args.data_dir)
    num_labels = len(labels)
    # Use the cross-entropy ignore index as the padding label id so that only
    # real label ids contribute to the loss later, e.g. pad_token_label_id = -100
    pad_token_label_id = CrossEntropyLoss().ignore_index

    # Load pretrained config and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    logger.info("训练/评估 参数 %s", args)

    # Best scores are only produced by training. Bind them up front so that
    # --do_eval / --do_predict without --do_train does not hit an unbound
    # local in the `if not best_dev` / `if not best_test` checks below.
    best_dev = None
    best_test = None

    # Training
    if args.do_train:
        # Load the training set
        train_dataset = load_and_cache_examples(args, tokenizer, labels,
                                                pad_token_label_id, mode="train")
        # Train the model
        model, global_step, tr_loss, best_dev, best_test = train(
            args, train_dataset, model_class, config, tokenizer, labels,
            pad_token_label_id)
        # Log the outcome
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving last-practice: if you use default names for the model, you can
    # reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        logger.info("保存模型 checkpoint to %s", args.output_dir)
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
        torch.save(model.state_dict(), os.path.join(args.output_dir, "model.pt"))

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("评估如下 checkpoints: %s", checkpoints)

        if not best_dev:
            best_dev = [0, 0, 0]
        for checkpoint in checkpoints:
            # With several checkpoints, the text after the last "-" in the
            # path is used to tag that checkpoint's metrics.
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result, _, best_dev, _ = evaluate(args, model, tokenizer, labels,
                                              pad_token_label_id, best=best_dev,
                                              mode="dev", prefix=global_step)
            if global_step:
                result = {
                    "{}_{}".format(global_step, k): v
                    for k, v in result.items()
                }
            results.update(result)

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            for key in sorted(results.keys()):
                writer.write("{} = {}\n".format(key, str(results[key])))

    if args.do_predict and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        model = model_class.from_pretrained(args.output_dir)
        model.to(args.device)

        if not best_test:
            best_test = [0, 0, 0]
        result, predictions, _, _ = evaluate(args, model, tokenizer, labels,
                                             pad_token_label_id, best=best_test,
                                             mode="test")

        # Save results
        output_test_results_file = os.path.join(args.output_dir, "test_results.txt")
        with open(output_test_results_file, "w") as writer:
            for key in sorted(result.keys()):
                writer.write("{} = {}\n".format(key, str(result[key])))

        # Save predictions
        output_test_predictions_file = os.path.join(args.output_dir,
                                                    "test_predictions.txt")
        with open(output_test_predictions_file, "w") as writer:
            with open(os.path.join(args.data_dir, "test.json"), "r") as f:
                data = json.load(f)
                # NOTE(review): only the first predicted label of each example
                # is written next to the whole word list - confirm intended.
                for example_id, item in enumerate(data):
                    output_line = (str(item["str_words"]) + " " +
                                   predictions[example_id].pop(0) + "\n")
                    writer.write(output_line)

    return results
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
    """Compute the average loss and perplexity of ``model`` on the eval set.

    Args:
        args: Run configuration. Reads ``output_dir``, ``local_rank``,
            ``per_gpu_eval_batch_size``, ``n_gpu``, ``device``, ``mlm``,
            ``token_discrimination`` and ``mask_token_discrimination``;
            writes ``eval_batch_size``.
        model: Model to evaluate; element 0 of its output is used as the loss.
        tokenizer: Supplies the padding id for batching and is forwarded to
            ``load_and_cache_examples`` and ``mask_tokens``.
        prefix: Sub-directory of ``args.output_dir`` that receives
            ``eval_results.txt``; also inserted in the log banners.

    Returns:
        ``{"loss": float, "perplexity": tensor}`` where the perplexity is
        ``exp(loss)`` as a zero-dimensional torch tensor.

    Raises:
        ZeroDivisionError: If the dataloader yields no batch.
    """
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    def collate(examples):
        """Pad a list of examples into batch-first tensor(s)."""
        pad_kwargs = {"batch_first": True}
        if tokenizer._pad_token is not None:
            # Without a pad token, pad_sequence's own default value is used.
            pad_kwargs["padding_value"] = tokenizer.pad_token_id
        if args.token_discrimination or args.mask_token_discrimination:
            # Each example is an (inputs, labels) pair; pad both sides alike.
            return (
                pad_sequence([example[0] for example in examples], **pad_kwargs),
                pad_sequence([example[1] for example in examples], **pad_kwargs),
            )
        # this includes args.mlm
        return pad_sequence(examples, **pad_kwargs)

    # Note that DistributedSampler samples randomly, so a sequential sampler
    # is used for evaluation.
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset,
        sampler=eval_sampler,
        batch_size=args.eval_batch_size,
        collate_fn=collate,
    )

    # multi-gpu evaluate; skip the wrap when the caller already passed a
    # DataParallel model so it is not wrapped twice.
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(eval_dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        if args.mlm:
            inputs, labels = mask_tokens(batch, tokenizer, args)
        elif args.token_discrimination or args.mask_token_discrimination:
            inputs, labels = batch
        else:
            # The unmasked input serves as its own target.
            inputs, labels = batch, batch
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            if args.mlm:
                outputs = model(inputs, masked_lm_labels=labels)
            else:
                # this also includes args.token_discrimination and
                # args.mask_token_discrimination cases
                outputs = model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"loss": eval_loss, "perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    # Only eval_output_dir itself is created above; a non-empty prefix names a
    # sub-directory that may not exist yet.
    results_parent = os.path.dirname(output_eval_file)
    if results_parent:
        os.makedirs(results_parent, exist_ok=True)
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info(" %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
def main():
    """Command-line entry point: train, evaluate and/or predict an NER model.

    Depending on ``--do_train`` / ``--do_eval`` / ``--do_predict`` this loads
    the pretrained model, trains it via ``train``, saves the model and
    tokenizer to ``--output_dir``, evaluates one or all checkpoints on the
    dev split (writing ``eval_results.txt``) and writes test-set metrics and
    predictions.

    Raises:
        ValueError: If ``--output_dir`` exists, is non-empty, ``--do_train``
            is set and ``--overwrite_output_dir`` is not.
    """
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: " +
        ", ".join(ALL_MODELS),
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument("--do_train", action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_predict", action="store_true",
                        help="Whether to run predictions on the test set.")
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Whether to run evaluation during training at each logging step.",
    )
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--adam_beta1", default=0.9, type=float,
                        help="BETA1 for Adam optimizer.")
    parser.add_argument("--adam_beta2", default=0.999, type=float,
                        help="BETA2 for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--logging_steps", type=int, default=50,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda", action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_output_dir", action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed", type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--server_ip", type=str, default="",
                        help="For distant debugging.")
    parser.add_argument("--server_port", type=str, default="",
                        help="For distant debugging.")

    # mean teacher
    parser.add_argument('--mt', type=int, default=0, help='mean teacher.')
    parser.add_argument('--mt_updatefreq', type=int, default=1,
                        help='mean teacher update frequency')
    parser.add_argument(
        '--mt_class',
        type=str,
        default="kl",
        help='mean teacher class, choices:[smart, prob, logit, kl(default), distill].'
    )
    parser.add_argument('--mt_lambda', type=float, default=1,
                        help="trade off parameter of the consistent loss.")
    parser.add_argument('--mt_rampup', type=int, default=300,
                        help="rampup iteration.")
    parser.add_argument(
        '--mt_alpha1',
        default=0.99,
        type=float,
        help="moving average parameter of mean teacher (for the exponential moving average)."
    )
    parser.add_argument(
        '--mt_alpha2',
        default=0.995,
        type=float,
        help="moving average parameter of mean teacher (for the exponential moving average)."
    )
    parser.add_argument('--mt_beta', default=10, type=float,
                        help="coefficient of mt_loss term.")
    parser.add_argument(
        '--mt_avg',
        default="exponential",
        type=str,
        help="moving average method, choices:[exponentail(default), simple, double_ema]."
    )
    parser.add_argument(
        '--mt_loss_type',
        default="logits",
        type=str,
        help="subject to measure model difference, choices:[embeds, logits(default)]."
    )

    # virtual adversarial training
    parser.add_argument('--vat', type=int, default=0,
                        help='virtual adversarial training.')
    parser.add_argument(
        '--vat_eps',
        type=float,
        default=1e-3,
        help='perturbation size for virtual adversarial training.')
    parser.add_argument(
        '--vat_lambda',
        type=float,
        default=1,
        help='trade off parameter for virtual adversarial training.')
    parser.add_argument(
        '--vat_beta',
        type=float,
        default=1,
        help='coefficient of the virtual adversarial training loss term.')
    parser.add_argument(
        '--vat_loss_type',
        default="logits",
        type=str,
        help="subject to measure model difference, choices = [embeds, logits(default)]."
    )

    args = parser.parse_args()

    # Refuse to train into a non-empty output directory unless explicitly allowed.
    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Create output directory if needed (main process only).
    if args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir, exist_ok=True)

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device(
            "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logging_fh = logging.FileHandler(os.path.join(args.output_dir, 'log.txt'))
    logging_fh.setLevel(logging.DEBUG)
    logger.addHandler(logging_fh)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    labels = get_labels(args.data_dir)
    num_labels = len(labels)
    # Use cross entropy ignore index as padding label id so that only real
    # label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Best scores are only produced by training. Bind them up front so that
    # --do_eval / --do_predict without --do_train does not hit an unbound
    # local in the `if not best_dev` / `if not best_test` checks below.
    best_dev = None
    best_test = None

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, labels,
                                                pad_token_label_id, mode="train")
        global_step, tr_loss, best_dev, best_test = train(
            args, train_dataset, model, tokenizer, labels, pad_token_label_id)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving last-practice: if you use defaults names for the model, you can
    # reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
        torch.save(model.state_dict(), os.path.join(args.output_dir, "model.pt"))

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        if not best_dev:
            best_dev = [0, 0, 0]
        for checkpoint in checkpoints:
            # With several checkpoints, the text after the last "-" in the
            # path is used to tag that checkpoint's metrics.
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result, _, best_dev, _ = evaluate(args, model, tokenizer, labels,
                                              pad_token_label_id, best=best_dev,
                                              mode="dev", prefix=global_step)
            if global_step:
                result = {
                    "{}_{}".format(global_step, k): v
                    for k, v in result.items()
                }
            results.update(result)

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            for key in sorted(results.keys()):
                writer.write("{} = {}\n".format(key, str(results[key])))

    if args.do_predict and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        model = model_class.from_pretrained(args.output_dir)
        model.to(args.device)

        if not best_test:
            best_test = [0, 0, 0]
        result, predictions, _, _ = evaluate(args, model, tokenizer, labels,
                                             pad_token_label_id, best=best_test,
                                             mode="test")

        # Save results
        output_test_results_file = os.path.join(args.output_dir, "test_results.txt")
        with open(output_test_results_file, "w") as writer:
            for key in sorted(result.keys()):
                writer.write("{} = {}\n".format(key, str(result[key])))

        # Save predictions
        output_test_predictions_file = os.path.join(args.output_dir,
                                                    "test_predictions.txt")
        with open(output_test_predictions_file, "w") as writer:
            with open(os.path.join(args.data_dir, "test.json"), "r") as f:
                data = json.load(f)
                # NOTE(review): only the first predicted label of each example
                # is written next to the whole word list - confirm intended.
                for example_id, item in enumerate(data):
                    output_line = (str(item["str_words"]) + " " +
                                   predictions[example_id].pop(0) + "\n")
                    writer.write(output_line)
return results
def main():
    """Fine-tune and/or evaluate a language model according to the CLI flags.

    Validates the flag combination, configures the device, distributed state
    and logging, loads the config, tokenizer and model, optionally trains and
    saves the result, then optionally evaluates one or all checkpoints.

    Returns:
        dict: evaluation metrics; every key carries a ``_<step>`` suffix where
        ``<step>`` is the checkpoint step (empty when a single checkpoint is
        evaluated). The dict is empty when evaluation does not run.

    Raises:
        ValueError: on an unsupported flag combination, a missing checkpoint
            for ``--should_continue``, a non-empty output directory, or when
            neither a config/tokenizer name nor a model path is given.
        NotImplementedError: when a trained model must be reloaded but neither
            the MLM nor a token-discrimination objective is selected.
    """
    args = get_args()

    # --- Flag validation -------------------------------------------------
    masked_lm_families = ("bert", "roberta", "distilbert", "camembert")
    if args.model_type in masked_lm_families:
        if not (args.mlm or args.token_discrimination
                or args.mask_token_discrimination):
            raise ValueError(
                "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm "
                "flag (masked language modeling).")

    if args.do_eval and args.eval_data_file is None:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if args.should_continue:
        resumable = _sorted_checkpoints(args)
        if not resumable:
            raise ValueError(
                "Used --should_continue but no checkpoint was found in --output_dir."
            )
        # Resume from the last entry returned by _sorted_checkpoints.
        args.model_name_or_path = resumable[-1]

    output_dir_in_use = os.path.exists(args.output_dir) and os.listdir(
        args.output_dir)
    if output_dir_in_use and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # --- Optional remote debugger -----------------------------------------
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # --- CUDA, GPU & distributed training ---------------------------------
    if args.local_rank != -1 and not args.no_cuda:
        # Initialise the distributed backend that synchronises nodes/GPUs.
        torch.cuda.set_device(args.local_rank)
        target_device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    else:
        use_cuda = torch.cuda.is_available() and not args.no_cuda
        target_device = torch.device("cuda" if use_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    args.device = target_device

    # --- Logging ------------------------------------------------------------
    on_main_rank = args.local_rank in (-1, 0)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if on_main_rank else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        target_device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    set_seed(args)

    # --- Config, tokenizer and model ----------------------------------------
    if not on_main_rank:
        # Barrier so that only the first process in distributed training
        # downloads model & vocab.
        torch.distributed.barrier()

    config_source = args.config_name or args.model_name_or_path
    if not config_source:
        # When a pip version exposing CONFIG_MAPPING is released,
        # this could become `config = CONFIG_MAPPING[args.model_type]()`.
        raise ValueError(
            "You are instantiating a new config instance from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --config_name")
    config = AutoConfig.from_pretrained(config_source,
                                        cache_dir=args.cache_dir)

    tokenizer_source = args.tokenizer_name or args.model_name_or_path
    if not tokenizer_source:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name")
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_source,
                                              cache_dir=args.cache_dir)

    if args.block_size > 0:
        args.block_size = min(args.block_size, tokenizer.max_len)
    else:
        # Use the largest block size the tokenizer reports.
        args.block_size = tokenizer.max_len

    # Pick the class for the initial load; token discrimination takes
    # precedence over MLM here.
    pretrained_cls = None
    if args.model_name_or_path:
        if args.token_discrimination or args.mask_token_discrimination:
            pretrained_cls = RobertaForTokenDiscrimination
        elif args.mlm:
            pretrained_cls = AutoModelWithLMHead

    if pretrained_cls is None:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)
    else:
        model = pretrained_cls.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
            cache_dir=args.cache_dir,
        )
    model.to(args.device)

    if args.local_rank == 0:
        # End of barrier: the remaining processes may now load from cache.
        torch.distributed.barrier()

    logger.info("Training/evaluation parameters %s", args)

    def load_finetuned(path):
        # Reload a saved model from `path`; MLM takes precedence here.
        if args.mlm:
            return AutoModelWithLMHead.from_pretrained(path)
        if args.token_discrimination or args.mask_token_discrimination:
            return RobertaForTokenDiscrimination.from_pretrained(path)
        raise NotImplementedError(
            'only mlm and token discrimination loss supported')

    # --- Training -------------------------------------------------------------
    if args.do_train:
        if not on_main_rank:
            # Barrier so that only the first process builds the dataset;
            # the others will use the cache.
            torch.distributed.barrier()

        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                evaluate=False)

        if args.local_rank == 0:
            torch.distributed.barrier()

        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # --- Saving ---------------------------------------------------------------
    # Objects written with save_pretrained() can be reloaded with
    # from_pretrained().
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        if on_main_rank:
            os.makedirs(args.output_dir, exist_ok=True)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Unwrap the distributed/parallel wrapper when there is one.
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Keep the training arguments next to the trained model.
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Reload the fine-tuned model and vocabulary from disk.
        model = load_finetuned(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # --- Evaluation -------------------------------------------------------------
    results = {}
    if args.do_eval and on_main_rank:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            weight_files = sorted(
                glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME,
                          recursive=True))
            checkpoints = [os.path.dirname(path) for path in weight_files]
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        several_checkpoints = len(checkpoints) > 1
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                "-")[-1] if several_checkpoints else ""
            prefix = checkpoint.split(
                "/")[-1] if "checkpoint" in checkpoint else ""

            model = load_finetuned(checkpoint)
            model.to(args.device)

            scores = evaluate(args, model, tokenizer, prefix=prefix)
            suffix = "_{}".format(global_step)
            results.update(
                {name + suffix: value
                 for name, value in scores.items()})

    return results
def main():
    """Parse the CLI arguments, set up logging/device/seed and load the data.

    A timestamped ``results/<YYYYmmdd_HHMMSS>/`` directory is used as the
    default location for the output, the log file, the summary and the model
    directory. After parsing, the function attaches a logger to ``args``,
    selects the device, seeds the run, loads a ``BertTokenizer`` and calls
    ``load_and_cache_examples`` for the training and the evaluation split.

    ``--no_cuda`` and ``--seed`` are registered here because the body reads
    ``args.no_cuda`` and ``args.seed``; without them the function fails with
    ``AttributeError`` immediately after argument parsing.
    """
    # directory for training outputs
    output_dir = "results/{:%Y%m%d_%H%M%S}/".format(datetime.now())

    parser = argparse.ArgumentParser()

    # required parameters
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument("--model_name_or_path",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to pre-trained model or shortcut name: " +
                        ", ".join(ALL_MODELS))

    # data directory
    parser.add_argument("--data_dir",
                        default='',
                        type=str,
                        help="data directory where pickle dataset is stored.")
    parser.add_argument(
        "--output_dir",
        default=output_dir,
        type=str,
        help="output directory for model, log file and summary.")
    parser.add_argument("--log_path",
                        default=join(output_dir, "log.txt"),
                        type=str,
                        help="Path to log.txt.")
    parser.add_argument("--summary_path",
                        default=join(output_dir, "summary"),
                        type=str,
                        help="Path to summary file.")
    parser.add_argument("--model_dir",
                        default=join(output_dir, "model/"),
                        type=str,
                        help="where to load pre-trained model.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument(
        "--max_summary_length",
        default=200,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        default=False,
        help="Overwrite the cached training and evaluation sets")

    # Options read further down (args.no_cuda / args.seed) that were never
    # declared on the parser.
    parser.add_argument("--no_cuda",
                        action="store_true",
                        default=False,
                        help="Do not use CUDA even when it is available.")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="Random seed for initialization.")

    args = parser.parse_args()
    args.logger = get_logger(args.log_path)

    # Setup CUDA, GPU & distributed training
    use_cuda = torch.cuda.is_available() and not args.no_cuda
    args.device = torch.device("cuda" if use_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    args.logger.info("- device: {}, n_gpu: {}".format(args.device,
                                                      args.n_gpu))

    # set seed
    set_seed(args.seed)

    # load tokenizer
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    # build dataset; the second element of each returned pair is not needed
    args.logger.info("Loading dataset...")
    train_dataset, _ = load_and_cache_examples(args, tokenizer, evaluate=False)
    eval_dataset, _ = load_and_cache_examples(args, tokenizer, evaluate=True)
def main(_):
    """Train, evaluate and/or predict with a TF token-classification model.

    All settings are read from ``flags.FLAGS``. Depending on the ``do_train``,
    ``do_eval`` and ``do_predict`` flags the function fine-tunes and saves a
    model, writes per-checkpoint reports and losses to ``eval_results.txt``,
    and writes the test report and per-token predictions to
    ``test_results.txt`` / ``test_predictions.txt`` in the output directory.

    Args:
        _: unused positional argument.

    Raises:
        ValueError: if training is requested into a non-empty output directory
            without ``overwrite_output_dir``.
    """
    logging.set_verbosity(logging.INFO)
    args = flags.FLAGS.flag_values_dict()

    if (
        os.path.exists(args["output_dir"])
        and os.listdir(args["output_dir"])
        and args["do_train"]
        and not args["overwrite_output_dir"]
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args["output_dir"]
            )
        )

    if args["fp16"]:
        tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})

    # Choose the distribution strategy and record the device count.
    if args["tpu"]:
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=args["tpu"])
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.experimental.TPUStrategy(resolver)
        args["n_device"] = args["num_tpu_cores"]
    else:
        gpu_ids = args["gpus"].split(",")
        if len(gpu_ids) > 1:
            # Build the device list once and reuse it for count and strategy.
            gpu_devices = [f"/gpu:{gpu}" for gpu in gpu_ids]
            args["n_device"] = len(gpu_devices)
            strategy = tf.distribute.MirroredStrategy(devices=gpu_devices)
        elif args["no_cuda"]:
            args["n_device"] = 1
            strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
        else:
            args["n_device"] = len(gpu_ids)
            strategy = tf.distribute.OneDeviceStrategy(device="/gpu:" + gpu_ids[0])

    logging.warning(
        "n_device: %s, distributed training: %s, 16-bits training: %s",
        args["n_device"],
        bool(args["n_device"] > 1),
        args["fp16"],
    )

    labels = get_labels(args["labels"])
    num_labels = len(labels)
    pad_token_label_id = -1
    config = AutoConfig.from_pretrained(
        args["config_name"] if args["config_name"] else args["model_name_or_path"],
        num_labels=num_labels,
        cache_dir=args["cache_dir"] if args["cache_dir"] else None,
    )

    logging.info("Training/evaluation parameters %s", args)

    # Training
    if args["do_train"]:
        tokenizer = AutoTokenizer.from_pretrained(
            args["tokenizer_name"] if args["tokenizer_name"] else args["model_name_or_path"],
            do_lower_case=args["do_lower_case"],
            cache_dir=args["cache_dir"] if args["cache_dir"] else None,
        )

        with strategy.scope():
            model = TFAutoModelForTokenClassification.from_pretrained(
                args["model_name_or_path"],
                from_pt=bool(".bin" in args["model_name_or_path"]),
                config=config,
                cache_dir=args["cache_dir"] if args["cache_dir"] else None,
            )

        train_batch_size = args["per_device_train_batch_size"] * args["n_device"]
        print('** train_batch_size = %s' % train_batch_size)
        train_dataset, num_train_examples = load_and_cache_examples(
            args, tokenizer, labels, pad_token_label_id, train_batch_size, mode="train"
        )
        train_dataset = strategy.experimental_distribute_dataset(train_dataset)
        print('** num_train_examples = %s' % num_train_examples)
        train(
            args,
            strategy,
            train_dataset,
            tokenizer,
            model,
            num_train_examples,
            labels,
            train_batch_size,
            pad_token_label_id,
        )

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(args["output_dir"], exist_ok=True)

        logging.info("Saving model to %s", args["output_dir"])

        model.save_pretrained(args["output_dir"])
        tokenizer.save_pretrained(args["output_dir"])

    # Evaluation
    if args["do_eval"]:
        tokenizer = AutoTokenizer.from_pretrained(args["output_dir"], do_lower_case=args["do_lower_case"])
        checkpoints = []
        results = []

        if args["eval_all_checkpoints"]:
            # Order weight files by the integer formed from all digits in
            # their path (-1 when the path has no digit).
            checkpoints = list(
                os.path.dirname(c)
                for c in sorted(
                    glob.glob(args["output_dir"] + "/**/" + TF2_WEIGHTS_NAME, recursive=True),
                    key=lambda f: int("".join(filter(str.isdigit, f)) or -1),
                )
            )

        logging.info("Evaluate the following checkpoints: %s", checkpoints)

        if len(checkpoints) == 0:
            # Fall back to the model stored directly in the output directory.
            checkpoints.append(args["output_dir"])

        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if re.match(".*checkpoint-[0-9]", checkpoint) else "final"

            with strategy.scope():
                model = TFAutoModelForTokenClassification.from_pretrained(checkpoint)

            y_true, y_pred, eval_loss = evaluate(
                args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev"
            )
            report = metrics.classification_report(y_true, y_pred, digits=4)

            if global_step:
                results.append({global_step + "_report": report, global_step + "_loss": eval_loss})

        output_eval_file = os.path.join(args["output_dir"], "eval_results.txt")

        with tf.io.gfile.GFile(output_eval_file, "w") as writer:
            for res in results:
                for key, val in res.items():
                    if "loss" in key:
                        logging.info(key + " = " + str(val))
                        writer.write(key + " = " + str(val))
                        writer.write("\n")
                    else:
                        # Emit the report stored under this key. Using the
                        # loop-external `report` variable here would repeat
                        # the last checkpoint's report for every checkpoint.
                        logging.info(key)
                        logging.info("\n" + val)
                        writer.write(key + "\n")
                        writer.write(val)
                        writer.write("\n")

    # Prediction
    if args["do_predict"]:
        tokenizer = AutoTokenizer.from_pretrained(args["output_dir"], do_lower_case=args["do_lower_case"])
        model = TFAutoModelForTokenClassification.from_pretrained(args["output_dir"])

        # evaluate() loads the test split itself, so no separate load is done.
        y_true, y_pred, pred_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="test")
        output_test_results_file = os.path.join(args["output_dir"], "test_results.txt")
        output_test_predictions_file = os.path.join(args["output_dir"], "test_predictions.txt")
        report = metrics.classification_report(y_true, y_pred, digits=4)

        with tf.io.gfile.GFile(output_test_results_file, "w") as writer:
            logging.info("\n" + report)

            writer.write(report)
            writer.write("\n\nloss = " + str(pred_loss))

        with tf.io.gfile.GFile(output_test_predictions_file, "w") as writer:
            with tf.io.gfile.GFile(os.path.join(args["data_dir"], "test.txt"), "r") as f:
                example_id = 0

                for line in f:
                    if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                        writer.write(line)

                        # Advance to the next example only once every
                        # prediction of the current one has been consumed.
                        if not y_pred[example_id]:
                            example_id += 1
                    elif y_pred[example_id]:
                        output_line = line.split()[0] + " " + y_pred[example_id].pop(0) + "\n"

                        writer.write(output_line)
                    else:
                        logging.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
def evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode):
    """Run the model over the ``mode`` split and collect labels and loss.

    Args:
        args: dict of settings; reads ``per_device_eval_batch_size``,
            ``n_device`` and ``model_type``.
        strategy: distribution strategy used to distribute the dataset and to
            scope the forward pass.
        model: model called as ``model(input_ids, **inputs)``; element 0 of
            its output is used as the logits.
        tokenizer: forwarded to ``load_and_cache_examples``.
        labels: sequence mapping a label id to its label.
        pad_token_label_id: label id marking positions to ignore.
        mode: split name forwarded to ``load_and_cache_examples``.

    Returns:
        tuple: ``(y_true, y_pred, loss)`` where ``y_true`` and ``y_pred`` are
        per-example lists of entries from ``labels`` with ignored positions
        removed, and ``loss`` is the accumulated loss divided by the number
        of evaluation steps, converted with ``.numpy()``.

    Raises:
        ValueError: if the dataset yields no batch at all.
    """
    eval_batch_size = args["per_device_eval_batch_size"] * args["n_device"]
    eval_dataset, size = load_and_cache_examples(
        args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode=mode
    )
    eval_dataset = strategy.experimental_distribute_dataset(eval_dataset)
    num_eval_steps = math.ceil(size / eval_batch_size)

    # No `parent=master_bar(...)` here: passing one raised
    # "TypeError: sequence item 0: expected str instance, NBProgressBar found",
    # see https://github.com/huggingface/transformers/issues/3530
    eval_iterator = progress_bar(eval_dataset, total=num_eval_steps, display=args["n_device"] > 1)

    # NOTE(review): the loss is built without from_logits=True although it is
    # fed the model's logits output - confirm this is intended.
    loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
    loss = 0.0

    logging.info("***** Running evaluation *****")
    logging.info(" Num examples = %d", size)
    logging.info(" Batch size = %d", eval_batch_size)

    # Per-batch arrays are collected and concatenated once after the loop;
    # np.append inside the loop would re-copy everything gathered so far.
    batch_logits = []
    batch_label_ids = []

    for eval_features, eval_labels in eval_iterator:
        inputs = {"attention_mask": eval_features["input_mask"], "training": False}

        if args["model_type"] != "distilbert":
            inputs["token_type_ids"] = (
                eval_features["segment_ids"] if args["model_type"] in ["bert", "xlnet"] else None
            )

        with strategy.scope():
            logits = model(eval_features["input_ids"], **inputs)[0]
            # Only positions whose label differs from the pad id contribute.
            active_loss = tf.reshape(eval_labels, (-1,)) != pad_token_label_id
            active_logits = tf.boolean_mask(tf.reshape(logits, (-1, len(labels))), active_loss)
            active_labels = tf.boolean_mask(tf.reshape(eval_labels, (-1,)), active_loss)
            cross_entropy = loss_fct(active_labels, active_logits)
            loss += tf.reduce_sum(cross_entropy) * (1.0 / eval_batch_size)

        batch_logits.append(logits.numpy())
        batch_label_ids.append(eval_labels.numpy())

    if not batch_logits:
        raise ValueError("No evaluation batches were produced for mode '{}'.".format(mode))

    preds = np.argmax(np.concatenate(batch_logits, axis=0), axis=2)
    label_ids = np.concatenate(batch_label_ids, axis=0)
    loss = loss / num_eval_steps

    y_pred = []
    y_true = []

    for row_label_ids, row_preds in zip(label_ids, preds):
        keep = row_label_ids != pad_token_label_id
        y_pred.append([labels[idx] for idx in row_preds[keep]])
        y_true.append([labels[idx] for idx in row_label_ids[keep]])

    return y_true, y_pred, loss.numpy()