def prune_heads(args, model, eval_dataloader, head_mask):
    """This method shows how to prune heads (remove head weights) based on the head importance scores
    as described in Michel et al. (http://arxiv.org/abs/1905.10650)
    """
    # Try pruning and test time speedup
    # Pruning is like masking but we actually remove the masked weights
    before_time = datetime.now()
    _, _, preds, labels = compute_heads_importance(
        args, model, eval_dataloader, compute_entropy=False, compute_importance=False, head_mask=head_mask
    )
    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
    score_masking = glue_compute_metrics(args.task_name, preds, labels)[args.metric_name]
    original_time = datetime.now() - before_time

    original_num_params = sum(p.numel() for p in model.parameters())
    heads_to_prune = dict(
        (layer, (1 - head_mask[layer].long()).nonzero().squeeze().tolist()) for layer in range(len(head_mask))
    )

    assert sum(len(h) for h in heads_to_prune.values()) == (1 - head_mask.long()).sum().item()
    model.prune_heads(heads_to_prune)
    pruned_num_params = sum(p.numel() for p in model.parameters())

    before_time = datetime.now()
    _, _, preds, labels = compute_heads_importance(
        args,
        model,
        eval_dataloader,
        compute_entropy=False,
        compute_importance=False,
        head_mask=None,
        actually_pruned=True,
    )
    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
    score_pruning = glue_compute_metrics(args.task_name, preds, labels)[args.metric_name]
    new_time = datetime.now() - before_time

    logger.info(
        "Pruning: original num of params: %.2e, after pruning %.2e (%.1f percent)",
        original_num_params,
        pruned_num_params,
        pruned_num_params / original_num_params * 100,
    )
    logger.info("Pruning: score with masking: %f score with pruning: %f", score_masking, score_pruning)
    logger.info("Pruning: speed ratio (original timing / new timing): %f percent", original_time / new_time * 100)
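# A minimal, self-contained sketch (not part of the snippet above) of the head_mask ->
# heads_to_prune conversion and `model.prune_heads`, run on a tiny randomly initialized
# BERT so nothing needs to be downloaded. The config sizes and the masked head indices
# are arbitrary choices for illustration only.
import torch
from transformers import BertConfig, BertModel

config = BertConfig(hidden_size=64, num_hidden_layers=2, num_attention_heads=4, intermediate_size=128)
model = BertModel(config)

# 0.0 marks a head to remove; here we drop head 1 of layer 0 and heads 0 and 2 of layer 1.
head_mask = torch.ones(config.num_hidden_layers, config.num_attention_heads)
head_mask[0, 1] = 0.0
head_mask[1, 0] = 0.0
head_mask[1, 2] = 0.0

# squeeze(-1) (rather than squeeze()) keeps a list even when a layer loses a single head.
heads_to_prune = {
    layer: (1 - head_mask[layer].long()).nonzero().squeeze(-1).tolist()
    for layer in range(config.num_hidden_layers)
}

before = sum(p.numel() for p in model.parameters())
model.prune_heads(heads_to_prune)
after = sum(p.numel() for p in model.parameters())
print(heads_to_prune, before, after)  # parameter count drops once the head weights are removed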
def adapted_glue_compute_metrics(task_name, preds, labels):
    """Adapted from `glue_compute_metrics` to also handle SNLI."""
    try:
        return glue_compute_metrics(task_name, preds, labels)
    except KeyError:
        if task_name in ["snli", "winogrande", "toxic"]:
            # Since MNLI also uses accuracy.
            return glue_compute_metrics("mnli", preds, labels)
        raise KeyError(task_name)
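# A quick sketch of the fallback above: glue_compute_metrics raises KeyError for tasks it
# does not know (e.g. "snli"), and re-routing them through "mnli" yields plain accuracy.
# The predictions and labels are dummy values; assumes glue_compute_metrics is importable
# as in the other snippets here.
import numpy as np
from transformers import glue_compute_metrics

preds = np.array([0, 1, 2, 1])
labels = np.array([0, 1, 1, 1])
try:
    glue_compute_metrics("snli", preds, labels)
except KeyError:
    print(glue_compute_metrics("mnli", preds, labels))  # {'mnli/acc': 0.75}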
def mask_heads(args, model, eval_dataloader):
    """This method shows how to mask heads (set some heads to zero) to test the effect on the network,
    based on the head importance scores, as described in Michel et al. (http://arxiv.org/abs/1905.10650)
    """
    _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False)
    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
    original_score = glue_compute_metrics(args.task_name, preds, labels)[args.metric_name]
    logger.info("Pruning: original score: %f, threshold: %f", original_score, original_score * args.masking_threshold)

    new_head_mask = torch.ones_like(head_importance)
    num_to_mask = max(1, int(new_head_mask.numel() * args.masking_amount))

    current_score = original_score
    while current_score >= original_score * args.masking_threshold:
        head_mask = new_head_mask.clone()  # save current head mask
        # heads from least important to most - keep only not-masked heads
        head_importance[head_mask == 0.0] = float("Inf")
        current_heads_to_mask = head_importance.view(-1).sort()[1]

        if len(current_heads_to_mask) <= num_to_mask:
            break

        # mask heads
        current_heads_to_mask = current_heads_to_mask[:num_to_mask]
        logger.info("Heads to mask: %s", str(current_heads_to_mask.tolist()))
        new_head_mask = new_head_mask.view(-1)
        new_head_mask[current_heads_to_mask] = 0.0
        new_head_mask = new_head_mask.view_as(head_mask)
        new_head_mask = new_head_mask.clone().detach()
        print_2d_tensor(new_head_mask)

        # Compute metric and head importance again
        _, head_importance, preds, labels = compute_heads_importance(
            args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask
        )
        preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
        current_score = glue_compute_metrics(args.task_name, preds, labels)[args.metric_name]
        logger.info(
            "Masking: current score: %f, remaining heads %d (%.1f percent)",
            current_score,
            new_head_mask.sum(),
            new_head_mask.sum() / new_head_mask.numel() * 100,
        )

    logger.info("Final head mask")
    print_2d_tensor(head_mask)
    np.save(os.path.join(args.output_dir, "head_mask.npy"), head_mask.detach().cpu().numpy())

    return head_mask
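# A torch-only sketch of one iteration of the masking loop above: sort heads by importance
# and zero out the `num_to_mask` least important ones. The importance values are random
# stand-ins for what compute_heads_importance would return, and masking_amount=0.1 is an
# arbitrary choice.
import torch

torch.manual_seed(0)
num_layers, num_heads = 2, 4
head_importance = torch.rand(num_layers, num_heads)
new_head_mask = torch.ones_like(head_importance)
num_to_mask = max(1, int(new_head_mask.numel() * 0.1))  # masking_amount = 0.1

# Already-masked heads get infinite importance so they are never selected again.
head_importance[new_head_mask == 0.0] = float("Inf")
current_heads_to_mask = head_importance.view(-1).sort()[1][:num_to_mask]

new_head_mask = new_head_mask.view(-1)
new_head_mask[current_heads_to_mask] = 0.0
new_head_mask = new_head_mask.view(num_layers, num_heads)
print(current_heads_to_mask.tolist(), new_head_mask)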
def compute_metrics(p: EvalPrediction) -> Dict:
    if output_mode == "classification":
        preds = np.argmax(p.predictions, axis=1)
    elif output_mode == "regression":
        preds = np.squeeze(p.predictions)
    return glue_compute_metrics(data_args.task_name, preds, p.label_ids)
def compute_metrics_fn(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    if output_mode == "classification":
        preds = np.argmax(preds, axis=1)
    else:  # regression
        preds = np.squeeze(preds)
    return glue_compute_metrics(task_name, preds, p.label_ids)
def compute_metrics_fn(p: EvalPrediction):
    if output_mode == "classification":
        preds = np.argmax(p.predictions, axis=1)
    elif output_mode == "regression":
        preds = np.squeeze(p.predictions)
    metrics = glue_compute_metrics(task_name, preds, p.label_ids)
    return metrics
def compute_glue_metrics(task_name, p):
    output_mode = glue_output_modes[task_name]
    if output_mode == "classification":
        preds = np.argmax(p.predictions, axis=1)
    elif output_mode == "regression":
        preds = np.squeeze(p.predictions)
    return glue_compute_metrics(task_name, preds, p.label_ids)
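# A small sketch of the dispatch above on dummy data, assuming a transformers version that
# still ships glue_compute_metrics / glue_output_modes (deprecated but present through v4,
# and requiring scikit-learn/scipy). "sst-2" is a classification task, so logits are argmax-ed.
import numpy as np
from transformers import glue_compute_metrics, glue_output_modes

logits = np.array([[0.2, 0.8], [0.9, 0.1], [0.4, 0.6]])
labels = np.array([1, 0, 0])

assert glue_output_modes["sst-2"] == "classification"
preds = np.argmax(logits, axis=1)
print(glue_compute_metrics("sst-2", preds, labels))  # {'acc': 0.666...}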
def compute_metrics_fn(p: transformers.EvalPrediction):
    if output_mode == "classification":
        preds = np.argmax(p.predictions, axis=1)
    elif output_mode == "regression":
        preds = np.squeeze(p.predictions)
    if task_name in ynt.genernal_tasks_num_labels:
        return ynt.genernal_compute_metrics(task_name, preds, p.label_ids)
    elif task_name in transformers.glue_tasks_num_labels:
        return transformers.glue_compute_metrics(task_name, preds, p.label_ids)
def evaluate(task_name, model, eval_dataloader, model_type, output_mode='classification', device='cuda'):
    # results = {}
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    for batch_idx, batch in enumerate(eval_dataloader):
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            if model_type != "distilbert":
                inputs["token_type_ids"] = (
                    batch[2] if model_type in ["bert", "xlnet", "albert"] else None
                )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]
            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1

        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs["labels"].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

        progress_bar(batch_idx, len(eval_dataloader), 'Evaluating...')

    eval_loss = eval_loss / nb_eval_steps
    if output_mode == "classification":
        preds = np.argmax(preds, axis=1)
    elif output_mode == "regression":
        preds = np.squeeze(preds)

    result = glue_compute_metrics(task_name, preds, out_label_ids)
    # print(result)
    # results.update(result)
    return result
def compute_glue_eval_metrics_regression(task_name: str, p: EvalPrediction) -> Dict:
    preds = np.squeeze(p.predictions)
    return glue_compute_metrics(task_name, preds, p.label_ids)
def compute_glue_eval_metrics(task_name: str, p: EvalPrediction) -> Dict:
    preds = np.argmax(p.predictions, axis=1)
    return glue_compute_metrics(task_name, preds, p.label_ids)
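# Usage sketch for the two helpers above, assuming they are in scope together with
# glue_compute_metrics: build an EvalPrediction by hand and compute the regression metric
# for STS-B (Pearson/Spearman) and the classification metric for MRPC. All values are
# made up for illustration.
import numpy as np
from transformers import EvalPrediction

p_reg = EvalPrediction(predictions=np.array([[2.1], [4.0], [0.5]]), label_ids=np.array([2.0, 4.5, 1.0]))
print(compute_glue_eval_metrics_regression("sts-b", p_reg))  # {'pearson': ..., 'spearmanr': ..., 'corr': ...}

p_cls = EvalPrediction(predictions=np.array([[0.1, 0.9], [0.7, 0.3]]), label_ids=np.array([1, 1]))
print(compute_glue_eval_metrics("mrpc", p_cls))  # {'acc': ..., 'f1': ..., 'acc_and_f1': ...}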
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--n_models", type=int, help="Number of models")
    parser.add_argument("--k", type=int, default=16, help="Number of training instances per label")
    parser.add_argument(
        "--condition",
        type=str,
        help="A dictionary that contains conditions the experiment results need to fulfill (e.g., tag, task_name, few_shot_type)",
    )

    # These options should usually be kept as their default values
    parser.add_argument("--data_dir", type=str, default="data/k-shot", help="Data directory")
    parser.add_argument("--save_logit_dir", type=str, default="ensemble_predict_results", help="Directory to store the logit file.")
    parser.add_argument("--log", type=str, default="log", help="Log path.")
    parser.add_argument("--key", type=str, default='', help="Validation metric name")
    parser.add_argument("--test_key", type=str, default="", help="Test metric name")
    parser.add_argument("--test_key2", type=str, default="", help="Second test metric name")

    args = parser.parse_args()

    condition = eval(args.condition)

    if len(args.key) == 0:
        if condition['task_name'] == 'cola':
            args.key = 'cola_dev_eval_mcc'
            args.test_key = 'cola_test_eval_mcc'
        elif condition['task_name'] == 'mrpc/acc':
            args.key = 'mrpc_dev_eval_acc'
            args.test_key = 'mrpc_test_eval_acc'
            args.test_key2 = 'mrpc_test_eval_f1'
            condition['task_name'] = 'mrpc'
        elif condition['task_name'] == 'mrpc/f1':
            args.key = 'mrpc_dev_eval_f1'
            args.test_key2 = 'mrpc_test_eval_acc'
            args.test_key = 'mrpc_test_eval_f1'
            condition['task_name'] = 'mrpc'
        elif condition['task_name'] == 'qqp/acc':
            args.key = 'qqp_dev_eval_acc'
            args.test_key = 'qqp_test_eval_acc'
            args.test_key2 = 'qqp_test_eval_f1'
            condition['task_name'] = 'qqp'
        elif condition['task_name'] == 'qqp/f1':
            args.key = 'qqp_dev_eval_f1'
            args.test_key2 = 'qqp_test_eval_acc'
            args.test_key = 'qqp_test_eval_f1'
            condition['task_name'] = 'qqp'
        elif condition['task_name'] == 'sts-b/pearson':
            args.key = 'sts-b_dev_eval_pearson'
            args.test_key = 'sts-b_test_eval_pearson'
            args.test_key2 = 'sts-b_test_eval_spearmanr'
            condition['task_name'] = 'sts-b'
        elif condition['task_name'] == 'sts-b/spearmanr':
            args.key = 'sts-b_dev_eval_spearmanr'
            args.test_key2 = 'sts-b_test_eval_pearson'
            args.test_key = 'sts-b_test_eval_spearmanr'
            condition['task_name'] = 'sts-b'
        elif condition['task_name'] == 'qnli':
            args.key = 'qnli_dev_eval_acc'
            args.test_key = 'qnli_test_eval_acc'
        elif condition['task_name'] == 'sst-2':
            args.key = 'sst-2_dev_eval_acc'
            args.test_key = 'sst-2_test_eval_acc'
        elif condition['task_name'] == 'snli':
            args.key = 'snli_dev_eval_acc'
            args.test_key = 'snli_test_eval_acc'
        elif condition['task_name'] == 'mnli':
            args.key = 'mnli_dev_eval_mnli/acc'
            args.test_key = 'mnli_test_eval_mnli/acc'
        elif condition['task_name'] == 'mnli-mm':
            args.key = 'mnli_dev_eval_mnli/acc'
            args.test_key = 'mnli-mm_test_eval_mnli-mm/acc'
        elif condition['task_name'] == 'rte':
            args.key = 'rte_dev_eval_acc'
            args.test_key = 'rte_test_eval_acc'
        elif condition['task_name'] == 'ag_news':
            args.key = 'ag_news_dev_eval_acc'
            args.test_key = 'ag_news_test_eval_acc'
        elif condition['task_name'] == 'yahoo_answers':
            args.key = 'yahoo_answers_dev_eval_acc'
            args.test_key = 'yahoo_answers_test_eval_acc'
        elif condition['task_name'] == 'yelp_review_full':
            args.key = 'yelp_review_full_dev_eval_acc'
            args.test_key = 'yelp_review_full_test_eval_acc'
        elif condition['task_name'] == 'mr':
            args.key = 'mr_dev_eval_acc'
            args.test_key = 'mr_test_eval_acc'
        elif condition['task_name'] == 'sst-5':
            args.key = 'sst-5_dev_eval_acc'
            args.test_key = 'sst-5_test_eval_acc'
        elif condition['task_name'] == 'subj':
            args.key = 'subj_dev_eval_acc'
            args.test_key = 'subj_test_eval_acc'
        elif condition['task_name'] == 'trec':
            args.key = 'trec_dev_eval_acc'
            args.test_key = 'trec_test_eval_acc'
        elif condition['task_name'] == 'cr':
            args.key = 'cr_dev_eval_acc'
            args.test_key = 'cr_test_eval_acc'
        elif condition['task_name'] == 'mpqa':
            args.key = 'mpqa_dev_eval_acc'
            args.test_key = 'mpqa_test_eval_acc'
        else:
            raise NotImplementedError

    with open(args.log) as f:
        result_list = []
        for line in f:
            result_list.append(eval(line))

    seed_result = {}
    seed_best = {}

    # Gather all logs satisfying the conditions
    for item in result_list:
        ok = True
        for cond in condition:
            if cond == 'task_name' and condition['task_name'] == 'mnli-mm':
                if cond not in item or item[cond] != 'mnli':
                    ok = False
                    break
            else:
                if cond not in item or item[cond] != condition[cond]:
                    ok = False
                    break
        if 'model_id' not in item or 'array_id' not in item:
            ok = False
        if ok:
            seed = int(item['data_dir'].split('-')[-1])
            model_id = item['model_id']
            array_id = item['array_id']
            if model_id >= 0 and model_id < args.n_models:
                if seed not in seed_result:
                    seed_result[seed] = {}
                    seed_best[seed] = {}
                if model_id not in seed_result[seed]:
                    seed_result[seed][model_id] = []
                    seed_best[seed][model_id] = {args.key: -1e9}
                seed_result[seed][model_id].append(item)
                if item[args.key] > seed_best[seed][model_id][args.key]:
                    seed_best[seed][model_id] = item

    final_result_dev = np.zeros((len(seed_result), args.n_models))
    final_result_test = np.zeros((len(seed_result), args.n_models))
    final_result_test2 = np.zeros((len(seed_result), args.n_models))
    logit_file_list = {}
    for seed in seed_result:
        logit_file_list[seed] = []

    # Get the results for each model and pick the best dev trial for each model/seed
    for model_id in range(args.n_models):
        for i, seed in enumerate(seed_result):
            final_result_dev[i][model_id] = seed_best[seed][model_id][args.key]
            final_result_test[i][model_id] = seed_best[seed][model_id][args.test_key]
            if len(args.test_key2) > 0:
                final_result_test2[i][model_id] = seed_best[seed][model_id][args.test_key2]
            logit_file_list[seed].append(
                "{}-{}-{}.npy".format(condition['task_name'], model_id, seed_best[seed][model_id]["array_id"])
            )
        s = "Model %d | val: mean +- std: %.1f +- %.1f | test: mean +- std: %.1f (%.1f) (median %.1f)" % (
            model_id,
            final_result_dev[:, model_id].mean() * 100,
            final_result_dev[:, model_id].std() * 100,
            final_result_test[:, model_id].mean() * 100,
            final_result_test[:, model_id].std() * 100,
            np.median(final_result_test[:, model_id]) * 100,
        )
        if len(args.test_key2) > 0:
            s += " / %.1f +- %.1f (median %.1f)" % (
                final_result_test2[:, model_id].mean() * 100,
                final_result_test2[:, model_id].std() * 100,
                np.median(final_result_test2[:, model_id]) * 100,
            )
        print(s)

    # Map lower-case names to official names (data folder name)
    data_dir_mapping = {
        'cola': 'CoLA',
        'mrpc': 'MRPC',
        'qqp': 'QQP',
        'sts-b': 'STS-B',
        'sst-2': 'SST-2',
        'snli': 'SNLI',
        'mnli': 'MNLI',
        'mnli-mm': 'MNLI',
        'rte': 'RTE',
        'ag_news': 'ag_news',
        'yahoo_answers': 'yahoo_answers',
        'yelp_review_full': 'yelp_review_full',
        'sst-5': 'sst-5',
        'mr': 'mr',
        'cr': 'cr',
        'mpqa': 'mpqa',
        'subj': 'subj',
        'trec': 'trec',
    }

    tokenizer = AutoTokenizer.from_pretrained('roberta-large')
    ensemble_result = np.zeros((len(seed_result)))
    ensemble_result2 = np.zeros((len(seed_result)))  # for second metric

    # Ensemble for each seed
    for seed_id, seed in enumerate(seed_result):
        labels = get_labels(args.data_dir, args.k, seed, condition['task_name'], data_dir_mapping[condition['task_name']])

        # Logits
        mean_logits = None
        for fname in logit_file_list[seed]:
            logits = np.load(os.path.join(args.save_logit_dir, fname))
            if mean_logits is None:
                mean_logits = logits
            else:
                mean_logits += logits
        mean_logits /= len(logit_file_list[seed])

        # Compute metrics
        preds = mean_logits.argmax(-1)
        if condition['task_name'] in ['sst-5', 'mr', 'cr', 'mpqa', 'subj', 'trec']:
            metric = {"acc": simple_accuracy(preds, labels)}
        else:
            metric = glue_compute_metrics(condition['task_name'], preds, labels)
        ensemble_result[seed_id] = metric[args.test_key.split('_')[-1]]
        if len(args.test_key2) > 0:
            ensemble_result2[seed_id] = metric[args.test_key2.split('_')[-1]]

    s = "mean +- std: %.1f (%.1f) (median %.1f)" % (
        ensemble_result.mean() * 100,
        ensemble_result.std() * 100,
        np.median(ensemble_result) * 100,
    )
    if len(args.test_key2) > 0:
        s += " / %.1f (%.1f) (median %.1f)" % (
            ensemble_result2.mean() * 100,
            ensemble_result2.std() * 100,
            np.median(ensemble_result2) * 100,
        )
    print(s)
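# A numpy-only sketch of the ensembling step above: average the per-model logits and score
# the averaged predictions. The array contents are made up; in the script above the logits
# come from .npy files in args.save_logit_dir. Assumes glue_compute_metrics is available.
import numpy as np
from transformers import glue_compute_metrics

model_logits = [
    np.array([[0.2, 0.8], [0.6, 0.4], [0.7, 0.3]]),  # model 0
    np.array([[0.4, 0.6], [0.3, 0.7], [0.8, 0.2]]),  # model 1
]
labels = np.array([1, 0, 0])

mean_logits = sum(model_logits) / len(model_logits)
preds = mean_logits.argmax(-1)
print(glue_compute_metrics("sst-2", preds, labels))  # accuracy of the ensembled predictions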
def compute_metrics(p: EvalPrediction) -> Dict:
    return glue_compute_metrics(data_args.task_name, output_mode, p.predictions, p.label_ids)
def compute_metrics_fn(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    return glue_compute_metrics(task_name, preds, p.label_ids)
def evaluate(args, model, task, tokenizer, accuracy_matrix, train_task_num, current_task_num, log_matrix, prefix=""):
    eval_dataset = load_examples(args, task, tokenizer, evaluate=True)

    if not os.path.exists(os.path.join(args.output_dir, prefix)):
        os.makedirs(os.path.join(args.output_dir, prefix))

    eval_sampler = RandomSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation:: Task : {}, Prefix : {} *****".format(task, prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    indexes = None
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[1],
                "attention_mask": batch[2],
                "token_type_ids": batch[3],
                "labels": batch[4],
            }
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]
            eval_loss += tmp_eval_loss.item()
        nb_eval_steps += 1

        if preds is None:
            preds = logits.detach().cpu().numpy()
            indexes = batch[0].detach().cpu().numpy()
            out_label_ids = inputs["labels"].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            indexes = np.append(indexes, batch[0].detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps

    if task == "ner" or task == "pos":
        results = defaultdict()
        preds = np.argmax(preds, axis=2)
        pad_token_label_id = CrossEntropyLoss().ignore_index
        tags_vals = task_processors[task]().get_labels()
        label_map = {i: label for i, label in enumerate(tags_vals)}

        index_list = []
        out_label_list = [[] for _ in range(out_label_ids.shape[0])]
        preds_list = [[] for _ in range(out_label_ids.shape[0])]

        for i in range(out_label_ids.shape[0]):
            index_list.append(indexes[i])
            for j in range(out_label_ids.shape[1]):
                if out_label_ids[i, j] != pad_token_label_id:
                    out_label_list[i].append(label_map[out_label_ids[i][j]])
                    preds_list[i].append(label_map[preds[i][j]])

        if task == "ner":
            results = {
                "loss": eval_loss,
                "acc": accuracy_score(out_label_list, preds_list),
                "precision": precision_score(out_label_list, preds_list),
                "recall": recall_score(out_label_list, preds_list),
                "f1": f1_score(out_label_list, preds_list),
                "classification report": classification_report(out_label_list, preds_list),
            }
            sequence_lens = []
            if train_task_num == 0:
                eval_result_file = os.path.join(args.output_dir, "eval_results_" + str(task) + "1" + ".txt")
            else:
                eval_result_file = os.path.join(args.output_dir, "eval_results_" + str(task) + "2" + ".txt")
            with open(eval_result_file, "w") as writer:
                writer.write("index\tlabel\tprediction\n")
                for index1, (item0, item1, item2) in enumerate(zip(index_list, out_label_list, preds_list)):
                    for index2, (label, pred) in enumerate(zip(item1, item2)):
                        if label != pad_token_label_id:
                            if index2 == 0:
                                writer.write(f"{item0}\t{label}\t{pred}\n")
                            else:
                                writer.write(f"\t{label}\t{pred}\n")
                    sequence_lens.append(index2 + 1)
                    writer.write("\n")
                writer.write(str(sum(sequence_lens) / len(sequence_lens)))
                writer.close()
        elif task == "pos":
            results = {
                "loss": eval_loss,
                "acc": accuracy_score(out_label_list, preds_list),
                "precision": precision_score(out_label_list, preds_list),
                "recall": recall_score(out_label_list, preds_list),
                "f1": f1_score(out_label_list, preds_list),
            }
        result = results["acc"]
    else:
        preds = np.argmax(preds, axis=1)
        results = {}
        result = glue_compute_metrics(task, preds, out_label_ids)
        results.update(result)

        # Log evaluation result for the first task CoLA
        if task == "cola":
            # index_list = []
            # for i in range(out_label_ids.shape[0]):
            #     index_list.append(indexes[i])
            tags_vals = task_processors[task]().get_labels()
            label_map = {}
            for i, label in enumerate(tags_vals):
                label_map[label] = i
            if train_task_num == 0:
                eval_result_file = os.path.join(args.output_dir, "eval_results_" + str(task) + "1" + ".txt")
            else:
                eval_result_file = os.path.join(args.output_dir, "eval_results_" + str(task) + "2" + ".txt")
            with open(eval_result_file, "w") as writer:
                writer.write("index\tlabel\tprediction\n")
                for index, (item0, item1, item2) in enumerate(zip(indexes, out_label_ids, preds)):
                    item1 = label_map[str(item1)]
                    item2 = label_map[str(item2)]
                    writer.write(f"{item0}\t{item1}\t{item2}\n")
                writer.close()

        if task == 'cola':
            result = result['mcc']
        else:
            result = result['acc']

    logger.info("***** Eval results {} {}*****".format(prefix, task))
    for key in sorted(results.keys()):
        logger.info("  %s = %s", key, str(results[key]))

    if log_matrix:
        accuracy_matrix[train_task_num][current_task_num] = format(result, ".7f")

    return results, accuracy_matrix, result
def _compute_glue_metrics(self, task_name):
    return lambda p: glue_compute_metrics(task_name, np.argmax(p.predictions, axis=1), p.label_ids)
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    try:
        num_labels = glue_tasks_num_labels[data_args.task_name]
        output_mode = glue_output_modes[data_args.task_name]
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    tui_ids = None
    if model_args.umls:
        tui_ids = create_cui_dict(voc_updated=model_args.med_document, tokenizer=tokenizer)

    # Get datasets
    train_dataset = (
        GlueDataset(data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir, tui=tui_ids)
        if training_args.do_train
        else None
    )
    eval_dataset = (
        GlueDataset(data_args, tokenizer=tokenizer, mode="dev", cache_dir=model_args.cache_dir)
        if training_args.do_eval
        else None
    )
    test_dataset = (
        GlueDataset(data_args, tokenizer=tokenizer, mode="test", cache_dir=model_args.cache_dir)
        if training_args.do_predict
        else None
    )

    def build_compute_metrics_fn(task_name: str) -> Callable[[EvalPrediction], Dict]:
        def compute_metrics_fn(p: EvalPrediction):
            if output_mode == "classification":
                preds = np.argmax(p.predictions, axis=1)
            elif output_mode == "regression":
                preds = np.squeeze(p.predictions)
            return glue_compute_metrics(task_name, preds, p.label_ids)

        return compute_metrics_fn

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=build_compute_metrics_fn(data_args.task_name),
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        eval_datasets = [eval_dataset]
        if data_args.task_name == "mnli":
            pass
            # mnli_mm_data_args = dataclasses.replace(data_args, task_name="mnli-mm")
            # eval_datasets.append(
            #     GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, mode="dev", cache_dir=model_args.cache_dir)
            # )

        for eval_dataset in eval_datasets:
            trainer.compute_metrics = build_compute_metrics_fn(eval_dataset.args.task_name)
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            output_eval_file = os.path.join(
                training_args.output_dir, f"eval_results_{eval_dataset.args.task_name}.txt"
            )
            if trainer.is_world_master():
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results {} *****".format(eval_dataset.args.task_name))
                    for key, value in eval_result.items():
                        logger.info("  %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))

            eval_results.update(eval_result)

    labels_id = []
    if training_args.do_predict:
        logging.info("*** Test ***")
        test_datasets = [test_dataset]
        if data_args.task_name == "mnli":
            pass
            # mnli_mm_data_args = dataclasses.replace(data_args, task_name="mnli-mm")
            # test_datasets.append(
            #     GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, mode="test", cache_dir=model_args.cache_dir)
            # )

        for test_dataset in test_datasets:
            predictions = trainer.predict(test_dataset=test_dataset).predictions
            if output_mode == "classification":
                predictions = np.argmax(predictions, axis=1)

            for i in range(0, len(test_dataset.features)):
                labels_id.append(test_dataset.features[i].label)

            metric = glue_compute_metrics(data_args.task_name, predictions, labels_id)
            output_test_file = os.path.join(
                training_args.output_dir, f"test_results_{test_dataset.args.task_name}_metric.txt"
            )
            if trainer.is_world_master():
                with open(output_test_file, "w") as writer:
                    logger.info("***** test results {} *****".format(test_dataset.args.task_name))
                    for key, value in metric.items():
                        logger.info("  %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))
            eval_results.update(eval_result)

            output_test_file = os.path.join(
                training_args.output_dir, f"test_results_{test_dataset.args.task_name}.txt"
            )
            if trainer.is_world_master():
                with open(output_test_file, "w") as writer:
                    logger.info("***** Test results {} *****".format(test_dataset.args.task_name))
                    writer.write("index\tprediction\n")
                    for index, item in enumerate(predictions):
                        if output_mode == "regression":
                            writer.write("%d\t%3.3f\n" % (index, item))
                        else:
                            item = test_dataset.get_labels()[item]
                            writer.write("%d\t%s\n" % (index, item))

    return eval_results
def compute_metrics_fn(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    return glue_compute_metrics("classification", preds, p.label_ids)
def compute_metrics(p: EvalPrediction) -> Dict:
    preds = np.argmax(p.predictions, axis=1)
    return glue_compute_metrics(data_args.task_name, preds, p.label_ids)
        # model.zero_grad()
        losses.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        model.zero_grad()
        # global_step += 1

        # ------
        # Record
        # ------
        preds = logits.data.cpu().numpy()
        preds = np.argmax(preds, axis=1)
        out_label_ids = inputs["labels"].data.cpu().numpy()
        result = glue_compute_metrics(task_name, preds, out_label_ids)  # ['acc', 'f1', 'acc_and_f1']
        if recorder is not None:
            recorder.update(losses.item(), acc=[result['acc_and_f1']], batch_size=args.train_batch_size, is_train=True)
            recorder.print_training_result(batch_idx=step, n_batch=len(train_dataloader))
        else:
            train_loss += losses.item()
            progress_bar(step, len(train_dataloader), "Loss: %.3f" % (train_loss / (step + 1)))

    result = evaluate(task_name, model, eval_dataloader, model_type)
    print(result)

    if recorder is not None:
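# A tiny runnable sketch of the update order used in the fragment above (backward ->
# clip_grad_norm_ -> optimizer.step() -> scheduler.step() -> zero_grad()) on a toy linear
# model. The optimizer, scheduler, and max_norm=1.0 are arbitrary choices, not the values
# from the original training script.
import torch
from torch import nn

model = nn.Linear(4, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda step: 0.95 ** step)

inputs = torch.randn(8, 4)
labels = torch.randint(0, 2, (8,))
loss = nn.CrossEntropyLoss()(model(inputs), labels)

loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step()
scheduler.step()  # update the learning rate schedule after the optimizer step
model.zero_grad()
print(loss.item(), scheduler.get_last_lr())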