def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is "
            "not empty. Use --overwrite_output_dir to overcome.")

    # model_type = model_args.model_type
    # log_dir = './results'
    # if model_type == 'base':
    #     model_args.model_name_or_path = 'bert-base-uncased'
    # elif model_type == 'base-pubmed':
    #     model_args.model_name_or_path = 'bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12'
    # elif model_type == 'base-pubmed-mimic':
    #     model_args.model_name_or_path = 'bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12'
    # else:
    #     raise NotImplementedError

    # Setup logging
    logging.basicConfig(
        format='[%(asctime)s - %(levelname)s - %(filename)s: %(lineno)d (%(funcName)s)] %(message)s',
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )

    # Set seed
    set_seed(training_args.seed)

    try:
        num_labels = glue_tasks_num_labels[data_args.task_name]
        output_mode = glue_output_modes[data_args.task_name]
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    dataset_name = data_args.data_dir.split('/')[-1]
    if dataset_name in ['GAD', 'EUADR']:
        # GAD and EUADR come as ten cross-validation folds stored in
        # subdirectories named 1..10; train and evaluate on each fold in turn
        # and aggregate the per-fold results at the end.
        final_split_results = []
        original_data_dir = copy.deepcopy(x=data_args.data_dir)
        data_splits = list(map(str, range(1, 11)))
        for split in data_splits:
            data_args.data_dir = os.path.join(original_data_dir, split)

            # Get datasets
            train_dataset = (GlueDataset(data_args,
                                         tokenizer=tokenizer,
                                         cache_dir=model_args.cache_dir)
                             if training_args.do_train else None)
            eval_dataset = (GlueDataset(data_args,
                                        tokenizer=tokenizer,
                                        mode="dev",
                                        cache_dir=model_args.cache_dir)
                            if training_args.do_eval else None)
            test_dataset = (GlueDataset(data_args,
                                        tokenizer=tokenizer,
                                        mode="test",
                                        cache_dir=model_args.cache_dir)
                            if training_args.do_predict else None)

            # Load pretrained model
            # Distributed training:
            # The .from_pretrained methods guarantee that only one local process can concurrently
            # download model & vocab.
            # Currently, this code does not support distributed training.
            # Linear warmup over a fixed proportion of the (approximate) total
            # number of optimization steps for this fold.
            training_args.warmup_steps = int(
                model_args.warmup_proportion *
                (len(train_dataset) / training_args.per_device_train_batch_size) *
                training_args.num_train_epochs)
            training_args.weight_decay = 0.01
            logger.info("Training/evaluation parameters %s", training_args)

            config = AutoConfig.from_pretrained(
                model_args.config_name
                if model_args.config_name else model_args.model_name_or_path,
                num_labels=num_labels,
                finetuning_task=data_args.task_name,
                cache_dir=model_args.cache_dir,
            )
            try:
                model = AutoModelForSequenceClassification.from_pretrained(
                    model_args.model_name_or_path,
                    from_tf=False,
                    config=config,
                    cache_dir=model_args.cache_dir,
                )
            except OSError:
                # No PyTorch weights found; fall back to a TensorFlow checkpoint.
                model = AutoModelForSequenceClassification.from_pretrained(
                    os.path.join(model_args.model_name_or_path,
                                 "model.ckpt.index"),
                    from_tf=True,
                    config=config,
                    cache_dir=model_args.cache_dir,
                )

            def build_compute_metrics_fn(
                    task_name: str) -> Callable[[EvalPrediction], Dict]:
                def compute_metrics_fn(p: EvalPrediction):
                    if output_mode == "classification":
                        preds = np.argmax(p.predictions, axis=1)
                    elif output_mode == "regression":
                        preds = np.squeeze(p.predictions)
                    return glue_compute_metrics(task_name, preds, p.label_ids)

                return compute_metrics_fn

            # Initialize our Trainer
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=eval_dataset,
                compute_metrics=build_compute_metrics_fn(data_args.task_name),
            )

            # Training
            if training_args.do_train:
                training_start_time = time.time()
                trainer.train(
                    model_path=model_args.model_name_or_path
                    if os.path.isdir(model_args.model_name_or_path) else None)
                training_end_time = time.time()
                training_total_time = training_end_time - training_start_time
                trainer.save_model()
                # For convenience, we also re-save the tokenizer to the same directory,
                # so that you can share your model easily on huggingface.co/models =)
                if trainer.is_world_master():
                    tokenizer.save_pretrained(training_args.output_dir)

            # Evaluation
            eval_results = {}
            if training_args.do_eval:
                logger.info("*** Evaluate ***")

                # Loop to handle MNLI double evaluation (matched, mis-matched)
                eval_datasets = [eval_dataset]
                if data_args.task_name == "mnli":
                    mnli_mm_data_args = dataclasses.replace(data_args,
                                                            task_name="mnli-mm")
                    eval_datasets.append(
                        GlueDataset(mnli_mm_data_args,
                                    tokenizer=tokenizer,
                                    mode="dev",
                                    cache_dir=model_args.cache_dir))

                for eval_dataset in eval_datasets:
                    trainer.compute_metrics = build_compute_metrics_fn(
                        eval_dataset.args.task_name)
                    eval_result = trainer.evaluate(eval_dataset=eval_dataset)

                    output_eval_file = os.path.join(
                        training_args.output_dir,
                        f"eval_results_{eval_dataset.args.task_name}.txt")
                    if trainer.is_world_master():
                        with open(output_eval_file, "w") as writer:
                            logger.info("***** Eval results {} *****".format(
                                eval_dataset.args.task_name))
                            for key, value in eval_result.items():
                                logger.info("  %s = %s", key, value)
                                writer.write("%s = %s\n" % (key, value))

                    eval_results.update(eval_result)

            if training_args.do_predict:
                logging.info("*** Test ***")
                test_datasets = [test_dataset]
                if data_args.task_name == "mnli":
                    mnli_mm_data_args = dataclasses.replace(data_args,
                                                            task_name="mnli-mm")
                    test_datasets.append(
                        GlueDataset(mnli_mm_data_args,
                                    tokenizer=tokenizer,
                                    mode="test",
                                    cache_dir=model_args.cache_dir))

                for test_dataset in test_datasets:
                    predictions = trainer.predict(
                        test_dataset=test_dataset).predictions
                    labels = np.array([
                        test_dataset[idx].label
                        for idx in range(len(test_dataset))
                    ])
                    assert len(predictions) == len(labels), (
                        f"len(predictions) = {len(predictions)} =/= "
                        f"len(labels) = {len(labels)}")
                    if output_mode == "classification":
                        predictions = np.argmax(predictions, axis=1)

                    output_test_file = os.path.join(
                        training_args.output_dir,
                        "test_results.txt"
                        # f"test_results_{test_dataset.args.task_name}.txt"
                    )
                    test_results = glue_compute_metrics(task_name='ddi',
                                                        preds=predictions,
                                                        labels=labels)
                    if trainer.is_world_master():
                        with open(output_test_file, "w") as writer:
                            logger.info("***** Test results {} *****".format(
                                test_dataset.args.task_name))
                            logger.info(
                                f"Accuracy: {test_results['acc']}\tMacro F1: {test_results['f1']}"
                            )
                            writer.write("index\tprediction\n")
                            for index, item in enumerate(predictions):
                                if output_mode == "regression":
                                    writer.write("%d\t%3.3f\n" % (index, item))
                                else:
                                    item = test_dataset.get_labels()[item]
                                    writer.write("%d\t%s\n" % (index, item))

                    # training_total_time is set in the do_train block above.
                    training_time_formatted = time.strftime(
                        '%H:%M:%S', time.gmtime(training_total_time))
                    logger.info(f"Total training time: {training_time_formatted}")
                    final_results = copy.deepcopy(x=test_results)
                    final_results['training_time'] = training_time_formatted
                    logger.info(
                        f"F1: {final_results['f1']} | Acc: {final_results['acc']} | "
                        f"Time Elapsed: {final_results['training_time']}")
                    final_split_results.append(final_results)
    else:
        # Single-split datasets: build the datasets once and run a single
        # train/eval/test pass.
        # Get datasets
        train_dataset = (GlueDataset(data_args,
                                     tokenizer=tokenizer,
                                     cache_dir=model_args.cache_dir)
                         if training_args.do_train else None)
        eval_dataset = (GlueDataset(data_args,
                                    tokenizer=tokenizer,
                                    mode="dev",
                                    cache_dir=model_args.cache_dir)
                        if training_args.do_eval else None)
        test_dataset = (GlueDataset(data_args,
                                    tokenizer=tokenizer,
                                    mode="test",
                                    cache_dir=model_args.cache_dir)
                        if training_args.do_predict else None)

        # Load pretrained model
        # Distributed training:
        # The .from_pretrained methods guarantee that only one local process can concurrently
        # download model & vocab.
        # Currently, this code does not support distributed training.
        training_args.warmup_steps = int(
            model_args.warmup_proportion *
            (len(train_dataset) / training_args.per_device_train_batch_size) *
            training_args.num_train_epochs)
        training_args.weight_decay = 0.01
        logger.info("Training/evaluation parameters %s", training_args)

        config = AutoConfig.from_pretrained(
            model_args.config_name
            if model_args.config_name else model_args.model_name_or_path,
            num_labels=num_labels,
            finetuning_task=data_args.task_name,
            cache_dir=model_args.cache_dir,
        )
        try:
            model = AutoModelForSequenceClassification.from_pretrained(
                model_args.model_name_or_path,
                from_tf=False,
                config=config,
                cache_dir=model_args.cache_dir,
            )
        except OSError:
            # No PyTorch weights found; fall back to a TensorFlow checkpoint.
            model = AutoModelForSequenceClassification.from_pretrained(
                os.path.join(model_args.model_name_or_path,
                             "model.ckpt.index"),
                from_tf=True,
                config=config,
                cache_dir=model_args.cache_dir,
            )

        def build_compute_metrics_fn(
                task_name: str) -> Callable[[EvalPrediction], Dict]:
            def compute_metrics_fn(p: EvalPrediction):
                if output_mode == "classification":
                    preds = np.argmax(p.predictions, axis=1)
                elif output_mode == "regression":
                    preds = np.squeeze(p.predictions)
                return glue_compute_metrics(task_name, preds, p.label_ids)

            return compute_metrics_fn

        # Initialize our Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=build_compute_metrics_fn(data_args.task_name),
        )

        # Training
        if training_args.do_train:
            training_start_time = time.time()
            trainer.train(
                model_path=model_args.model_name_or_path
                if os.path.isdir(model_args.model_name_or_path) else None)
            training_end_time = time.time()
            training_total_time = training_end_time - training_start_time
            trainer.save_model()
            # For convenience, we also re-save the tokenizer to the same directory,
            # so that you can share your model easily on huggingface.co/models =)
            if trainer.is_world_master():
                tokenizer.save_pretrained(training_args.output_dir)

        # Evaluation
        eval_results = {}
        if training_args.do_eval:
            logger.info("*** Evaluate ***")

            # Loop to handle MNLI double evaluation (matched, mis-matched)
            eval_datasets = [eval_dataset]
            if data_args.task_name == "mnli":
                mnli_mm_data_args = dataclasses.replace(data_args,
                                                        task_name="mnli-mm")
                eval_datasets.append(
                    GlueDataset(mnli_mm_data_args,
                                tokenizer=tokenizer,
                                mode="dev",
                                cache_dir=model_args.cache_dir))

            for eval_dataset in eval_datasets:
                trainer.compute_metrics = build_compute_metrics_fn(
                    eval_dataset.args.task_name)
                eval_result = trainer.evaluate(eval_dataset=eval_dataset)

                output_eval_file = os.path.join(
                    training_args.output_dir,
                    f"eval_results_{eval_dataset.args.task_name}.txt")
                if trainer.is_world_master():
                    with open(output_eval_file, "w") as writer:
                        logger.info("***** Eval results {} *****".format(
                            eval_dataset.args.task_name))
                        for key, value in eval_result.items():
                            logger.info("  %s = %s", key, value)
                            writer.write("%s = %s\n" % (key, value))

                eval_results.update(eval_result)

        if training_args.do_predict:
            logging.info("*** Test ***")
            test_datasets = [test_dataset]
            if data_args.task_name == "mnli":
                mnli_mm_data_args = dataclasses.replace(data_args,
                                                        task_name="mnli-mm")
                test_datasets.append(
                    GlueDataset(mnli_mm_data_args,
                                tokenizer=tokenizer,
                                mode="test",
                                cache_dir=model_args.cache_dir))

            for test_dataset in test_datasets:
                predictions = trainer.predict(
                    test_dataset=test_dataset).predictions
                labels = np.array([
                    test_dataset[idx].label for idx in range(len(test_dataset))
                ])
                assert len(predictions) == len(labels), (
                    f"len(predictions) = {len(predictions)} =/= "
                    f"len(labels) = {len(labels)}")
                if output_mode == "classification":
                    predictions = np.argmax(predictions, axis=1)

                output_test_file = os.path.join(
                    training_args.output_dir,
                    "test_results.txt"
                    # f"test_results_{test_dataset.args.task_name}.txt"
                )
                test_results = glue_compute_metrics(task_name='ddi',
                                                    preds=predictions,
                                                    labels=labels)
                if trainer.is_world_master():
                    with open(output_test_file, "w") as writer:
                        logger.info("***** Test results {} *****".format(
                            test_dataset.args.task_name))
                        logger.info(
                            f"Accuracy: {test_results['acc']}\tMacro F1: {test_results['f1']}"
                        )
                        writer.write("index\tprediction\n")
                        for index, item in enumerate(predictions):
                            if output_mode == "regression":
                                writer.write("%d\t%3.3f\n" % (index, item))
                            else:
                                item = test_dataset.get_labels()[item]
                                writer.write("%d\t%s\n" % (index, item))

                # training_total_time is set in the do_train block above.
                training_time_formatted = time.strftime(
                    '%H:%M:%S', time.gmtime(training_total_time))
                logger.info(f"Total training time: {training_time_formatted}")
                final_results = copy.deepcopy(x=test_results)
                final_results['training_time'] = training_time_formatted
                logger.info(
                    f"F1: {final_results['f1']} | Acc: {final_results['acc']} | "
                    f"Time Elapsed: {final_results['training_time']}")

    if dataset_name in ['GAD', 'EUADR']:
        # Aggregate the per-fold metrics over the ten cross-validation folds.
        average_f1_scores = np.mean([x['f1'] for x in final_split_results])
        average_acc = np.mean([x['acc'] for x in final_split_results])
        logger.info(
            f"Average F1 Scores: {average_f1_scores} | Average Accuracy: {average_acc}"
        )
        return final_split_results
    else:
        return final_results
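# The script's entry point is not included in this excerpt. Below is a minimal
# sketch of the usual pattern, assuming main() is meant to be invoked directly.
# The script name "run_re.py" and the example flags are illustrative only and
# correspond to arguments the code above actually reads, e.g.:
#   python run_re.py path/to/args.json
# or
#   python run_re.py --model_name_or_path bert-base-uncased --task_name <task> \
#       --data_dir ./data/GAD --output_dir ./results \
#       --do_train --do_eval --do_predict
if __name__ == "__main__":
    main()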