def run_finetuning_single_task(model_args, data_args, training_args,
                               last_checkpoint=None):
    """Train, evaluate, and save results on a single task."""

    datasets = init_datasets_task(data_args, training_args)
    is_regression, label_list, num_labels = get_labels(datasets, data_args)
    logging.info(f"Training {data_args.task_name} with {num_labels} labels")

    # For finetuning we need to add the number of labels and the task name
    # to the config kwargs.
    extra_config_kwargs = dict(
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
    )
    config = init_config(model_args, extra_config_kwargs=extra_config_kwargs)
    tokenizer = init_tokenizer(model_args)
    model = init_model(model_args, config, tokenizer, finetuning=True)

    # Tokenize and preprocess the datasets for downstream tasks.
    # TODO: load from cached tokenized datasets for finetuning as well
    logging.info("Tokenizing datasets for finetuning ...")
    tokenized_datasets = preprocess_datasets_task(
        datasets, tokenizer, data_args, model, num_labels, label_list,
        is_regression
    )

    # Separate into train, eval and test
    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets[
        "validation_matched" if data_args.task_name == "mnli" else "validation"
    ]
    test_dataset = None
    if ((data_args.task_name is not None or data_args.test_file is not None)
            and training_args.do_predict):
        test_dataset = tokenized_datasets[
            "test_matched" if data_args.task_name == "mnli" else "test"
        ]

    # Log fingerprint used in HF smart caching
    logging.info(f"Dataset fingerprint: {train_dataset._fingerprint}")

    # Data collator will default to DataCollatorWithPadding,
    # so we change it if we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
    else:
        data_collator = None

    # Train
    trainer = init_trainer(
        tokenizer=tokenizer,
        data_collator=data_collator,
        training_args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        # Reuse the model initialized above (with finetuning=True) so that the
        # model.to("cpu") cleanup at the end acts on the trained model.
        model=model,
        trainer_callbacks=model_args.trainer_callbacks or None,
        finetuning=True,
        task_name=data_args.task_name,
        is_regression=is_regression,
    )
    if training_args.do_train:
        train(trainer, training_args.output_dir, last_checkpoint)

    # Evaluate
    eval_results = {}
    if training_args.do_eval:
        logging.info("*** Evaluate ***")

        # Handle special case of extra validation dataset for MNLI
        tasks = [data_args.task_name]
        eval_datasets = [eval_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            eval_datasets.append(tokenized_datasets["validation_mismatched"])

        eval_results = evaluate_tasks(
            trainer, training_args.output_dir, tasks, eval_datasets
        )

    # Test/Predict
    if training_args.do_predict:
        logging.info("*** Test ***")

        # Handle special case of extra test dataset for MNLI
        tasks = [data_args.task_name]
        test_datasets = [test_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            test_datasets.append(tokenized_datasets["test_mismatched"])

        test_tasks(trainer, training_args.output_dir, tasks, test_datasets,
                   is_regression, label_list)

    # There is an existing issue when training multiple models in sequence with
    # this code: the model leaks memory, so a small amount of GPU memory remains
    # allocated after each run and accumulates over several runs. It fails with
    # OOM after about 20 runs, even when all tensors on the GPU are explicitly
    # deleted, garbage is collected and the cache is cleared. Multiple solutions
    # were tried, but this weird little hack is the only thing that worked.
    # (One possible multi-run driver with explicit cleanup between runs is
    # sketched after this function.)
    model.to("cpu")

    return eval_results
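# Illustrative sketch, not part of the pipeline above: one way to drive
# run_finetuning_single_task over several tasks in sequence while reclaiming GPU
# memory between runs. The explicit gc/empty_cache calls mirror the cleanup the
# comment above refers to; the helper name and the task_names argument are
# assumptions for this example only.
def run_finetuning_multiple_tasks_example(model_args, data_args, training_args,
                                          task_names):
    import copy
    import gc

    import torch

    all_results = {}
    for task_name in task_names:
        task_data_args = copy.deepcopy(data_args)
        task_data_args.task_name = task_name
        all_results[task_name] = run_finetuning_single_task(
            model_args, task_data_args, training_args
        )
        # Reclaim whatever GPU memory can be freed before the next run starts.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    return all_results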
def run_finetuning_single_task(
    model_args, data_args, training_args, last_checkpoint=None, run_idx=None,
):
    """Train, evaluate, and save results on a single task."""
    # TODO
    # accept run# as an argument for finetuning with multiple runs on a single task
    # update the save directory to include run#

    tokenizer, data_collator, train_dataset, eval_dataset, test_dataset, model, \
        is_regression, tokenized_datasets, label_list, config = \
        init_dataset_for_finetuning(
            model_args, data_args, training_args, last_checkpoint
        )

    # Code safety
    check_eval_and_max_steps(training_args, train_dataset)
    training_args = check_best_metric(training_args, data_args.task_name)

    # Update where the model is saved for each run
    training_args = update_run_number(training_args, run_idx)

    # Train
    trainer = init_trainer(
        tokenizer=tokenizer,
        data_collator=data_collator,
        training_args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        model=model,
        trainer_callbacks=model_args.trainer_callbacks or None,
        finetuning=True,
        task_name=data_args.task_name,
        is_regression=is_regression,
    )
    if training_args.do_train:
        train(trainer, training_args.output_dir, training_args.rm_checkpoints,
              last_checkpoint)

    # Evaluate
    eval_results = {}
    if training_args.do_eval:
        eval_results = evaluate_tasks_handler(
            trainer, data_args, model_args, training_args, eval_dataset,
            tokenized_datasets
        )

    # Test/Predict
    if training_args.do_predict:
        logging.info("*** Test ***")

        # Handle special case of extra test dataset for MNLI
        tasks = [data_args.task_name]
        test_datasets = [test_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            test_datasets.append(tokenized_datasets["test_mismatched"])

        test_tasks(trainer, training_args.output_dir, tasks, test_datasets,
                   is_regression, label_list)

    # TODO
    # Remove any unnecessary checkpoints to reduce space demands
    # (one possible approach is sketched after this function)
    if training_args.load_best_model_at_end:
        # Find the best model checkpoint and delete the rest
        pass

    # There is an existing issue when training multiple models in sequence with
    # this code: the model leaks memory, so a small amount of GPU memory remains
    # allocated after each run and accumulates over several runs. It fails with
    # OOM after about 20 runs, even when all tensors on the GPU are explicitly
    # deleted, garbage is collected and the cache is cleared. Multiple solutions
    # were tried, but this weird little hack is the only thing that worked.
    model.to("cpu")

    return eval_results
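# Illustrative sketch for the checkpoint-cleanup TODO in the function above; it
# is an assumption, not the project's implementation. With load_best_model_at_end
# the HF Trainer records the winning checkpoint in
# trainer.state.best_model_checkpoint, so every other "checkpoint-*" directory
# under output_dir can be removed to reduce space demands.
def remove_non_best_checkpoints_example(trainer, output_dir):
    import glob
    import os
    import shutil

    best_checkpoint = trainer.state.best_model_checkpoint
    if best_checkpoint is None:
        # Nothing recorded (e.g. no eval ran); leave checkpoints untouched.
        return
    for checkpoint_dir in glob.glob(os.path.join(output_dir, "checkpoint-*")):
        # Keep only the best checkpoint; delete the rest.
        if os.path.abspath(checkpoint_dir) != os.path.abspath(best_checkpoint):
            shutil.rmtree(checkpoint_dir, ignore_errors=True)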
def run_pretraining(model_args, data_args, training_args, last_checkpoint=None):
    """Pretrain and evaluate a language model."""

    logging.info("Pre-training a masked language model.")

    datasets, tokenized_datasets, dataset_path = init_datasets_mlm(data_args)

    config = init_config(model_args)
    tokenizer = init_tokenizer(model_args)

    if tokenized_datasets is None:
        # Tokenize and preprocess the datasets for language modeling
        if training_args.do_train:
            column_names = datasets["train"].column_names
        else:
            column_names = datasets["validation"].column_names
        text_column_name = "text" if "text" in column_names else column_names[0]

        logging.info("Tokenizing datasets for pretraining ...")
        tokenized_datasets = preprocess_datasets_mlm(
            datasets, tokenizer, data_args, column_names, text_column_name
        )

        # Save only if a dataset_path has been defined in the previous steps;
        # that will be True only when loading from the dataset hub.
        if data_args.save_tokenized_data and dataset_path is not None:
            logging.info(f"Saving tokenized dataset to {dataset_path}")
            tokenized_datasets.save_to_disk(dataset_path)

    # Separate into train and eval
    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["validation"]

    # Log fingerprint used in HF smart caching
    logging.info(f"Dataset fingerprint: {train_dataset._fingerprint}")

    # Data collator will take care of randomly masking the tokens; the collator
    # class is defined in the experiment config.
    # (An illustrative lookup is sketched after this function.)
    assert hasattr(transformers, data_args.data_collator), \
        f"Data collator {data_args.data_collator} not available"
    data_collator = getattr(transformers, data_args.data_collator)(
        tokenizer=tokenizer, mlm_probability=data_args.mlm_probability
    )

    # Run hp search or regular training
    if model_args.hp_num_trials >= 1:
        run_hyperparameter_search(
            model_args=model_args,
            config=config,
            tokenizer=tokenizer,
            data_collator=data_collator,
            training_args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
        )
    else:
        trainer = init_trainer(
            tokenizer=tokenizer,
            data_collator=data_collator,
            training_args=training_args,
            train_dataset=train_dataset if training_args.do_train else None,
            eval_dataset=eval_dataset if training_args.do_eval else None,
            model=init_model(model_args, config, tokenizer),
            trainer_class=model_args.trainer_class,
            trainer_callbacks=model_args.trainer_callbacks or None,
        )
        if training_args.do_train:
            train(trainer, training_args.output_dir, last_checkpoint)

        # Evaluate on the full eval dataset.
        # If using hp search, load the best model before running evaluate.
        if training_args.do_eval:
            logging.info("*** Evaluate ***")
            evaluate_language_model(trainer, eval_dataset,
                                    training_args.output_dir)
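# Illustrative sketch of the data-collator lookup performed in run_pretraining;
# the collator name and masking probability below are example values, not the
# project's defaults. For MLM pretraining the config typically names
# DataCollatorForLanguageModeling, which randomly masks mlm_probability of the
# tokens in each batch.
def build_mlm_data_collator_example(tokenizer,
                                    collator_name="DataCollatorForLanguageModeling",
                                    mlm_probability=0.15):
    import transformers

    assert hasattr(transformers, collator_name), \
        f"Data collator {collator_name} not available"
    collator_cls = getattr(transformers, collator_name)
    # Resolve the class by name, exactly as run_pretraining does, and build it
    # with the tokenizer and masking probability from the experiment config.
    return collator_cls(tokenizer=tokenizer, mlm_probability=mlm_probability)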
def run_finetuning_squad(
    model_args, data_args, training_args, last_checkpoint=None, run_idx=None,
):
    """Train, evaluate, and save results on a single task."""

    # Make sure dataset name, task name, and version_2_with_negative
    # match before loading the dataset
    data_args = check_squad_version(data_args)

    data_init = init_dataset_for_squad(model_args, data_args, training_args,
                                       last_checkpoint)
    tokenizer = data_init[0]
    data_collator = data_init[1]
    train_dataset = data_init[2]
    eval_dataset = data_init[3]
    eval_examples = data_init[4]
    model = data_init[5]
    answer_column_name = data_init[6]

    # Code safety
    check_eval_and_max_steps(training_args, train_dataset)

    # Pass dataset_name instead of task_name for the special case of squad:
    # squad and squad_v2 have different metrics and datasets, but the same task_name
    training_args = check_best_metric(training_args, data_args.dataset_name)

    # Post-processing: we match the start logits and end logits to answers in
    # the original context. (The resulting prediction/reference format is
    # illustrated after this function.)
    def post_processing_function(examples, features, predictions, stage="eval"):
        if data_args.beam_search:
            predictions, scores_diff_json = \
                postprocess_qa_predictions_with_beam_search(
                    examples=examples,
                    features=features,
                    predictions=predictions,
                    version_2_with_negative=data_args.version_2_with_negative,
                    n_best_size=data_args.n_best_size,
                    max_answer_length=data_args.max_answer_length,
                    start_n_top=model.config.start_n_top,
                    end_n_top=model.config.end_n_top,
                    output_dir=training_args.output_dir,
                    # log_level=log_level,
                    prefix=stage,
                )
        else:
            predictions = postprocess_qa_predictions(
                examples=examples,
                features=features,
                predictions=predictions,
                version_2_with_negative=data_args.version_2_with_negative,
                n_best_size=data_args.n_best_size,
                max_answer_length=data_args.max_answer_length,
                output_dir=training_args.output_dir,
                prefix=stage,
            )

        if data_args.version_2_with_negative:
            if data_args.beam_search:
                formatted_predictions = [
                    {"id": k, "prediction_text": v,
                     "no_answer_probability": scores_diff_json[k]}
                    for k, v in predictions.items()
                ]
            else:
                formatted_predictions = [
                    {"id": k, "prediction_text": v, "no_answer_probability": 0.0}
                    for k, v in predictions.items()
                ]
        else:
            formatted_predictions = [
                {"id": k, "prediction_text": v}
                for k, v in predictions.items()
            ]

        references = [
            {"id": ex["id"], "answers": ex[answer_column_name]}
            for ex in examples
        ]
        return EvalPrediction(predictions=formatted_predictions,
                              label_ids=references)

    # Update where the model is saved for each run
    training_args = update_run_number(training_args, run_idx)
    training_args.trainer_class = QuestionAnsweringTrainer

    trainer_kwargs = dict(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        eval_examples=eval_examples,
        data_collator=data_collator,
        post_process_function=post_processing_function,
        callbacks=model_args.trainer_callbacks or None,
    )

    # Train
    trainer = init_squad_trainer(trainer_kwargs, data_args,
                                 training_args.trainer_class,
                                 model_args.trainer_callbacks)
    if training_args.do_train:
        # Note: rm_checkpoints=True means one model will be saved in the
        # output_dir, and all checkpoint subdirectories will be deleted
        # when train() is called.
        train(trainer, training_args.output_dir, training_args.rm_checkpoints,
              last_checkpoint)

    # Evaluate
    eval_results = {}
    if training_args.do_eval:
        eval_results = evaluate_task_handler(trainer, data_args, model_args,
                                             training_args, eval_dataset)

    if training_args.do_predict:
        raise NotImplementedError(
            "Storing test results for squad not yet implemented")

    # There is an existing issue when training multiple models in sequence with
    # this code: the model leaks memory, so a small amount of GPU memory remains
    # allocated after each run and accumulates over several runs. It fails with
    # OOM after about 20 runs, even when all tensors on the GPU are explicitly
    # deleted, garbage is collected and the cache is cleared. Multiple solutions
    # were tried, but this weird little hack is the only thing that worked.
    model.to("cpu")

    return eval_results
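# Illustrative sketch, assuming the standalone `evaluate` library rather than
# this project's metric plumbing: it shows the prediction/reference format that
# post_processing_function produces above and how the SQuAD metrics consume it.
def compute_squad_metrics_example(formatted_predictions, references,
                                  version_2_with_negative=True):
    import evaluate

    # squad_v2 expects a "no_answer_probability" field in each prediction;
    # plain squad expects only "id" and "prediction_text".
    metric = evaluate.load("squad_v2" if version_2_with_negative else "squad")
    # formatted_predictions: [{"id": ..., "prediction_text": ...,
    #                          "no_answer_probability": ...}, ...]
    # references: [{"id": ..., "answers": {"text": [...], "answer_start": [...]}},
    #              ...]
    return metric.compute(predictions=formatted_predictions,
                          references=references)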