def init_dataset_for_finetuning(model_args, data_args, training_args,
                                last_checkpoint=None):
    datasets = init_datasets_task(data_args, training_args)
    is_regression, label_list, num_labels = get_labels(datasets, data_args)
    logging.info(f"Training {data_args.task_name} with {num_labels} labels")

    # Finetuning requires adding the labels and the task name to the config kwargs
    extra_config_kwargs = dict(
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
    )
    config = init_config(model_args, extra_config_kwargs=extra_config_kwargs)
    tokenizer = init_tokenizer(model_args)
    model = init_model(model_args, config, tokenizer, finetuning=True)
    check_sparsity_callback(model, model_args)

    # Tokenize and preprocess the datasets for downstream tasks
    # TODO: load from cached tokenized datasets for finetuning as well
    logging.info("Tokenizing datasets for finetuning ...")
    tokenized_datasets = preprocess_datasets_task(datasets, tokenizer, data_args,
                                                  model, num_labels, label_list,
                                                  is_regression)

    # Separate into train, eval and test
    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets[
        "validation_matched" if data_args.task_name == "mnli" else "validation"
    ]

    test_dataset = None
    if data_args.task_name is not None or data_args.test_file is not None:
        if training_args.do_predict:
            test_dataset = tokenized_datasets[
                "test_matched" if data_args.task_name == "mnli" else "test"
            ]

    # Log the fingerprint used in HF smart caching
    logging.info(f"Dataset fingerprint: {train_dataset._fingerprint}")

    # Data collator will default to DataCollatorWithPadding,
    # so we change it if we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
    else:
        data_collator = None

    return (tokenizer, data_collator, train_dataset, eval_dataset, test_dataset,
            model, is_regression, tokenized_datasets, label_list, config)
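# Illustrative usage sketch (not part of the original module): how a caller
# might unpack the tuple returned by init_dataset_for_finetuning. The argument
# objects are assumed to be the usual model/data/training dataclasses parsed by
# HfArgumentParser; _example_unpack_finetuning_dataset is a hypothetical helper.
def _example_unpack_finetuning_dataset(model_args, data_args, training_args):
    (tokenizer, data_collator, train_dataset, eval_dataset, test_dataset,
     model, is_regression, tokenized_datasets, label_list, config) = \
        init_dataset_for_finetuning(model_args, data_args, training_args)
    logging.info(f"Loaded {len(train_dataset)} training examples for "
                 f"{data_args.task_name}")
    return train_dataset, eval_dataset, test_dataset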
def run_finetuning_single_task(model_args, data_args, training_args,
                               last_checkpoint=None):
    """Train, evaluate, and save results on a single task."""
    datasets = init_datasets_task(data_args, training_args)
    is_regression, label_list, num_labels = get_labels(datasets, data_args)
    logging.info(f"Training {data_args.task_name} with {num_labels} labels")

    # Finetuning requires adding the labels and the task name to the config kwargs
    extra_config_kwargs = dict(
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
    )
    config = init_config(model_args, extra_config_kwargs=extra_config_kwargs)
    tokenizer = init_tokenizer(model_args)
    model = init_model(model_args, config, tokenizer, finetuning=True)

    # Tokenize and preprocess the datasets for downstream tasks
    # TODO: load from cached tokenized datasets for finetuning as well
    logging.info("Tokenizing datasets for finetuning ...")
    tokenized_datasets = preprocess_datasets_task(datasets, tokenizer, data_args,
                                                  model, num_labels, label_list,
                                                  is_regression)

    # Separate into train, eval and test
    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets[
        "validation_matched" if data_args.task_name == "mnli" else "validation"
    ]

    test_dataset = None
    if ((data_args.task_name is not None or data_args.test_file is not None)
            and training_args.do_predict):
        test_dataset = tokenized_datasets[
            "test_matched" if data_args.task_name == "mnli" else "test"
        ]

    # Log the fingerprint used in HF smart caching
    logging.info(f"Dataset fingerprint: {train_dataset._fingerprint}")

    # Data collator will default to DataCollatorWithPadding,
    # so we change it if we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
    else:
        data_collator = None

    # Train
    trainer = init_trainer(
        tokenizer=tokenizer,
        data_collator=data_collator,
        training_args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        model=init_model(model_args, config, tokenizer),
        trainer_callbacks=model_args.trainer_callbacks or None,
        finetuning=True,
        task_name=data_args.task_name,
        is_regression=is_regression,
    )
    if training_args.do_train:
        train(trainer, training_args.output_dir, last_checkpoint)

    # Evaluate
    eval_results = {}
    if training_args.do_eval:
        logging.info("*** Evaluate ***")

        # Handle the special case of the extra validation dataset for MNLI
        tasks = [data_args.task_name]
        eval_datasets = [eval_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            eval_datasets.append(tokenized_datasets["validation_mismatched"])

        eval_results = evaluate_tasks(trainer, training_args.output_dir, tasks,
                                      eval_datasets)

    # Test/Predict
    if training_args.do_predict:
        logging.info("*** Test ***")

        # Handle the special case of the extra test dataset for MNLI
        tasks = [data_args.task_name]
        test_datasets = [test_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            test_datasets.append(tokenized_datasets["test_mismatched"])

        test_tasks(trainer, training_args.output_dir, tasks, test_datasets,
                   is_regression, label_list)

    # There is a known issue when training multiple models in sequence with this
    # code: the model leaks memory, and a small amount of GPU memory remains
    # after each run and accumulates across runs. Training fails with OOM after
    # about 20 runs, even when all tensors on the GPU are explicitly deleted,
    # garbage is collected and the cache is cleared. Multiple solutions were
    # tried, but this weird little hack is the only thing that worked.
    model.to("cpu")

    return eval_results
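# Illustrative sketch (assumption, not from the original module): running
# several finetuning tasks back to back, the scenario the memory-leak comment
# above refers to. `task_configs` and `build_args_for_task` are hypothetical;
# only run_finetuning_single_task comes from this module.
def _example_run_tasks_in_sequence(task_configs, build_args_for_task):
    import gc

    import torch

    all_results = {}
    for task_config in task_configs:
        model_args, data_args, training_args = build_args_for_task(task_config)
        all_results[data_args.task_name] = run_finetuning_single_task(
            model_args, data_args, training_args)
        # Release as much GPU memory as possible before the next task starts.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    return all_results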
def init_dataset_for_finetuning(model_args, data_args, training_args,
                                last_checkpoint=None):
    # TODO: edit multi_eval_sets so you can gather not just multiple eval sets
    # for a single task, but eval sets from multiple tasks
    datasets = init_datasets_task(data_args, training_args)
    is_regression, label_list, num_labels = get_labels(datasets, data_args)
    logging.info(f"Training {data_args.task_name} with {num_labels} labels")

    # Finetuning requires adding the labels and the task name to the config kwargs
    extra_config_kwargs = dict(
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
    )
    config = init_config(model_args, extra_config_kwargs=extra_config_kwargs)
    tokenizer = init_tokenizer(model_args)
    model = init_model(model_args, config, tokenizer, finetuning=True)
    check_sparsity_callback(model, model_args)
    check_mnli(model_args, data_args.task_name)

    # Tokenize and preprocess the datasets for downstream tasks
    # TODO: load from cached tokenized datasets for finetuning as well
    logging.info("Tokenizing datasets for finetuning ...")
    tokenized_datasets = preprocess_datasets_task(datasets, tokenizer, data_args,
                                                  model, num_labels, label_list,
                                                  is_regression)

    # Separate into train, eval and test
    train_dataset = tokenized_datasets["train"]

    # Allow multiple eval sets. For now, assume mnli is the only such case
    eval_dataset = []
    if data_args.task_name == "mnli":
        if "eval_sets" in training_args.trainer_mixin_args:
            for eval_set in training_args.trainer_mixin_args["eval_sets"]:
                eval_dataset.append(tokenized_datasets[eval_set])
        else:
            eval_dataset.append(tokenized_datasets["validation_matched"])
    else:
        eval_dataset.append(tokenized_datasets["validation"])

    # If there is only one eval set, no need for a list
    if len(eval_dataset) == 1:
        eval_dataset = eval_dataset[0]

    test_dataset = None
    if data_args.task_name is not None or data_args.test_file is not None:
        if training_args.do_predict:
            test_dataset = tokenized_datasets[
                "test_matched" if data_args.task_name == "mnli" else "test"
            ]

    # Log the fingerprint used in HF smart caching
    logging.info(f"Dataset fingerprint: {train_dataset._fingerprint}")

    # Data collator will default to DataCollatorWithPadding,
    # so we change it if we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
    else:
        data_collator = None

    return (tokenizer, data_collator, train_dataset, eval_dataset, test_dataset,
            model, is_regression, tokenized_datasets, label_list, config)
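# Illustrative sketch (assumption): how the multi-eval-set branch above could be
# exercised for MNLI by listing both validation splits under "eval_sets" in
# training_args.trainer_mixin_args, which the membership test above treats as a
# plain dict. With this setting, eval_dataset is returned as a list of two
# tokenized datasets (matched and mismatched).
_EXAMPLE_MNLI_MIXIN_ARGS = {
    "eval_sets": ["validation_matched", "validation_mismatched"],
}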