def init_dataset_for_squad(model_args, data_args, training_args,
                           last_checkpoint=None):
    datasets = init_datasets_squad(data_args, model_args)

    # Placeholder for now
    extra_config_kwargs = {}
    config = init_config(model_args, extra_config_kwargs=extra_config_kwargs)
    tokenizer = init_tokenizer(model_args)
    model = init_model(model_args, config, tokenizer, finetuning=True, squad=True)
    check_sparsity_callback(model, model_args)

    logging.info("Tokenizing datasets for squad ...")
    (train_dataset, eval_dataset, eval_examples, answer_column_name) = \
        preprocess_datasets_squad(datasets, tokenizer, training_args, data_args)

    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    else:
        pad_to_multiple_of = 8 if training_args.fp16 else None
        data_collator = \
            DataCollatorWithPadding(tokenizer, pad_to_multiple_of=pad_to_multiple_of)

    return (tokenizer, data_collator, train_dataset, eval_dataset,
            eval_examples, model, answer_column_name)
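
# Usage sketch (added for illustration, not part of the original pipeline):
# the tuple returned by `init_dataset_for_squad` feeds a question-answering
# trainer. Assuming the argument dataclasses were parsed with HfArgumentParser
# as elsewhere in this repo:
#
#     (tokenizer, data_collator, train_dataset, eval_dataset,
#      eval_examples, model, answer_column_name) = init_dataset_for_squad(
#         model_args, data_args, training_args)
#     # `eval_examples` and `answer_column_name` are kept so that, at eval time,
#     # predicted spans can be mapped back to the raw SQuAD examples.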
def init_dataset_for_finetuning(model_args, data_args, training_args,
                                last_checkpoint=None):
    datasets = init_datasets_task(data_args, training_args)
    is_regression, label_list, num_labels = get_labels(datasets, data_args)
    logging.info(f"Training {data_args.task_name} with {num_labels} labels")

    # For finetuning, labels and the task name must be added to the config kwargs
    extra_config_kwargs = dict(
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
    )
    config = init_config(model_args, extra_config_kwargs=extra_config_kwargs)
    tokenizer = init_tokenizer(model_args)
    model = init_model(model_args, config, tokenizer, finetuning=True)
    check_sparsity_callback(model, model_args)

    # Tokenize and preprocess the datasets for downstream tasks
    # TODO: load from cached tokenized datasets for finetuning as well
    logging.info("Tokenizing datasets for finetuning ...")
    tokenized_datasets = preprocess_datasets_task(datasets, tokenizer, data_args,
                                                  model, num_labels, label_list,
                                                  is_regression)

    # Separate into train, eval and test
    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets[
        "validation_matched" if data_args.task_name == "mnli" else "validation"
    ]
    test_dataset = None
    if (data_args.task_name is not None or data_args.test_file is not None):
        if training_args.do_predict:
            test_dataset = tokenized_datasets[
                "test_matched" if data_args.task_name == "mnli" else "test"
            ]

    # Log the fingerprint used in HF smart caching
    logging.info(f"Dataset fingerprint: {train_dataset._fingerprint}")

    # Data collator will default to DataCollatorWithPadding,
    # so we change it if we have already done the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
    else:
        data_collator = None

    return (tokenizer, data_collator, train_dataset, eval_dataset, test_dataset,
            model, is_regression, tokenized_datasets, label_list, config)
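
# Usage sketch (illustrative, not part of the original module): the returned
# tuple mirrors what `run_finetuning_single_task` below builds inline, so a
# caller can wire it straight into `init_trainer` with the same kwargs used there:
#
#     (tokenizer, data_collator, train_dataset, eval_dataset, test_dataset,
#      model, is_regression, tokenized_datasets, label_list, config) = \
#         init_dataset_for_finetuning(model_args, data_args, training_args)
#     trainer = init_trainer(
#         tokenizer=tokenizer, data_collator=data_collator,
#         training_args=training_args, train_dataset=train_dataset,
#         eval_dataset=eval_dataset, model=model,
#         finetuning=True, task_name=data_args.task_name,
#         is_regression=is_regression)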
def convert_to_prunable_checkpoint(checkpoint_folder, experiment):
    """
    This loads a dense model's weights and a prunable model of similar
    architecture (one with SparseWeightsBase layers), copies the weights of the
    former into the latter, and then saves a new checkpoint at
    `{checkpoint_folder}_prunable`.

    :param checkpoint_folder: path to dense checkpoint
    :param experiment: name of experiment config with a prunable architecture
    """

    # We'll use `sparsity=0` to ensure it's a dense but prunable model.
    exp_config = CONFIGS[experiment]
    exp_config["config_kwargs"]["sparsity"] = 0

    exp_parser = HfArgumentParser(ModelArguments)
    model_args = exp_parser.parse_dict(exp_config)[0]

    # Initialize prunable model and dense model.
    config = init_config(model_args)
    tokenizer = init_tokenizer(model_args)
    prunable_model = AutoModelForMaskedLM.from_config(config)
    prunable_model.resize_token_embeddings(len(tokenizer))
    dense_model = AutoModelForMaskedLM.from_pretrained(checkpoint_folder)

    # Determine which parameters belong to SparseWeightsBase classes.
    sparse_params = filter_params(prunable_model,
                                  include_modules=[SparseWeightsBase])
    sparse_dataptrs = [p.data_ptr() for p in sparse_params.values()]

    # Load the dense params into the prunable params.
    for n2, p2 in prunable_model.named_parameters():

        # e.g. replace `linear.module.weight` with `linear.weight` when appropriate.
        if p2.data_ptr() in sparse_dataptrs:
            n1 = n2.replace(".module", "")
        else:
            n1 = n2

        p1 = get_module_attr(dense_model, n1)
        p2.data[:] = p1

    # Save the prunable model.
    new_folder_name = checkpoint_folder + "_prunable"
    prunable_model.save_pretrained(new_folder_name)
    print(f"Saved prunable model to:\n{new_folder_name}")
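
# Usage sketch (illustrative): convert a dense HF checkpoint into a prunable one
# using an experiment config from CONFIGS whose architecture wraps layers in
# SparseWeightsBase. The path and experiment name below are placeholders:
#
#     convert_to_prunable_checkpoint(
#         checkpoint_folder="/path/to/dense_checkpoint",
#         experiment="some_prunable_experiment",  # hypothetical config name
#     )
#     # The prunable copy is written to "/path/to/dense_checkpoint_prunable".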
def run_finetuning_single_task(model_args, data_args, training_args,
                               last_checkpoint=None):
    """On a single task train, evaluate, and save results"""

    datasets = init_datasets_task(data_args, training_args)
    is_regression, label_list, num_labels = get_labels(datasets, data_args)
    logging.info(f"Training {data_args.task_name} with {num_labels} labels")

    # For finetuning, labels and the task name must be added to the config kwargs
    extra_config_kwargs = dict(
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
    )
    config = init_config(model_args, extra_config_kwargs=extra_config_kwargs)
    tokenizer = init_tokenizer(model_args)
    model = init_model(model_args, config, tokenizer, finetuning=True)

    # Tokenize and preprocess the datasets for downstream tasks
    # TODO: load from cached tokenized datasets for finetuning as well
    logging.info("Tokenizing datasets for finetuning ...")
    tokenized_datasets = preprocess_datasets_task(datasets, tokenizer, data_args,
                                                  model, num_labels, label_list,
                                                  is_regression)

    # Separate into train, eval and test
    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets[
        "validation_matched" if data_args.task_name == "mnli" else "validation"
    ]
    test_dataset = None
    if ((data_args.task_name is not None or data_args.test_file is not None)
            and training_args.do_predict):
        test_dataset = tokenized_datasets[
            "test_matched" if data_args.task_name == "mnli" else "test"
        ]

    # Log the fingerprint used in HF smart caching
    logging.info(f"Dataset fingerprint: {train_dataset._fingerprint}")

    # Data collator will default to DataCollatorWithPadding,
    # so we change it if we have already done the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
    else:
        data_collator = None

    # Train
    trainer = init_trainer(
        tokenizer=tokenizer,
        data_collator=data_collator,
        training_args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        model=init_model(model_args, config, tokenizer),
        trainer_callbacks=model_args.trainer_callbacks or None,
        finetuning=True,
        task_name=data_args.task_name,
        is_regression=is_regression,
    )
    if training_args.do_train:
        train(trainer, training_args.output_dir, last_checkpoint)

    # Evaluate
    eval_results = {}
    if training_args.do_eval:
        logging.info("*** Evaluate ***")

        # Handle special case of extra validation dataset for MNLI
        tasks = [data_args.task_name]
        eval_datasets = [eval_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            eval_datasets.append(tokenized_datasets["validation_mismatched"])

        eval_results = evaluate_tasks(trainer, training_args.output_dir,
                                      tasks, eval_datasets)

    # Test/Predict
    if training_args.do_predict:
        logging.info("*** Test ***")

        # Handle special case of extra test dataset for MNLI
        tasks = [data_args.task_name]
        test_datasets = [test_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            test_datasets.append(tokenized_datasets["test_mismatched"])

        test_tasks(trainer, training_args.output_dir, tasks, test_datasets,
                   is_regression, label_list)

    # There is an existing issue when training multiple models in sequence in
    # this code. There is a memory leak in the model: a small amount of GPU
    # memory remains after each run and accumulates over several runs. It fails
    # with OOM after about 20 runs, even when all tensors on GPU are explicitly
    # deleted, garbage is collected, and the cache is cleared. Tried multiple
    # solutions, but this weird little hack is the only thing that worked.
    model.to("cpu")

    return eval_results
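
# Usage sketch (illustrative, not code from this repo): `run_finetuning_single_task`
# is intended to be called once per task, and the returned `eval_results` dict can
# be collected across tasks. The loop, task list, and use of `dataclasses.replace`
# on `data_args` below are assumptions for illustration:
#
#     all_results = {}
#     for task_name in ["cola", "mrpc", "rte"]:
#         task_data_args = replace(data_args, task_name=task_name)
#         all_results[task_name] = run_finetuning_single_task(
#             model_args, task_data_args, training_args)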
def run_pretraining(model_args, data_args, training_args, last_checkpoint=None):
    """Pretrain and evaluate a language model"""

    logging.info("Pre-training a masked language model.")

    datasets, tokenized_datasets, dataset_path = init_datasets_mlm(data_args)

    config = init_config(model_args)
    tokenizer = init_tokenizer(model_args)

    if tokenized_datasets is None:
        # Tokenize and preprocess the datasets for language modeling
        if training_args.do_train:
            column_names = datasets["train"].column_names
        else:
            column_names = datasets["validation"].column_names
        text_column_name = "text" if "text" in column_names else column_names[0]

        logging.info("Tokenizing datasets for pretraining ...")
        tokenized_datasets = preprocess_datasets_mlm(datasets, tokenizer, data_args,
                                                     column_names, text_column_name)

        # Save only if a dataset_path has been defined in the previous steps;
        # that will be True only when loading from the dataset hub
        if data_args.save_tokenized_data and dataset_path is not None:
            logging.info(f"Saving tokenized dataset to {dataset_path}")
            tokenized_datasets.save_to_disk(dataset_path)

    # Separate into train, eval and test
    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["validation"]

    # Log the fingerprint used in HF smart caching
    logging.info(f"Dataset fingerprint: {train_dataset._fingerprint}")

    # Data collator will take care of randomly masking the tokens;
    # the collator argument is defined in the experiment config
    assert hasattr(transformers, data_args.data_collator), \
        f"Data collator {data_args.data_collator} not available"
    data_collator = getattr(transformers, data_args.data_collator)(
        tokenizer=tokenizer, mlm_probability=data_args.mlm_probability
    )

    # Run hp search or regular training
    if model_args.hp_num_trials >= 1:
        run_hyperparameter_search(
            model_args=model_args,
            config=config,
            tokenizer=tokenizer,
            data_collator=data_collator,
            training_args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
        )
    else:
        trainer = init_trainer(
            tokenizer=tokenizer,
            data_collator=data_collator,
            training_args=training_args,
            train_dataset=train_dataset if training_args.do_train else None,
            eval_dataset=eval_dataset if training_args.do_eval else None,
            model=init_model(model_args, config, tokenizer),
            trainer_class=model_args.trainer_class,
            trainer_callbacks=model_args.trainer_callbacks or None,
        )
        if training_args.do_train:
            train(trainer, training_args.output_dir, last_checkpoint)

        # Evaluate on the full eval dataset.
        # If using hp search, load the best model before running evaluate.
        if training_args.do_eval:
            logging.info("*** Evaluate ***")
            evaluate_language_model(trainer, eval_dataset, training_args.output_dir)
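
# Config sketch (illustrative): `run_pretraining` expects the experiment config to
# name a collator class from `transformers` plus a masking probability, roughly
# along these lines (the values shown are examples, not defaults from this repo):
#
#     data_args.data_collator = "DataCollatorForLanguageModeling"
#     data_args.mlm_probability = 0.15
#     data_args.save_tokenized_data = True  # persist tokenized data to dataset_path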
def calculate_sparsity_param(sparsity_desired, parameters_desired, experiment,
                             test_sparsity=False):
    """
    :param sparsity_desired: desired sparsity of model
    :param parameters_desired: desired number of on-params; can't be used with
                               sparsity_desired
    :param experiment: name of experiment config with a sparse architecture
    :param test_sparsity: whether to test the calculated sparsity param; this test
                          loads the model and calculates the resulting sparsity.
    """

    # Ensure either sparsity_desired or parameters_desired is specified, but not both.
    assert not (sparsity_desired is None and parameters_desired is None)
    assert sparsity_desired is None or parameters_desired is None

    print(bold("Initializing model... ") + "(this may take a minute)")
    print(f"   experiment: {experiment}")

    # Load and parse model args from config.
    exp_config = CONFIGS[experiment]
    exp_parser = HfArgumentParser(ModelArguments)
    model_args = exp_parser.parse_dict(exp_config)[0]
    model_args = replace(model_args, cache_dir=None)  # enable to run locally

    print(bold("\n\nModel parameters:\n") + pdict(model_args.__dict__))
    print()

    # Initialize model.
    config = init_config(model_args)
    tokenizer = init_tokenizer(model_args)
    model = AutoModelForMaskedLM.from_config(config)
    model.resize_token_embeddings(len(tokenizer))

    print(bold("Calculating target sparsity..."))

    # Get sparse modules and calculate the total number of sparsifiable params.
    sparse_modules = filter_modules(model.bert, include_modules=[SparseWeightsBase])
    sparsifiable_params = 0
    for _, m in sparse_modules.items():
        sparsifiable_params += m.zero_mask.numel()

    # Calculate the total number of params and the needed sparsity.
    total_params, _ = count_nonzero_params(model.bert)
    if parameters_desired is None:
        parameters_desired = total_params * (1 - sparsity_desired)
    elif sparsity_desired is None:
        sparsity_desired = 1 - parameters_desired / total_params

    dense_params = total_params - sparsifiable_params
    target_sparsity = 1 - (parameters_desired - dense_params) / sparsifiable_params

    print(f"   sparsity_desired: {sparsity_desired}")
    print(f"   parameters_desired: {parameters_desired}")
    print(f"   sparsifiable_params: {sparsifiable_params}")
    print(f"   total_params: {total_params}")
    print(f"   target_sparsity: {target_sparsity} (set your sparsity to this)")
    print()

    if not test_sparsity:
        return

    print(bold("Testing target sparsity..."))

    # Edit config to use the new sparsity param (sparsity=target_sparsity).
    exp_config["config_kwargs"]["sparsity"] = target_sparsity
    exp_parser = HfArgumentParser(ModelArguments)
    model_args = exp_parser.parse_dict(exp_config)[0]
    model_args = replace(model_args, cache_dir=None)  # enable to run locally

    # Initialize model; this time with the new sparsity param.
    config = init_config(model_args)
    tokenizer = init_tokenizer(model_args)
    model = AutoModelForMaskedLM.from_config(config)
    model.resize_token_embeddings(len(tokenizer))

    # Set all on-weights to one to make sure none are randomly off.
    sparse_modules = filter_modules(model.bert, include_modules=[SparseWeightsBase])
    for _, m in sparse_modules.items():
        m.weight.data[:] = 1

    model.apply(rezero_weights)  # set off weights to zero.
    resulting_sparsity = calc_model_sparsity(model.bert)
    _, nz_params = count_nonzero_params(model.bert)
    print(
        f"   Resulting sparsity of model.bert using sparsity={target_sparsity}\n"
        f"      actual_sparsity={resulting_sparsity}\n"
        f"      num_nonzero_params={nz_params}\n"
    )
    print("   Note this may not be exactly as desired, as there are "
          "discrete levels of allowable sparsity")
    print()
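
# Worked example of the target-sparsity formula above (the numbers are made up
# for illustration): with total_params = 100M, sparsifiable_params = 80M, and a
# desired overall sparsity of 0.8:
#
#     parameters_desired = 100e6 * (1 - 0.8)            # = 20M on-params
#     dense_params       = 100e6 - 80e6                  # = 20M non-sparsifiable
#     target_sparsity    = 1 - (20e6 - 20e6) / 80e6      # = 1.0
#
# i.e. every sparsifiable weight must be off to reach 80% overall sparsity in
# this hypothetical model, because the dense params alone already account for
# the 20M on-params budget.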
def init_dataset_for_finetuning(model_args, data_args, training_args,
                                last_checkpoint=None):
    # TODO
    # edit multi_eval_sets so you can gather not just multiple eval sets
    # for a single task, but eval sets from multiple tasks

    datasets = init_datasets_task(data_args, training_args)
    is_regression, label_list, num_labels = get_labels(datasets, data_args)
    logging.info(f"Training {data_args.task_name} with {num_labels} labels")

    # For finetuning, labels and the task name must be added to the config kwargs
    extra_config_kwargs = dict(
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
    )
    config = init_config(model_args, extra_config_kwargs=extra_config_kwargs)
    tokenizer = init_tokenizer(model_args)
    model = init_model(model_args, config, tokenizer, finetuning=True)
    check_sparsity_callback(model, model_args)
    check_mnli(model_args, data_args.task_name)

    # Tokenize and preprocess the datasets for downstream tasks
    # TODO: load from cached tokenized datasets for finetuning as well
    logging.info("Tokenizing datasets for finetuning ...")
    tokenized_datasets = preprocess_datasets_task(datasets, tokenizer, data_args,
                                                  model, num_labels, label_list,
                                                  is_regression)

    # Separate into train, eval and test
    train_dataset = tokenized_datasets["train"]

    # Allow multiple eval sets. For now, assume mnli is the only such case.
    eval_dataset = []
    if data_args.task_name == "mnli":
        if "eval_sets" in training_args.trainer_mixin_args:
            for eval_set in training_args.trainer_mixin_args["eval_sets"]:
                eval_dataset.append(tokenized_datasets[eval_set])
        else:
            eval_dataset.append(tokenized_datasets["validation_matched"])
    else:
        eval_dataset.append(tokenized_datasets["validation"])

    # If there is only one eval set, no need for a list
    if len(eval_dataset) == 1:
        eval_dataset = eval_dataset[0]

    test_dataset = None
    if (data_args.task_name is not None or data_args.test_file is not None):
        if training_args.do_predict:
            test_dataset = tokenized_datasets[
                "test_matched" if data_args.task_name == "mnli" else "test"
            ]

    # Log the fingerprint used in HF smart caching
    logging.info(f"Dataset fingerprint: {train_dataset._fingerprint}")

    # Data collator will default to DataCollatorWithPadding,
    # so we change it if we have already done the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
    else:
        data_collator = None

    return (tokenizer, data_collator, train_dataset, eval_dataset, test_dataset,
            model, is_regression, tokenized_datasets, label_list, config)
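
# Config sketch (illustrative): for MNLI, the multi-eval-set path above reads
# `training_args.trainer_mixin_args["eval_sets"]`, so a config can request both
# validation splits by name; the split names come from the tokenized dataset:
#
#     training_args.trainer_mixin_args = {
#         "eval_sets": ["validation_matched", "validation_mismatched"],
#     }
#     # With more than one entry, `eval_dataset` is returned as a list;
#     # with a single entry, it is unwrapped to the dataset itself.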