def test_huggingface_model_file(tmp_dir, model, args, data, tokenizer):
    """Check that DvcLiveCallback(model_file=...) writes a pretrained-model dir."""
    dvclive.init("logs")
    save_dir = tmp_dir / "model_hf"
    hf_trainer = Trainer(
        model,
        args,
        train_dataset=data[0],
        eval_dataset=data[1],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    hf_trainer.add_callback(DvcLiveCallback(model_file=save_dir))
    hf_trainer.train()
    # The callback should have produced a directory holding the usual
    # save_pretrained artifacts.
    assert save_dir.is_dir()
    for artifact in ("pytorch_model.bin", "config.json"):
        assert (save_dir / artifact).exists()
def test_huggingface_integration(tmp_dir, model, args, data, tokenizer):
    """End-to-end check that DvcLiveCallback records scalar logs while training."""
    hf_trainer = Trainer(
        model,
        args,
        train_dataset=data[0],
        eval_dataset=data[1],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    hf_trainer.add_callback(DvcLiveCallback())
    hf_trainer.train()
    # Training with the callback attached must create the default log dir.
    assert os.path.exists("dvclive")
    logs, _ = read_logs(tmp_dir / "dvclive" / Scalar.subfolder)
    # Ten scalar series in total, including the eval metrics.
    assert len(logs) == 10
    for key in ("eval_matthews_correlation", "eval_loss"):
        assert key in logs
    assert len(logs["epoch"]) == 3
    assert len(logs["eval_loss"]) == 2
def test_huggingface_model_file(tmp_dir, model, args, data, tokenizer, mocker):
    """DvcLiveCallback(model_file=...) saves both the model and the tokenizer.

    Spies on both ``save_pretrained`` methods to check they are each invoked
    twice during training, and that the expected artifacts land on disk.
    """
    model_path = tmp_dir / "model_hf"
    model_save = mocker.spy(model, "save_pretrained")
    # Fixed local-variable typo: was `tokernizer_save`.
    tokenizer_save = mocker.spy(tokenizer, "save_pretrained")
    trainer = Trainer(
        model,
        args,
        train_dataset=data[0],
        eval_dataset=data[1],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    trainer.add_callback(DvcLiveCallback(model_file=model_path))
    trainer.train()
    assert model_path.is_dir()
    assert (model_path / "pytorch_model.bin").exists()
    assert (model_path / "config.json").exists()
    assert model_save.call_count == 2
    assert (model_path / "tokenizer.json").exists()
    assert tokenizer_save.call_count == 2
def train(data_args, last_checkpoint, model, model_args, tokenized_datasets,
          tokenizer, training_args):
    """Run MLM training and (optionally) evaluation, writing results to disk.

    Returns:
        dict: contains "perplexity" when ``training_args.do_eval`` is set,
        otherwise empty.
    """
    # Data collator
    # This one will take care of randomly masking the tokens.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)
    # NOTE(review): tb_logger is never used below (TensorBoardCallback is fed
    # by the SummaryWriter instead); kept because TensorBoardLogger creation
    # may create the log directory as a side effect -- confirm before removing.
    tb_logger = pl_loggers.TensorBoardLogger('experiments/dilbert/logs/')
    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator)
    writer = SummaryWriter('experiments/dilbert/logs')
    trainer.add_callback(TensorBoardCallback(tb_writer=writer))
    trainer.add_callback(GPUMemoryPrinterCallback())

    # Training
    if training_args.do_train:
        # Prefer an explicit checkpoint, then a local model directory,
        # otherwise train from scratch.
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif model_args.model_name_or_path is not None and os.path.isdir(
                model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload

        output_train_file = os.path.join(training_args.output_dir,
                                         "train_results.txt")
        if trainer.is_world_process_zero():
            # BUG FIX: the file handle was previously named `writer`,
            # shadowing the TensorBoard SummaryWriter above; renamed.
            with open(output_train_file, "w") as results_file:
                logger.info("***** Train results *****")
                for key, value in sorted(train_result.metrics.items()):
                    logger.info(f"  {key} = {value}")
                    results_file.write(f"{key} = {value}\n")

            # Need to save the state, since Trainer.save_model saves only the
            # tokenizer with the model
            trainer.state.save_to_json(
                os.path.join(training_args.output_dir, "trainer_state.json"))

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        eval_output = trainer.evaluate()
        perplexity = math.exp(eval_output["eval_loss"])
        results["perplexity"] = perplexity
        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_mlm.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as results_file:
                logger.info("***** Eval results *****")
                for key, value in sorted(results.items()):
                    logger.info(f"  {key} = {value}")
                    results_file.write(f"{key} = {value}\n")
    return results
def train(self, inoculation_train_df, eval_df, model_path, training_args, max_length=128, inoculation_patience_count=5, pd_format=True, scramble_proportion=0.0, eval_with_scramble=False):
    """Fine-tune on (optionally token-scrambled) inoculation data, then evaluate.

    ``inoculation_train_df``/``eval_df`` are pandas DataFrames when
    ``pd_format`` is True, otherwise pre-built ``datasets`` objects.
    ``scramble_proportion`` controls what fraction of each example's tokens
    gets shuffled; ``eval_with_scramble`` applies it to validation data too.

    NOTE(review): nothing is returned; `eval_results` is accumulated at the
    end and then dropped -- confirm callers do not expect a return value.
    """
    # Wrap raw pandas frames into HF `datasets.Dataset` objects when needed.
    if pd_format:
        datasets = {}
        datasets["train"] = Dataset.from_pandas(inoculation_train_df)
        datasets["validation"] = Dataset.from_pandas(eval_df)
    else:
        datasets = {}
        datasets["train"] = inoculation_train_df
        datasets["validation"] = eval_df
    logger.info(f"***** Train Sample Count (Verify): %s *****"%(len(datasets["train"])))
    logger.info(f"***** Valid Sample Count (Verify): %s *****"%(len(datasets["validation"])))
    label_list = datasets["validation"].unique("label")
    label_list.sort()  # Let's sort it for determinism
    sentence1_key, sentence2_key = self.task_config
    # we will scramble out input sentence here
    # TODO: we scramble both train and eval sets
    # Single-sentence tasks: scramble a contiguous span of sentence1 only.
    if self.task_name == "sst3" or self.task_name == "cola":
        def scramble_inputs(proportion, example):
            original_text = example[sentence1_key]
            original_sentence = basic_tokenizer.tokenize(original_text)
            # NOTE(review): this shadows the enclosing `max_length` parameter
            # inside the closure -- looks intentional, but confirm.
            max_length = len(original_sentence)
            scramble_length = int(max_length*proportion)
            scramble_start = random.randint(0, len(original_sentence)-scramble_length)
            scramble_end = scramble_start + scramble_length
            # Shuffle the chosen span in place, then splice it back.
            scramble_sentence = original_sentence[scramble_start:scramble_end]
            random.shuffle(scramble_sentence)
            scramble_text = original_sentence[:scramble_start] + scramble_sentence + original_sentence[scramble_end:]
            # Re-join wordpieces (" ##" markers) back into surface text.
            out_string = " ".join(scramble_text).replace(" ##", "").strip()
            example[sentence1_key] = out_string
            return example
    # Sentence-pair tasks: scramble premise and hypothesis independently.
    elif self.task_name == "snli" or self.task_name == "mrpc" or self.task_name == "qnli":
        def scramble_inputs(proportion, example):
            original_premise = example[sentence1_key]
            original_hypothesis = example[sentence2_key]
            if original_hypothesis == None:  # NOTE(review): prefer `is None`
                original_hypothesis = ""
            try:
                original_premise_tokens = basic_tokenizer.tokenize(original_premise)
                original_hypothesis_tokens = basic_tokenizer.tokenize(original_hypothesis)
            # NOTE(review): bare except -- if tokenize fails, the *_tokens
            # names below are unbound and a NameError follows immediately.
            except:
                print("Please debug these sequence...")
                print(original_premise)
                print(original_hypothesis)
            max_length = len(original_premise_tokens)
            scramble_length = int(max_length*proportion)
            scramble_start = random.randint(0, max_length-scramble_length)
            scramble_end = scramble_start + scramble_length
            scramble_sentence = original_premise_tokens[scramble_start:scramble_end]
            random.shuffle(scramble_sentence)
            scramble_text_premise = original_premise_tokens[:scramble_start] + scramble_sentence + original_premise_tokens[scramble_end:]
            # Same span-shuffle procedure, applied to the hypothesis.
            max_length = len(original_hypothesis_tokens)
            scramble_length = int(max_length*proportion)
            scramble_start = random.randint(0, max_length-scramble_length)
            scramble_end = scramble_start + scramble_length
            scramble_sentence = original_hypothesis_tokens[scramble_start:scramble_end]
            random.shuffle(scramble_sentence)
            scramble_text_hypothesis = original_hypothesis_tokens[:scramble_start] + scramble_sentence + original_hypothesis_tokens[scramble_end:]
            out_string_premise = " ".join(scramble_text_premise).replace(" ##", "").strip()
            out_string_hypothesis = " ".join(scramble_text_hypothesis).replace(" ##", "").strip()
            example[sentence1_key] = out_string_premise
            example[sentence2_key] = out_string_hypothesis
            return example
    if scramble_proportion > 0.0:
        logger.info(f"You are scrambling the inputs to test syntactic feature importance!")
        datasets["train"] = datasets["train"].map(partial(scramble_inputs, scramble_proportion))
        if eval_with_scramble:
            logger.info(f"You are scrambling the evaluation data as well!")
            datasets["validation"] = datasets["validation"].map(partial(scramble_inputs, scramble_proportion))
    padding = "max_length"
    sentence1_key, sentence2_key = self.task_config
    label_to_id = None
    def preprocess_function(examples):
        # Tokenize the texts
        args = (
            (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
        )
        result = self.tokenizer(*args, padding=padding, max_length=max_length, truncation=True)
        # Map labels to IDs (not necessary for GLUE tasks)
        if label_to_id is not None and "label" in examples:
            result["label"] = [label_to_id[l] for l in examples["label"]]
        return result
    datasets["train"] = datasets["train"].map(preprocess_function, batched=True)
    datasets["validation"] = datasets["validation"].map(preprocess_function, batched=True)
    train_dataset = datasets["train"]
    eval_dataset = datasets["validation"]
    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
    metric = load_metric("glue", "sst2") # any glue task will do the job, just for eval loss
    def asenti_compute_metrics(p: EvalPrediction):
        # Logits -> class predictions.
        preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        preds = np.argmax(preds, axis=1)
        result_to_print = classification_report(p.label_ids, preds, digits=5, output_dict=True)
        print(classification_report(p.label_ids, preds, digits=5))
        mcc_scores = matthews_corrcoef(p.label_ids, preds)
        logger.info(f"MCC scores: {mcc_scores}.")
        result_to_return = metric.compute(predictions=preds, references=p.label_ids)
        result_to_return["Macro-F1"] = result_to_print["macro avg"]["f1-score"]
        result_to_return["MCC"] = mcc_scores
        return result_to_return
    # Initialize our Trainer. We are only intersted in evaluations
    # NOTE(review): `model` is not defined anywhere in this method's visible
    # scope -- presumably a module-level name or should be self.model; confirm.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=asenti_compute_metrics,
        tokenizer=self.tokenizer,
        # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
        data_collator=default_data_collator
    )
    # Early stop
    if inoculation_patience_count != -1:
        trainer.add_callback(EarlyStoppingCallback(inoculation_patience_count))
    # Training
    if training_args.do_train:
        logger.info("*** Training our model ***")
        trainer.train(
            # we don't need this now.
            # model_path=model_path
        )
        trainer.save_model()  # Saves the tokenizer too for easy upload
    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        tasks = [self.task_name]
        eval_datasets = [eval_dataset]
        for eval_dataset, task in zip(eval_datasets, tasks):
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)
            output_eval_file = os.path.join(training_args.output_dir, f"eval_results_{task}.txt")
            if trainer.is_world_process_zero():
                with open(output_eval_file, "w") as writer:
                    logger.info(f"***** Eval results {task} *****")
                    for key, value in eval_result.items():
                        logger.info(f"  {key} = {value}")
                        writer.write(f"{key} = {value}\n")
            eval_results.update(eval_result)
# Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics_pos if args.task_name == "en_ewt" else compute_metrics_ner, ) # Early stop if args.inoculation_patience_count != -1: trainer.add_callback( EarlyStoppingCallback(args.inoculation_patience_count)) # Training if training_args.do_train: checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) metrics = train_result.metrics trainer.save_model() # Saves the tokenizer too for easy upload metrics["train_samples"] = len(train_dataset) # trainer.log_metrics("train", metrics) # trainer.save_metrics("train", metrics) # trainer.save_state() # Evaluation
def main():
    """Entry point: train/evaluate a causal-LM and log perplexity to savvihub.

    Returns:
        dict: contains "perplexity" when ``do_eval`` is set, otherwise empty.
    """
    if is_wandb_available():
        import wandb
    parser = HfArgumentParser(
        (ModelArguments, DatasetArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # A single .json argument carries all three dataclass configs.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        # BUG FIX: added the missing space between the two message parts.
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )
    # Log on each process the small summary:
    # BUG FIX: the two concatenated f-strings had no separator, producing
    # "... n_gpu: 1distributed training ..." in the log output.
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: either from the hub or from a local on-disk copy.
    if data_args.dataset_name is not None:
        datasets = load_dataset(data_args.dataset_name,
                                data_args.dataset_config_name)
    else:
        datasets = load_from_disk(data_args.dataset_path)

    # Load pretrained config/tokenizer/model. The .from_pretrained methods
    # guarantee only one local process downloads model & vocab at a time.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer)
    else:
        # BUG FIX: added the missing space between the two message parts.
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )
    if model_args.model_name_or_path:
        model = AutoModelForCausalLM.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForCausalLM.from_config(config)
    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets: tokenize all the texts first.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    def tokenize_function(examples):
        # Tokenize a batch of raw text examples.
        return tokenizer(examples[text_column_name])

    # FIX: `load_from_cache_file=not False` simplified to True (same value).
    tokenized_datasets = datasets.map(
        tokenize_function,
        batched=True,
        num_proc=None,
        remove_columns=column_names,
        load_from_cache_file=True,
    )
    block_size = tokenizer.model_max_length
    if block_size > 1024:
        # FIX: logger.warn is a deprecated alias of logger.warning.
        logger.warning(
            f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
            "Picking 1024 instead. You can change that default value by passing --block_size xxx."
        )
        block_size = 1024

    # Main data processing function that will concatenate all texts from our
    # dataset and generate chunks of block_size.
    def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {
            k: sum(examples[k], [])
            for k in examples.keys()
        }
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # Drop the small remainder; padding could be added instead.
        total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k: [t[i:i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result

    # With `batched=True`, map processes 1,000 texts together, so group_texts
    # throws away a remainder for each of those groups.
    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        num_proc=None,
        load_from_cache_file=True,
    )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=lm_datasets["train"] if training_args.do_train else None,
        eval_dataset=lm_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        # Data collator will default to DataCollatorWithPadding, so we change it.
        data_collator=default_data_collator,
    )
    # Add Callbacks
    savvi_callback = SavviCallback()
    trainer.add_callback(savvi_callback)

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if (model_args.model_name_or_path is not None
                          and os.path.isdir(model_args.model_name_or_path))
                      else None)
        # NOTE(review): `model_path=` was replaced by `resume_from_checkpoint=`
        # in newer transformers releases -- confirm the pinned version.
        trainer.train(model_path=model_path)
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        eval_output = trainer.evaluate()
        perplexity = math.exp(eval_output["eval_loss"])
        results["perplexity"] = perplexity
        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_clm.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")
        savvihub.log(step=0, row={'perplexity': results["perplexity"]})
    return results
def evaluate_model(cache_dict, layer_id, finally_pruned_layers, config,
                   model_args, data_args, training_args, train_dataset,
                   eval_dataset, compute_metrics, tokenizer, data_collator,
                   datasets):
    """Train and evaluate a model with `layer_id` pruned; memoize the score.

    The resulting metric (rounded to 3 decimals) is stored in ``cache_dict``
    under ``layer_id`` and returned. Cached layers are returned immediately.

    Raises:
        Exception: if no known performance metric is found in the eval results.
    """
    # Set seed before initializing model.
    set_seed(training_args.seed)
    if layer_id in cache_dict:
        print("Layer %d from cache: %.4f" % (layer_id, cache_dict[layer_id]))
        return cache_dict[layer_id]
    print(f"Calculate layer {str(layer_id)}")
    model = create_model(config, model_args)
    model.prune_layers(finally_pruned_layers)
    # A non-int layer_id (e.g. a sentinel) means "no extra layer to prune".
    if isinstance(layer_id, int):
        model.prune_layers([layer_id])
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    trainer.add_callback(DisableCheckpointCallbackHandler())

    # Training
    train_result = trainer.train(resume_from_checkpoint=None)
    metrics = train_result.metrics

    # Evaluation
    eval_results = {}
    logger.info("*** Evaluate ***")
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    tasks = [data_args.task_name]
    eval_datasets = [eval_dataset]
    if data_args.task_name == "mnli":
        tasks.append("mnli-mm")
        eval_mismatch = datasets["validation_mismatched"]
        eval_datasets.append(eval_mismatch)
    for eval_dataset, task in zip(eval_datasets, tasks):
        eval_result = trainer.evaluate(eval_dataset=eval_dataset)
        eval_results.update(eval_result)

    # BUG FIX: the original `res = res or eval_results.get(...)` chain treated
    # a legitimate score of 0.0 as missing, and called round(res) *before* the
    # None check, raising TypeError instead of the intended error message.
    res = None
    for metric_key in ("eval_f1", "eval_accuracy", "eval_spearmanr",
                       "eval_matthews_correlation"):
        res = eval_results.get(metric_key)
        if res is not None:
            break
    if res is None:
        # FIX: corrected message typo ("Now" -> "No").
        raise Exception("No performance metric found!")
    res = round(res, 3)
    cache_dict[layer_id] = res
    # BUG FIX: the cached branch above returns the score, but the original
    # ended with `return cache_dict` (the whole dict); return the score for
    # a consistent interface.
    return res
def main():
    """Fine-tune a sequence classifier on codeparrot/codecomplex (7 classes)."""
    args = get_args()
    set_seed(args.seed)
    # 80/10/10 train/test/valid split of the single "train" split on the hub.
    dataset = load_dataset("codeparrot/codecomplex", split="train")
    train_test = dataset.train_test_split(test_size=0.2)
    test_validation = train_test["test"].train_test_split(test_size=0.5)
    train_test_validation = DatasetDict({
        "train": train_test["train"],
        "test": test_validation["train"],
        "valid": test_validation["test"],
    })

    print("Loading tokenizer and model")
    tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt)
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForSequenceClassification.from_pretrained(args.model_ckpt,
                                                               num_labels=7)
    model.config.pad_token_id = model.config.eos_token_id
    if args.freeze:
        # Freeze the encoder; only the classification head trains.
        for param in model.roberta.parameters():
            param.requires_grad = False

    # BUG FIX: `list(set(...))` gave a nondeterministic label<->id mapping
    # across runs (set iteration order varies); sort for reproducibility.
    labels = ClassLabel(
        num_classes=7,
        names=sorted(set(train_test_validation["train"]["complexity"])))

    def tokenize(example):
        # Tokenize the source code and map the complexity string to its id.
        inputs = tokenizer(example["src"], truncation=True, max_length=1024)
        label = labels.str2int(example["complexity"])
        return {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "label": label,
        }

    tokenized_datasets = train_test_validation.map(
        tokenize,
        batched=True,
        remove_columns=train_test_validation["train"].column_names,
    )
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        learning_rate=args.learning_rate,
        lr_scheduler_type=args.lr_scheduler_type,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        num_train_epochs=args.num_epochs,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        weight_decay=0.01,
        metric_for_best_model="accuracy",
        run_name="complexity-java",
        report_to="wandb",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["valid"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    print("Training...")
    trainer.add_callback(CustomCallback(trainer))
    trainer.train()
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Detecting last checkpoint. last_checkpoint = None #training_args.output_dir = f"{training_args.output_dir}/{data_args.task_name}/{model_args.model_name_or_path}/{model_args.prune_method}/{str(model_args.prune_n_layers)}/{str(training_args.seed)}" if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.info(f"Training/evaluation parameters {training_args}") # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named # label if at least two columns are provided. # # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this # single column. You can easily tweak this behavior (see below) # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.task_name is not None: # Downloading and loading a dataset from the hub. datasets = load_dataset("glue", data_args.task_name) else: # Loading a dataset from your local files. # CSV/JSON training and evaluation files are needed. 
data_files = { "train": data_args.train_file, "validation": data_args.validation_file } # Get the test dataset: you can provide your own CSV/JSON test file (see below) # when you use `do_predict` without specifying a GLUE benchmark task. if training_args.do_predict: if data_args.test_file is not None: train_extension = data_args.train_file.split(".")[-1] test_extension = data_args.test_file.split(".")[-1] assert ( test_extension == train_extension ), "`test_file` should have the same extension (csv or json) as `train_file`." data_files["test"] = data_args.test_file else: raise ValueError( "Need either a GLUE task or a test file for `do_predict`.") for key in data_files.keys(): logger.info(f"load a local file for {key}: {data_files[key]}") if data_args.train_file.endswith(".csv"): # Loading a dataset from local csv files datasets = load_dataset("csv", data_files=data_files) else: # Loading a dataset from local json files datasets = load_dataset("json", data_files=data_files) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. # Labels if data_args.task_name is not None: is_regression = data_args.task_name == "stsb" if not is_regression: label_list = datasets["train"].features["label"].names num_labels = len(label_list) else: num_labels = 1 else: # Trying to have good defaults here, don't hesitate to tweak to your needs. is_regression = datasets["train"].features["label"].dtype in [ "float32", "float64" ] if is_regression: num_labels = 1 else: # A useful fast method: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique label_list = datasets["train"].unique("label") label_list.sort() # Let's sort it for determinism num_labels = len(label_list) # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. 
config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = create_model(config, model_args) # # Prune model before training starts # if (model_args.prune_method == "prune-greedy"): current_path = pathlib.Path(__file__).parent.absolute() layer_file_path = os.path.join( current_path, "layer_files/", f"{model.name_or_path}_{data_args.task_name}_greedy.txt") with open(layer_file_path, 'r') as f: layers_to_prune = f.readlines() layers_to_prune = layers_to_prune[:model_args.prune_n_layers] layers_to_prune = [int(l.replace("\n", "")) for l in layers_to_prune] print(f"Pruned {str(layers_to_prune)}") model.prune_layers(layers_to_prune) elif (model_args.prune_method == "top-layers"): print( f"# Prune {model_args.prune_n_layers} layers with {model_args.prune_method}" ) first_layer_to_prune = config.num_hidden_layers - model_args.prune_n_layers model.prune_layers( [i for i in range(first_layer_to_prune, config.num_hidden_layers)]) # # Measure number of parameters # It really depends how pruning is implemented - if its deleted from the layers # module list or if the layer is simply skipped (then torch still measures those values) # if hasattr(model.base_model, "encoder"): # base_class = model.base_model.encoder # else: # base_class = model.base_model # layers = base_class.layer # layers = [l for (i, l) in enumerate(layers) if i not in model.get_pruned_layers()] # layers = nn.ModuleList(layers) # setattr(base_class, "layer", 
layers) # Print number of parameters num_params = sum(p.numel() for p in model.parameters()) print("NUM Paramerers: %d" % num_params) # Preprocessing the datasets if data_args.task_name is not None: sentence1_key, sentence2_key = task_to_keys[data_args.task_name] else: # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. non_label_column_names = [ name for name in datasets["train"].column_names if name != "label" ] if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: sentence1_key, sentence2_key = "sentence1", "sentence2" else: if len(non_label_column_names) >= 2: sentence1_key, sentence2_key = non_label_column_names[:2] else: sentence1_key, sentence2_key = non_label_column_names[0], None # Padding strategy if data_args.pad_to_max_length: padding = "max_length" else: # We will pad later, dynamically at batch creation, to the max sequence length in each batch padding = False # Some models have set the order of the labels to use, so let's make sure we do use it. label_to_id = None if (model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id and data_args.task_name is not None and not is_regression): # Some have all caps in their config, some don't. label_name_to_id = { k.lower(): v for k, v in model.config.label2id.items() } if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): label_to_id = { i: label_name_to_id[label_list[i]] for i in range(num_labels) } else: logger.warn( "Your model seems to have been trained with labels, but they don't match the dataset: ", f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." 
"\nIgnoring the model labels as a result.", ) elif data_args.task_name is None and not is_regression: label_to_id = {v: i for i, v in enumerate(label_list)} if data_args.max_seq_length > tokenizer.model_max_length: logger.warn( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) def preprocess_function(examples): # Tokenize the texts args = ((examples[sentence1_key], ) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])) result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) # Map labels to IDs (not necessary for GLUE tasks) if label_to_id is not None and "label" in examples: result["label"] = [label_to_id[l] for l in examples["label"]] return result datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) train_dataset = datasets["train"] eval_dataset = datasets["validation_matched" if data_args.task_name == "mnli" else "validation"] if data_args.task_name is not None or data_args.test_file is not None: test_dataset = datasets["test_matched" if data_args.task_name == "mnli" else "test"] # Get the metric function if data_args.task_name is not None: metric = load_metric("glue", data_args.task_name) # TODO: When datasets metrics include regular accuracy, make an else here and remove special branch from # compute_metrics # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a # predictions and label_ids field) and has to return a dictionary string to float. 
def compute_metrics(p: EvalPrediction):
    """Turn raw predictions into a metric dict: GLUE metric, MSE (regression), or accuracy."""
    # Some models return a tuple (logits, extra outputs); keep only the logits.
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    # Regression: squeeze to 1-D scores; classification: argmax over classes.
    preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
    if data_args.task_name is not None:
        result = metric.compute(predictions=preds, references=p.label_ids)
        # When the GLUE metric reports several values, add their mean as a summary.
        if len(result) > 1:
            result["combined_score"] = np.mean(list(
                result.values())).item()
        return result
    elif is_regression:
        return {"mse": ((preds - p.label_ids)**2).mean().item()}
    else:
        return {
            "accuracy":
            (preds == p.label_ids).astype(np.float32).mean().item()
        }

# Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
if data_args.pad_to_max_length:
    data_collator = default_data_collator
elif training_args.fp16:
    # Pad to a multiple of 8 so fp16 tensor cores can be used.
    data_collator = DataCollatorWithPadding(tokenizer,
                                            pad_to_multiple_of=8)
else:
    data_collator = None

# Initialize our Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset if training_args.do_eval else None,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
# Pruning callback kept for reference but currently disabled.
# trainer.add_callback(
#     PruneCallbackHandler(
#         model_args.prune_method,
#         model_args.prune_n_layers,
#         data_args.task_name)
# )
trainer.add_callback(DisableCheckpointCallbackHandler())

# Training
if training_args.do_train:
    # Resume priority: detected checkpoint, then a local model directory, else fresh.
    if last_checkpoint is not None:
        checkpoint = last_checkpoint
    elif os.path.isdir(model_args.model_name_or_path):
        checkpoint = model_args.model_name_or_path
    else:
        checkpoint = None
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
    metrics = train_result.metrics

    #trainer.save_model()  # Saves the tokenizer too for easy upload

    output_train_file = os.path.join(training_args.output_dir,
                                     "train_results.txt")
    # Only rank 0 writes results in a distributed run.
    if trainer.is_world_process_zero():
        with open(output_train_file, "w") as writer:
            logger.info("***** Train results *****")
            for key, value in sorted(metrics.items()):
                logger.info(f" {key}: {value}")
                writer.write(f"{key}: {value}\n")

    # Need to save the state, since Trainer.save_model saves only the tokenizer with the model.
# (continues the do_train block) persist trainer state alongside the model outputs.
    trainer.state.save_to_json(
        os.path.join(training_args.output_dir, "trainer_state.json"))

# Evaluation
eval_results = {}
if training_args.do_eval:
    logger.info("*** Evaluate ***")

    # Loop to handle MNLI double evaluation (matched, mis-matched).
    tasks = [data_args.task_name]
    eval_datasets = [eval_dataset]
    if data_args.task_name == "mnli":
        tasks.append("mnli-mm")
        eval_datasets.append(datasets["validation_mismatched"])

    for eval_dataset, task in zip(eval_datasets, tasks):
        # eval_result = trainer.evaluate(eval_dataset=eval_dataset)
        start = time.time()
        eval_result = trainer.evaluate(eval_dataset=eval_dataset)
        print(f"TIMING: {(time.time() - start)} ")
        # NOTE(review): this exit() aborts the script right after timing the
        # first eval pass, so eval result files are never written, do_predict
        # never runs, and eval_results is never returned. Presumably temporary
        # benchmarking code — confirm and remove before regular use.
        exit()

        output_eval_file = os.path.join(training_args.output_dir,
                                        f"eval_results_{task}.txt")
        # Only rank 0 writes results in a distributed run.
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info(f"***** Eval results {task} *****")
                for key, value in sorted(eval_result.items()):
                    logger.info(f" {key}: {value}")
                    writer.write(f"{key}: {value}\n")

        eval_results.update(eval_result)

if training_args.do_predict:
    logger.info("*** Test ***")

    # Loop to handle MNLI double evaluation (matched, mis-matched).
    tasks = [data_args.task_name]
    test_datasets = [test_dataset]
    if data_args.task_name == "mnli":
        tasks.append("mnli-mm")
        test_datasets.append(datasets["test_mismatched"])

    for test_dataset, task in zip(test_datasets, tasks):
        # Removing the `label` columns because it contains -1 and Trainer won't like that.
# (continues the do_predict loop) predict each test split and write a TSV of predictions.
        # NOTE(review): in-place remove_columns_ was deprecated in later
        # `datasets` releases in favour of remove_columns — verify the pinned version.
        test_dataset.remove_columns_("label")
        predictions = trainer.predict(
            test_dataset=test_dataset).predictions
        # Regression: squeeze to 1-D scores; classification: argmax over classes.
        predictions = np.squeeze(
            predictions) if is_regression else np.argmax(predictions,
                                                         axis=1)

        output_test_file = os.path.join(training_args.output_dir,
                                        f"test_results_{task}.txt")
        # Only rank 0 writes results in a distributed run.
        if trainer.is_world_process_zero():
            with open(output_test_file, "w") as writer:
                logger.info(f"***** Test results {task} *****")
                # Header line, then one tab-separated prediction per row.
                writer.write("index\tprediction\n")
                for index, item in enumerate(predictions):
                    if is_regression:
                        writer.write(f"{index}\t{item:3.3f}\n")
                    else:
                        # Map the class index back to its string label.
                        item = label_list[item]
                        writer.write(f"{index}\t{item}\n")

# Aggregated evaluation metrics from all eval splits.
return eval_results
# (trailing keyword arguments of the TrainingArguments(...) call opened above)
    metric_for_best_model=args.criterion,
    load_best_model_at_end=True)

collator = get_collator(tokenizer)
# Stop training after 5 evaluations without improvement on the chosen criterion.
es_callback = EarlyStoppingCallback(early_stopping_patience=5)
print(f"- Training args: {training_args}")

# Trainer wired with an externally-built optimizer/scheduler pair.
trainer = Trainer(model,
                  args=training_args,
                  train_dataset=train_ds,
                  eval_dataset=test_ds,
                  compute_metrics=compute_metrics,
                  optimizers=(optimizer, scheduler),
                  data_collator=collator)
trainer.add_callback(es_callback)
trainer.train()

# Show how integer class ids map back to label strings.
print(f"- Label encoder mapping:")
for i, label in enumerate(label_encoder.classes_):
    print(f"\t{i}: {label}")

if args.save_test_preds:
    print(f"- Saving predictions to {args.save_test_preds}")
    preds = trainer.predict(test_ds)
    # preds[0] holds the logits; take the argmax class per row.
    y_pred = np.argmax(preds[0], axis=1)
    test_df = pd.read_csv(args.test_file, index_col=0).dropna()
    # NOTE(review): rows dropped by dropna() must match the rows used to build
    # test_ds, or predictions will be misaligned — confirm against the dataset loader.
    test_df["prediction"] = label_encoder.inverse_transform(y_pred)