def test_default_regression(self):
    MODEL_ID = "distilroberta-base"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    data_args = GlueDataTrainingArguments(
        task_name="sts-b", data_dir="./tests/fixtures/tests_samples/STS-B", overwrite_cache=True
    )
    dataset = GlueDataset(data_args, tokenizer=tokenizer, evaluate=True)
    data_collator = DefaultDataCollator()
    batch = data_collator.collate_batch(dataset.features)
    self.assertEqual(batch["labels"].dtype, torch.float)
def test_default_classification(self):
    MODEL_ID = "bert-base-cased-finetuned-mrpc"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    data_args = GlueDataTrainingArguments(
        task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True
    )
    dataset = GlueDataset(data_args, tokenizer=tokenizer, evaluate=True)
    data_collator = DefaultDataCollator()
    batch = data_collator.collate_batch(dataset.features)
    self.assertEqual(batch["labels"].dtype, torch.long)
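# --- Illustrative sketch (not part of the original tests) ---
# The two tests above only assert which dtype the default collator gives the "labels"
# tensor: float for a regression task (STS-B) and long for classification (MRPC).
# Below is a minimal, self-contained stand-in that mirrors that rule on toy features;
# `ToyFeature` and `collate_labels` are invented names, not the real DefaultDataCollator.
from dataclasses import dataclass

import torch


@dataclass
class ToyFeature:
    input_ids: list
    label: float  # a float label implies regression, an int label implies classification


def collate_labels(features):
    first_label = features[0].label
    # Integer labels become class indices (torch.long); float labels become regression targets (torch.float).
    dtype = torch.long if isinstance(first_label, int) else torch.float
    return {"labels": torch.tensor([f.label for f in features], dtype=dtype)}


assert collate_labels([ToyFeature([1, 2], 0.5)])["labels"].dtype == torch.float
assert collate_labels([ToyFeature([1, 2], 1)])["labels"].dtype == torch.long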
def __init__(
    self,
    model: PreTrainedModel,
    args: TrainingArguments,
    data_collator: Optional[DataCollator] = None,
    train_dataset: Optional[Dataset] = None,
    eval_dataset: Optional[Dataset] = None,
    compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
    prediction_loss_only=False,
    tb_writer: Optional["SummaryWriter"] = None,
    optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = None,
    mi_estimator: Optional[CLUB] = None,
):
    """
    Trainer is a simple but feature-complete training and eval loop for PyTorch,
    optimized for Transformers.

    Args:
        prediction_loss_only:
            (Optional) in evaluation and prediction, only return the loss
    """
    self.model = model.to(args.device)
    self.args = args
    if data_collator is not None:
        self.data_collator = data_collator
    else:
        self.data_collator = DefaultDataCollator()
    self.train_dataset = train_dataset
    self.eval_dataset = eval_dataset
    self.compute_metrics = compute_metrics
    self.prediction_loss_only = prediction_loss_only
    self.mi_estimator = mi_estimator
    self.optimizers = optimizers
    if tb_writer is not None:
        self.tb_writer = tb_writer
    elif is_tensorboard_available() and self.is_world_master():
        self.tb_writer = SummaryWriter(log_dir=self.args.logging_dir)
    if not is_tensorboard_available():
        logger.warning(
            "You are instantiating a Trainer but Tensorboard is not installed. You should consider installing it."
        )
    if is_wandb_available():
        self._setup_wandb()
    else:
        logger.info(
            "You are instantiating a Trainer but W&B is not installed. To use wandb logging, "
            "run `pip install wandb; wandb login` see https://docs.wandb.com/huggingface."
        )
    set_seed(self.args.seed)
    # Create output directory if needed
    if self.is_world_master():
        os.makedirs(self.args.output_dir, exist_ok=True)
    if is_tpu_available():
        # Set an xla_device flag on the model's config.
        # We'll find a more elegant way and won't need to do this in the future.
        self.model.config.xla_device = True
def __init__(
    self,
    model: PreTrainedModel,
    args: TrainingArguments,
    device,
    data_collator: Optional[DataCollator] = None,
    train_dataset: Optional[Dataset] = None,
    eval_dataset: Optional[Dataset] = None,
    compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
    prediction_loss_only=False,
    tb_writer: Optional["SummaryWriter"] = None,
    optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = None,
):
    """
    Trainer is a simple but feature-complete training and eval loop for PyTorch,
    optimized for Transformers.

    Args:
        prediction_loss_only:
            (Optional) in evaluation and prediction, only return the loss
    """
    self.model = model.to(device)
    self.args = args
    self.device = device
    if data_collator is not None:
        self.data_collator = data_collator
    else:
        self.data_collator = DefaultDataCollator()
    self.train_dataset = train_dataset
    self.eval_dataset = eval_dataset
    self.compute_metrics = compute_metrics
    self.prediction_loss_only = prediction_loss_only
    self.optimizers = optimizers
    if tb_writer is not None:
        self.tb_writer = tb_writer
    elif is_tensorboard_available() and self.is_world_master():
        self.tb_writer = SummaryWriter(log_dir=self.args.logging_dir)
    if not is_tensorboard_available():
        logger.warning(
            "You are instantiating a Trainer but Tensorboard is not installed. You should consider installing it."
        )
    self._setup_wandb()
    set_seed(self.args.seed)
    # Create output directory if needed
    if self.is_world_master():
        os.makedirs(self.args.output_dir, exist_ok=True)
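# --- Illustrative usage sketch (not part of the original code) ---
# Both constructors above take a model, a TrainingArguments instance and optional datasets,
# collator and metrics callback. The snippet below shows one plausible way to instantiate the
# first variant, reusing the GLUE fixtures from the tests; `simple_accuracy` is an invented
# helper and the whole block is a sketch, not the project's documented entry point.
import numpy as np

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    GlueDataset,
    GlueDataTrainingArguments,
    TrainingArguments,
)


def simple_accuracy(p):  # p is an EvalPrediction with .predictions and .label_ids
    preds = np.argmax(p.predictions, axis=1)
    return {"acc": float((preds == p.label_ids).mean())}


tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
data_args = GlueDataTrainingArguments(
    task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True
)
eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, evaluate=True)
training_args = TrainingArguments(output_dir="./tmp_trainer_output")

# `Trainer` here refers to the class whose __init__ is shown above.
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=eval_dataset,
    compute_metrics=simple_accuracy,
)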
def main(): # region Argument Parsing parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TFTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Sanity checks if data_args.dataset_name is None and data_args.train_file is None and data_args.validation_file is None: raise ValueError( "Need either a dataset name or a training/validation file.") else: if data_args.train_file is not None: extension = data_args.train_file.split(".")[-1] assert extension in [ "csv", "json", "txt" ], "`train_file` should be a csv, json or txt file." if data_args.validation_file is not None: extension = data_args.validation_file.split(".")[-1] assert extension in [ "csv", "json", "txt" ], "`validation_file` should be a csv, json or txt file." if training_args.output_dir is not None: training_args.output_dir = Path(training_args.output_dir) os.makedirs(training_args.output_dir, exist_ok=True) # endregion # region Checkpoints # Detecting last checkpoint. checkpoint = None if len(os.listdir(training_args.output_dir) ) > 0 and not training_args.overwrite_output_dir: config_path = training_args.output_dir / CONFIG_NAME weights_path = training_args.output_dir / TF2_WEIGHTS_NAME if config_path.is_file() and weights_path.is_file(): checkpoint = training_args.output_dir logger.info( f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this" " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) else: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to continue regardless.") # endregion # region Setup logging # accelerator.is_local_main_process is only True for one process per machine. logger.setLevel(logging.INFO) datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() # endregion # If passed along, set the training seed now. if training_args.seed is not None: set_seed(training_args.seed) # region Load datasets # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. 
        raw_datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
            )
            raw_datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
            )
    else:
        data_files = {}
        dataset_args = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
            dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
        raw_datasets = load_dataset(extension, data_files=data_files, **dataset_args)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
    # endregion

    # region Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )
    # endregion

    # region Dataset preprocessing
    # First we tokenize all the texts.
    column_names = raw_datasets["train"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    def tokenize_function(examples):
        return tokenizer(examples[text_column_name])

    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not data_args.overwrite_cache,
        desc="Running tokenizer on dataset",
    )

    if data_args.block_size is None:
        block_size = tokenizer.model_max_length
        if block_size > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --block_size xxx."
            )
            block_size = 1024
    else:
        if data_args.block_size > tokenizer.model_max_length:
            logger.warning(
                f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model "
                f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
            )
        block_size = min(data_args.block_size, tokenizer.model_max_length)

    # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
    def group_texts(examples):
        # Concatenate all texts.
concatenated_examples = { k: list(chain(*examples[k])) for k in examples.keys() } total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. if total_length >= block_size: total_length = (total_length // block_size) * block_size # Split by chunks of max_len. result = { k: [t[i:i + block_size] for i in range(0, total_length, block_size)] for k, t in concatenated_examples.items() } result["labels"] = result["input_ids"].copy() return result # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower # to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map lm_datasets = tokenized_datasets.map( group_texts, batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, desc=f"Grouping texts in chunks of {block_size}", ) train_dataset = lm_datasets["train"] if data_args.validation_file is not None: eval_dataset = lm_datasets["validation"] else: logger.info( f"Validation file not found: using {data_args.validation_split_percentage}% of the dataset as validation as provided in data_args" ) train_indices, val_indices = train_test_split( list(range(len(train_dataset))), test_size=data_args.validation_split_percentage / 100) eval_dataset = train_dataset.select(val_indices) train_dataset = train_dataset.select(train_indices) if data_args.max_train_samples is not None: train_dataset = train_dataset.select(range( data_args.max_train_samples)) if data_args.max_eval_samples is not None: eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info( f"Sample {index} of the training set: {train_dataset[index]}.") # endregion with training_args.strategy.scope(): # region Prepare model if checkpoint is not None: model = TFAutoModelForCausalLM.from_pretrained(checkpoint, config=config) elif model_args.model_name_or_path: model = TFAutoModelForCausalLM.from_pretrained( model_args.model_name_or_path, config=config) else: logger.info("Training new model from scratch") model = TFAutoModelForCausalLM.from_config(config) model.resize_token_embeddings(len(tokenizer)) # endregion # region TF Dataset preparation num_replicas = training_args.strategy.num_replicas_in_sync data_collator = DefaultDataCollator(return_tensors="tf") options = tf.data.Options() options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF tf_train_dataset = train_dataset.to_tf_dataset( # labels are passed as input, as we will use the model's internal loss columns=[ col for col in train_dataset.features if col != "special_tokens_mask" ], shuffle=True, batch_size=num_replicas * training_args.per_device_train_batch_size, collate_fn=data_collator, drop_remainder=True, ).with_options(options) tf_eval_dataset = eval_dataset.to_tf_dataset( # labels are passed as input, as we will use the model's internal loss columns=[ col for col in eval_dataset.features if col != "special_tokens_mask" ], shuffle=False, batch_size=num_replicas * 
training_args.per_device_train_batch_size, collate_fn=data_collator, drop_remainder=True, ).with_options(options) # endregion # region Optimizer and loss batches_per_epoch = len(train_dataset) // ( num_replicas * training_args.per_device_train_batch_size) # Bias and layernorm weights are automatically excluded from the decay optimizer, lr_schedule = create_optimizer( init_lr=training_args.learning_rate, num_train_steps=int(training_args.num_train_epochs * batches_per_epoch), num_warmup_steps=training_args.warmup_steps, adam_beta1=training_args.adam_beta1, adam_beta2=training_args.adam_beta2, adam_epsilon=training_args.adam_epsilon, weight_decay_rate=training_args.weight_decay, ) # no user-specified loss = will use the model internal loss model.compile(optimizer=optimizer) # endregion # region Training and validation logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {training_args.num_train_epochs}") logger.info( f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}" ) logger.info( f" Total train batch size = {training_args.per_device_train_batch_size * num_replicas}" ) history = model.fit( tf_train_dataset, validation_data=tf_eval_dataset, epochs=int(training_args.num_train_epochs), steps_per_epoch=len(train_dataset) // (training_args.per_device_train_batch_size * num_replicas), callbacks=[ SavePretrainedCallback(output_dir=training_args.output_dir) ], ) try: train_perplexity = math.exp(history.history["loss"][-1]) except OverflowError: train_perplexity = math.inf try: validation_perplexity = math.exp(history.history["val_loss"][-1]) except OverflowError: validation_perplexity = math.inf logger.info(f" Final train loss: {history.history['loss'][-1]:.3f}") logger.info(f" Final train perplexity: {train_perplexity:.3f}") logger.info( f" Final validation loss: {history.history['val_loss'][-1]:.3f}") logger.info( f" Final validation perplexity: {validation_perplexity:.3f}") # endregion if training_args.output_dir is not None: model.save_pretrained(training_args.output_dir) if training_args.push_to_hub: # You'll probably want to include some of your own metadata here! model.push_to_hub()
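# --- Illustrative sketch (not part of the original script) ---
# The causal-LM script above concatenates every tokenized text and slices the result into
# fixed-size blocks, dropping the remainder. The same logic is reproduced here on plain
# Python lists with a tiny block_size so the behaviour is easy to inspect; the real script
# applies it through `datasets.Dataset.map(..., batched=True)`.
from itertools import chain

block_size = 4  # the script defaults to 1024 (or the tokenizer's model_max_length)


def group_texts(examples):
    # Concatenate all texts, then split into blocks of block_size, dropping the tail.
    concatenated = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated[list(examples.keys())[0]])
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated.items()
    }
    # For causal LM the labels are a copy of the inputs; the one-token shift happens inside the model.
    result["labels"] = result["input_ids"].copy()
    return result


print(group_texts({"input_ids": [[1, 2, 3], [4, 5, 6, 7], [8, 9]]}))
# {'input_ids': [[1, 2, 3, 4], [5, 6, 7, 8]], 'labels': [[1, 2, 3, 4], [5, 6, 7, 8]]}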
def main(): # region Argument parsing # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TFTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) output_dir = Path(training_args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) # endregion # region Logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) transformers.utils.logging.set_verbosity(log_level) transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() # endregion # region Checkpoints checkpoint = None if len(os.listdir(training_args.output_dir) ) > 0 and not training_args.overwrite_output_dir: if (output_dir / CONFIG_NAME).is_file() and ( output_dir / TF2_WEIGHTS_NAME).is_file(): checkpoint = output_dir logger.info( f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this" " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) else: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to continue regardless.") # endregion # Set seed before initializing model. set_seed(training_args.seed) # region Load datasets # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.train_file is not None or data_args.validation_file is not None: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.train_file.split(".")[-1] raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) else: # Downloading and loading the swag dataset from the hub. raw_datasets = load_dataset("swag", "regular", cache_dir=model_args.cache_dir) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # When using your own dataset or a different dataset from swag, you will probably need to change this. 
ending_names = [f"ending{i}" for i in range(4)] context_name = "sent1" question_header_name = "sent2" # endregion # region Load model config and tokenizer if checkpoint is not None: config_path = training_args.output_dir elif model_args.config_name: config_path = model_args.config_name else: config_path = model_args.model_name_or_path # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( config_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # endregion # region Dataset preprocessing if data_args.max_seq_length is None: max_seq_length = tokenizer.model_max_length if max_seq_length > 1024: logger.warning( f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx." ) max_seq_length = 1024 else: if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) def preprocess_function(examples): first_sentences = [[context] * 4 for context in examples[context_name]] question_headers = examples[question_header_name] second_sentences = [[ f"{header} {examples[end][i]}" for end in ending_names ] for i, header in enumerate(question_headers)] # Flatten out first_sentences = list(chain(*first_sentences)) second_sentences = list(chain(*second_sentences)) # Tokenize tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True, max_length=max_seq_length) # Un-flatten data = { k: [v[i:i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items() } return data if training_args.do_train: if "train" not in raw_datasets: raise ValueError("--do_train requires a train dataset") train_dataset = raw_datasets["train"] non_label_columns = [ feature for feature in train_dataset.features if feature not in ("label", "labels") ] if data_args.max_train_samples is not None: train_dataset = train_dataset.select( range(data_args.max_train_samples)) with training_args.main_process_first( desc="train dataset map pre-processing"): train_dataset = train_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, ) if training_args.do_eval: if "validation" not in raw_datasets: raise ValueError("--do_eval requires a validation dataset") eval_dataset = raw_datasets["validation"] if not training_args.do_train: non_label_columns = [ feature for feature in eval_dataset.features if feature not in ("label", "labels") ] if data_args.max_eval_samples is not None: eval_dataset = eval_dataset.select( range(data_args.max_eval_samples)) with training_args.main_process_first( desc="validation dataset map pre-processing"): eval_dataset = eval_dataset.map( preprocess_function, batched=True, 
num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, ) if data_args.pad_to_max_length: data_collator = DefaultDataCollator(return_tensors="tf") else: # custom class defined above, as HF has no data collator for multiple choice data_collator = DataCollatorForMultipleChoice(tokenizer) # endregion with training_args.strategy.scope(): # region Build model if checkpoint is None: model_path = model_args.model_name_or_path else: model_path = checkpoint model = TFAutoModelForMultipleChoice.from_pretrained( model_path, config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) num_replicas = training_args.strategy.num_replicas_in_sync total_train_batch_size = training_args.per_device_train_batch_size * num_replicas total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas if training_args.do_train: total_train_steps = (len(train_dataset) // total_train_batch_size ) * int(training_args.num_train_epochs) optimizer, lr_schedule = create_optimizer( init_lr=training_args.learning_rate, num_train_steps=int(total_train_steps), num_warmup_steps=0) else: optimizer = "adam" # Just put anything in here, since we're not using it anyway model.compile( optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy( from_logits=True), metrics=[ tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy") ], ) # endregion # region Training if training_args.do_train: dataset_exclude_cols = set(non_label_columns + ["label"]) tf_train_dataset = train_dataset.to_tf_dataset( columns=[ col for col in train_dataset.column_names if col not in dataset_exclude_cols ], shuffle=True, batch_size=total_train_batch_size, collate_fn=data_collator, drop_remainder=True, # `label_cols` is needed for user-defined losses, such as in this example label_cols="label" if "label" in train_dataset.column_names else None, ) if training_args.do_eval: validation_data = eval_dataset.to_tf_dataset( columns=[ col for col in eval_dataset.column_names if col not in dataset_exclude_cols ], shuffle=False, batch_size=total_eval_batch_size, collate_fn=data_collator, drop_remainder=True, # `label_cols` is needed for user-defined losses, such as in this example label_cols="label" if "label" in eval_dataset.column_names else None, ) else: validation_data = None model.fit( tf_train_dataset, validation_data=validation_data, epochs=int(training_args.num_train_epochs), callbacks=[ SavePretrainedCallback(output_dir=training_args.output_dir) ], ) # endregion # region Evaluation if training_args.do_eval and not training_args.do_train: dataset_exclude_cols = set(non_label_columns + ["label"]) # Do a standalone evaluation pass tf_eval_dataset = eval_dataset.to_tf_dataset( columns=[ col for col in eval_dataset.column_names if col not in dataset_exclude_cols ], shuffle=False, batch_size=total_eval_batch_size, collate_fn=data_collator, drop_remainder=True, # `label_cols` is needed for user-defined losses, such as in this example label_cols="label" if "label" in eval_dataset.column_names else None, ) model.evaluate(tf_eval_dataset) # endregion # region Push to hub if training_args.push_to_hub: model.push_to_hub( finetuned_from=model_args.model_name_or_path, tasks="multiple-choice", dataset_tags="swag", dataset_args="regular", dataset="SWAG", language="en", )
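# --- Illustrative sketch (not part of the original script) ---
# The SWAG preprocessing above repeats each context four times (once per candidate ending),
# tokenizes the flattened pairs, and then re-groups every four rows into one example so the
# multiple-choice model sees shape (batch, 4, seq_len). The flatten/un-flatten step is shown
# here with made-up data and a string stand-in for the tokenizer.
from itertools import chain

examples = {
    "sent1": ["A man is sitting at a piano."],
    "sent2": ["He"],
    "ending0": ["starts to play."],
    "ending1": ["falls asleep."],
    "ending2": ["leaves the room."],
    "ending3": ["closes the lid."],
}
ending_names = [f"ending{i}" for i in range(4)]

first_sentences = list(chain(*[[ctx] * 4 for ctx in examples["sent1"]]))
second_sentences = list(
    chain(*[[f"{header} {examples[end][i]}" for end in ending_names]
            for i, header in enumerate(examples["sent2"])])
)

# Stand-in for `tokenizer(first_sentences, second_sentences, ...)`: keep the raw pairs.
flat = {"pair": list(zip(first_sentences, second_sentences))}

# Un-flatten: each consecutive group of 4 rows belongs to one original example.
unflat = {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in flat.items()}
print(len(unflat["pair"]), len(unflat["pair"][0]))  # -> 1 4 (one example, four choices)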
def main(): # region Argument parsing # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TFTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_glue", model_args, data_args, framework="tensorflow") if not (training_args.do_train or training_args.do_eval or training_args.do_predict): exit( "Must specify at least one of --do_train, --do_eval or --do_predict!" ) # endregion # region Checkpoints checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: checkpoint = get_last_checkpoint(training_args.output_dir) if checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # endregion # region Logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.info(f"Training/evaluation parameters {training_args}") # endregion # region Dataset and labels # Set seed before initializing model. set_seed(training_args.seed) # Downloading and loading a dataset from the hub. In distributed training, the load_dataset function guarantee # that only one local process can concurrently download the dataset. datasets = load_dataset( "glue", data_args.task_name, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. 
is_regression = data_args.task_name == "stsb" if not is_regression: label_list = datasets["train"].features["label"].names num_labels = len(label_list) else: num_labels = 1 if data_args.predict_file is not None: logger.info("Preparing user-supplied file for predictions...") data_files = {"data": data_args.predict_file} for key in data_files.keys(): logger.info(f"Loading a local file for {key}: {data_files[key]}") if data_args.predict_file.endswith(".csv"): # Loading a dataset from local csv files user_dataset = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir) else: # Loading a dataset from local json files user_dataset = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir) needed_keys = task_to_keys[data_args.task_name] for key in needed_keys: assert key in user_dataset[ "data"].features, f"Your supplied predict_file is missing the {key} key!" datasets["user_data"] = user_dataset["data"] # endregion # region Load model config and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # endregion # region Dataset preprocessing sentence1_key, sentence2_key = task_to_keys[data_args.task_name] non_label_column_names = [ name for name in datasets["train"].column_names if name != "label" ] # Padding strategy if data_args.pad_to_max_length: padding = "max_length" else: # We will pad later, dynamically at batch creation, to the max sequence length in each batch padding = False # Some models have set the order of the labels to use, so let's make sure we do use it. label_to_id = None if config.label2id != PretrainedConfig( num_labels=num_labels).label2id and not is_regression: # Some have all caps in their config, some don't. label_name_to_id = {k.lower(): v for k, v in config.label2id.items()} if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): label_to_id = { i: int(label_name_to_id[label_list[i]]) for i in range(num_labels) } else: logger.warning( "Your model seems to have been trained with labels, but they don't match the dataset: ", f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." "\nIgnoring the model labels as a result.", ) label_to_id = {label: i for i, label in enumerate(label_list)} if label_to_id is not None: config.label2id = label_to_id config.id2label = {id: label for label, id in config.label2id.items()} elif data_args.task_name is not None and not is_regression: config.label2id = {l: i for i, l in enumerate(label_list)} config.id2label = {id: label for label, id in config.label2id.items()} if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 
) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) def preprocess_function(examples): # Tokenize the texts args = ((examples[sentence1_key], ) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])) result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) return result datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) if data_args.pad_to_max_length: data_collator = DefaultDataCollator(return_tensors="tf") else: data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf") # endregion # region Metric function metric = evaluate.load("glue", data_args.task_name) def compute_metrics(preds, label_ids): preds = preds["logits"] preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) result = metric.compute(predictions=preds, references=label_ids) if len(result) > 1: result["combined_score"] = np.mean(list(result.values())).item() return result # endregion with training_args.strategy.scope(): # region Load pretrained model if checkpoint is None: model_path = model_args.model_name_or_path else: model_path = checkpoint model = TFAutoModelForSequenceClassification.from_pretrained( model_path, config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # endregion # region Optimizer, loss and compilation optimizer = tf.keras.optimizers.Adam( learning_rate=training_args.learning_rate, beta_1=training_args.adam_beta1, beta_2=training_args.adam_beta2, epsilon=training_args.adam_epsilon, clipnorm=training_args.max_grad_norm, ) if is_regression: loss_fn = tf.keras.losses.MeanSquaredError() metrics = [] else: loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( from_logits=True) metrics = ["accuracy"] model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics) # endregion # region Convert data to a tf.data.Dataset tf_data = dict() max_samples = { "train": data_args.max_train_samples, "validation": data_args.max_eval_samples, "validation_matched": data_args.max_eval_samples, "validation_mismatched": data_args.max_eval_samples, "test": data_args.max_predict_samples, "test_matched": data_args.max_predict_samples, "test_mismatched": data_args.max_predict_samples, "user_data": None, } for key in datasets.keys(): if key == "train" or key.startswith("validation"): assert "label" in datasets[ key].features, f"Missing labels from {key} data!" 
if key == "train": shuffle = True batch_size = training_args.per_device_train_batch_size drop_remainder = True # Saves us worrying about scaling gradients for the last batch else: shuffle = False batch_size = training_args.per_device_eval_batch_size drop_remainder = False samples_limit = max_samples[key] dataset = datasets[key] if samples_limit is not None: dataset = dataset.select(range(samples_limit)) data = dataset.to_tf_dataset( columns=[ col for col in dataset.column_names if col not in set(non_label_column_names + ["label"]) ], shuffle=shuffle, batch_size=batch_size, collate_fn=data_collator, drop_remainder=drop_remainder, # `label_cols` is needed for user-defined losses, such as in this example label_cols="label" if "label" in dataset.column_names else None, ) tf_data[key] = data # endregion # region Training and validation if training_args.do_train: callbacks = [ SavePretrainedCallback(output_dir=training_args.output_dir) ] if training_args.do_eval and not data_args.task_name == "mnli": # Do both evaluation and training in the Keras fit loop, unless the task is MNLI # because MNLI has two validation sets validation_data = tf_data["validation"] else: validation_data = None model.fit( tf_data["train"], validation_data=validation_data, epochs=int(training_args.num_train_epochs), callbacks=callbacks, ) # endregion # region Evaluation if training_args.do_eval: # We normally do validation as part of the Keras fit loop, but we run it independently # if there was no fit() step (because we didn't train the model) or if the task is MNLI, # because MNLI has a separate validation-mismatched validation set logger.info("*** Evaluate ***") # Loop to handle MNLI double evaluation (matched, mis-matched) if data_args.task_name == "mnli": tasks = ["mnli", "mnli-mm"] tf_datasets = [ tf_data["validation_matched"], tf_data["validation_mismatched"] ] raw_datasets = [ datasets["validation_matched"], datasets["validation_mismatched"] ] else: tasks = [data_args.task_name] tf_datasets = [tf_data["validation"]] raw_datasets = [datasets["validation"]] for raw_dataset, tf_dataset, task in zip(raw_datasets, tf_datasets, tasks): eval_predictions = model.predict(tf_dataset) eval_metrics = compute_metrics(eval_predictions, raw_dataset["label"]) print(f"Evaluation metrics ({task}):") print(eval_metrics) # endregion # region Prediction if training_args.do_predict or data_args.predict_file: logger.info("*** Predict ***") # Loop to handle MNLI double evaluation (matched, mis-matched) tasks = [] tf_datasets = [] raw_datasets = [] if training_args.do_predict: if data_args.task_name == "mnli": tasks.extend(["mnli", "mnli-mm"]) tf_datasets.extend( [tf_data["test_matched"], tf_data["test_mismatched"]]) raw_datasets.extend([ datasets["test_matched"], datasets["test_mismatched"] ]) else: tasks.append(data_args.task_name) tf_datasets.append(tf_data["test"]) raw_datasets.append(datasets["test"]) if data_args.predict_file: tasks.append("user_data") tf_datasets.append(tf_data["user_data"]) raw_datasets.append(datasets["user_data"]) for raw_dataset, tf_dataset, task in zip(raw_datasets, tf_datasets, tasks): test_predictions = model.predict(tf_dataset) if "label" in raw_dataset: test_metrics = compute_metrics(test_predictions, raw_dataset["label"]) print(f"Test metrics ({task}):") print(test_metrics) if is_regression: predictions_to_write = np.squeeze( test_predictions["logits"]) else: predictions_to_write = np.argmax( test_predictions["logits"], axis=1) output_predict_file = os.path.join( training_args.output_dir, 
f"predict_results_{task}.txt") with open(output_predict_file, "w") as writer: logger.info( f"***** Writing prediction results for {task} *****") writer.write("index\tprediction\n") for index, item in enumerate(predictions_to_write): if is_regression: writer.write(f"{index}\t{item:3.3f}\n") else: item = model.config.id2label[item] writer.write(f"{index}\t{item}\n")
def main(): # region Argument parsing # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_text_classification", model_args, data_args, framework="tensorflow") output_dir = Path(training_args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) # endregion # region Checkpoints # Detecting last checkpoint. checkpoint = None if len(os.listdir(training_args.output_dir)) > 0 and not training_args.overwrite_output_dir: if (output_dir / CONFIG_NAME).is_file() and (output_dir / TF2_WEIGHTS_NAME).is_file(): checkpoint = output_dir logger.info( f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this" " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) else: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to continue regardless." ) # endregion # region Logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO) logger.info(f"Training/evaluation parameters {training_args}") # endregion # region Loading data # For CSV/JSON files, this script will use the 'label' field as the label and the 'sentence1' and optionally # 'sentence2' fields as inputs if they exist. If not, the first two fields not named label are used if at least two # columns are provided. Note that the term 'sentence' can be slightly misleading, as they often contain more than # a single grammatical sentence, when the task requires it. # # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this # single column. You can easily tweak this behavior (see below) # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. data_files = {"train": data_args.train_file, "validation": data_args.validation_file, "test": data_args.test_file} data_files = {key: file for key, file in data_files.items() if file is not None} for key in data_files.keys(): logger.info(f"Loading a local file for {key}: {data_files[key]}") if data_args.input_file_extension == "csv": # Loading a dataset from local csv files datasets = load_dataset( "csv", data_files=data_files, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) else: # Loading a dataset from local json files datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. 
# endregion # region Label preprocessing # If you've passed us a training set, we try to infer your labels from it if "train" in datasets: # By default we assume that if your label column looks like a float then you're doing regression, # and if not then you're doing classification. This is something you may want to change! is_regression = datasets["train"].features["label"].dtype in ["float32", "float64"] if is_regression: num_labels = 1 else: # A useful fast method: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique label_list = datasets["train"].unique("label") label_list.sort() # Let's sort it for determinism num_labels = len(label_list) # If you haven't passed a training set, we read label info from the saved model (this happens later) else: num_labels = None label_list = None is_regression = None # endregion # region Load model config and tokenizer if checkpoint is not None: config_path = training_args.output_dir elif model_args.config_name: config_path = model_args.config_name else: config_path = model_args.model_name_or_path if num_labels is not None: config = AutoConfig.from_pretrained( config_path, num_labels=num_labels, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) else: config = AutoConfig.from_pretrained( config_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # endregion # region Dataset preprocessing # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. column_names = {col for cols in datasets.column_names.values() for col in cols} non_label_column_names = [name for name in column_names if name != "label"] if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: sentence1_key, sentence2_key = "sentence1", "sentence2" elif "sentence1" in non_label_column_names: sentence1_key, sentence2_key = "sentence1", None else: if len(non_label_column_names) >= 2: sentence1_key, sentence2_key = non_label_column_names[:2] else: sentence1_key, sentence2_key = non_label_column_names[0], None if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 
) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) # Ensure that our labels match the model's, if it has some pre-specified if "train" in datasets: if not is_regression and config.label2id != PretrainedConfig(num_labels=num_labels).label2id: label_name_to_id = config.label2id if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): label_to_id = label_name_to_id # Use the model's labels else: logger.warning( "Your model seems to have been trained with labels, but they don't match the dataset: ", f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels:" f" {list(sorted(label_list))}.\nIgnoring the model labels as a result.", ) label_to_id = {v: i for i, v in enumerate(label_list)} elif not is_regression: label_to_id = {v: i for i, v in enumerate(label_list)} else: label_to_id = None # Now we've established our label2id, let's overwrite the model config with it. config.label2id = label_to_id if config.label2id is not None: config.id2label = {id: label for label, id in label_to_id.items()} else: config.id2label = None else: label_to_id = config.label2id # Just load the data from the model if "validation" in datasets and config.label2id is not None: validation_label_list = datasets["validation"].unique("label") for val_label in validation_label_list: assert val_label in label_to_id, f"Label {val_label} is in the validation set but not the training set!" def preprocess_function(examples): # Tokenize the texts args = ( (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key]) ) result = tokenizer(*args, max_length=max_seq_length, truncation=True) # Map labels to IDs if config.label2id is not None and "label" in examples: result["label"] = [(config.label2id[l] if l != -1 else -1) for l in examples["label"]] return result datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) if data_args.pad_to_max_length: data_collator = DefaultDataCollator(return_tensors="tf") else: data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf") # endregion with training_args.strategy.scope(): # region Load pretrained model # Set seed before initializing model set_seed(training_args.seed) # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. 
if checkpoint is None: model_path = model_args.model_name_or_path else: model_path = checkpoint model = TFAutoModelForSequenceClassification.from_pretrained( model_path, config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # endregion # region Optimizer, loss and compilation optimizer = tf.keras.optimizers.Adam( learning_rate=training_args.learning_rate, beta_1=training_args.adam_beta1, beta_2=training_args.adam_beta2, epsilon=training_args.adam_epsilon, clipnorm=training_args.max_grad_norm, ) if is_regression: loss_fn = tf.keras.losses.MeanSquaredError() metrics = [] else: loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) metrics = ["accuracy"] model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics) # endregion # region Convert data to a tf.data.Dataset tf_data = dict() max_samples = { "train": data_args.max_train_samples, "validation": data_args.max_val_samples, "test": data_args.max_test_samples, } for key in ("train", "validation", "test"): if key not in datasets: tf_data[key] = None continue if key in ("train", "validation"): assert "label" in datasets[key].features, f"Missing labels from {key} data!" if key == "train": shuffle = True batch_size = training_args.per_device_train_batch_size drop_remainder = True # Saves us worrying about scaling gradients for the last batch else: shuffle = False batch_size = training_args.per_device_eval_batch_size drop_remainder = False samples_limit = max_samples[key] dataset = datasets[key] if samples_limit is not None: dataset = dataset.select(range(samples_limit)) data = dataset.to_tf_dataset( columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])], shuffle=shuffle, batch_size=batch_size, collate_fn=data_collator, drop_remainder=drop_remainder, # `label_cols` is needed for user-defined losses, such as in this example label_cols="label" if "label" in dataset.column_names else None, ) tf_data[key] = data # endregion # region Training and validation if tf_data["train"] is not None: callbacks = [SavePretrainedCallback(output_dir=training_args.output_dir)] model.fit( tf_data["train"], validation_data=tf_data["validation"], epochs=int(training_args.num_train_epochs), callbacks=callbacks, ) elif tf_data["validation"] is not None: # If there's a validation dataset but no training set, just evaluate the metrics logger.info("Computing metrics on validation data...") if is_regression: loss = model.evaluate(tf_data["validation"]) logger.info(f"Loss: {loss:.5f}") else: loss, accuracy = model.evaluate(tf_data["validation"]) logger.info(f"Loss: {loss:.5f}, Accuracy: {accuracy * 100:.4f}%") # endregion # region Prediction if tf_data["test"] is not None: logger.info("Doing predictions on test dataset...") predictions = model.predict(tf_data["test"])["logits"] predicted_class = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1) output_test_file = os.path.join(training_args.output_dir, "test_results.txt") with open(output_test_file, "w") as writer: writer.write("index\tprediction\n") for index, item in enumerate(predicted_class): if is_regression: writer.write(f"{index}\t{item:3.3f}\n") else: item = config.id2label[item] writer.write(f"{index}\t{item}\n") logger.info(f"Wrote predictions to {output_test_file}!") # endregion # region Prediction losses # This section is outside the scope() because it's very quick to compute, but behaves badly inside it if "test" in datasets and 
"label" in datasets["test"].features: print("Computing prediction loss on test labels...") labels = datasets["test"]["label"] loss = float(loss_fn(labels, predictions).numpy()) print(f"Test loss: {loss:.4f}")
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.", ) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help="Path to pretrained model or model identifier from huggingface.co/models", ) parser.add_argument( "--task_name", default=None, type=str, required=True, help="The name of the task to train selected in the list: " + ", ".join(glue_processors.keys()), ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.", ) # Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name_or_path", ) parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name_or_path", ) parser.add_argument( "--cache_dir", default=None, type=str, help="Where do you want to store the pre-trained models downloaded from s3", ) parser.add_argument( "--data_subset", type=int, default=-1, help="If > 0: limit the data to a subset of data_subset instances." ) parser.add_argument( "--overwrite_output_dir", action="store_true", help="Whether to overwrite data in output directory" ) parser.add_argument( "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" ) parser.add_argument( "--dont_normalize_importance_by_layer", action="store_true", help="Don't normalize importance score by layers" ) parser.add_argument( "--dont_normalize_global_importance", action="store_true", help="Don't normalize all importance scores between 0 and 1", ) parser.add_argument( "--try_masking", action="store_true", help="Whether to try to mask head until a threshold of accuracy." ) parser.add_argument( "--masking_threshold", default=0.9, type=float, help="masking threshold in term of metrics (stop masking when metric < threshold * original metric value).", ) parser.add_argument( "--masking_amount", default=0.1, type=float, help="Amount to heads to masking at each masking step." ) parser.add_argument("--metric_name", default="acc", type=str, help="Metric to use for head masking.") parser.add_argument( "--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. 
\n" "Sequences longer than this will be truncated, sequences shorter padded.", ) parser.add_argument("--batch_size", default=1, type=int, help="Batch size.") parser.add_argument("--seed", type=int, default=42) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.") parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() # Setup devices and distributed training if args.local_rank == -1 or args.no_cuda: args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) args.device = torch.device("cuda", args.local_rank) args.n_gpu = 1 torch.distributed.init_process_group(backend="nccl") # Initializes the distributed backend # Setup logging logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info("device: {} n_gpu: {}, distributed: {}".format(args.device, args.n_gpu, bool(args.local_rank != -1))) # Set seeds set_seed(args.seed) # Prepare GLUE task args.task_name = args.task_name.lower() if args.task_name not in glue_processors: raise ValueError("Task not found: %s" % (args.task_name)) processor = glue_processors[args.task_name]() args.output_mode = glue_output_modes[args.task_name] label_list = processor.get_labels() num_labels = len(label_list) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. 
config = AutoConfig.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name, output_attentions=True, cache_dir=args.cache_dir, ) tokenizer = AutoTokenizer.from_pretrained( args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, cache_dir=args.cache_dir, ) model = AutoModelForSequenceClassification.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, cache_dir=args.cache_dir, ) # Distributed and parallel training model.to(args.device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True ) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) # Print/save training arguments os.makedirs(args.output_dir, exist_ok=True) torch.save(args, os.path.join(args.output_dir, "run_args.bin")) logger.info("Training/evaluation parameters %s", args) # Prepare dataset for the GLUE task eval_dataset = GlueDataset(args, tokenizer=tokenizer, mode="dev") if args.data_subset > 0: eval_dataset = Subset(eval_dataset, list(range(min(args.data_subset, len(eval_dataset))))) eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) eval_dataloader = DataLoader( eval_dataset, sampler=eval_sampler, batch_size=args.batch_size, collate_fn=DefaultDataCollator().collate_batch ) # Compute head entropy and importance score compute_heads_importance(args, model, eval_dataloader) # Try head masking (set heads to zero until the score goes under a threshold) # and head pruning (remove masked heads and see the effect on the network) if args.try_masking and args.masking_threshold > 0.0 and args.masking_threshold < 1.0: head_mask = mask_heads(args, model, eval_dataloader) prune_heads(args, model, eval_dataloader, head_mask)
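For reference, the head_mask produced by mask_heads above is (in the usual bertology setup) a (num_layers, num_heads) tensor of ones and zeros, and pruning amounts to handing the zeroed positions to PreTrainedModel.prune_heads. A minimal sketch of that conversion follows, using a hypothetical apply_head_mask helper rather than the prune_heads function defined in this script:

import torch

def apply_head_mask(model, head_mask: torch.Tensor):
    # Hypothetical helper: head_mask is assumed to have shape (num_layers, num_heads),
    # with 0 marking heads that should be removed.
    heads_to_prune = {
        layer: [head for head in range(head_mask.size(1)) if head_mask[layer, head] == 0]
        for layer in range(head_mask.size(0))
    }
    # Keep only layers that actually lose heads.
    heads_to_prune = {layer: heads for layer, heads in heads_to_prune.items() if heads}
    # PreTrainedModel.prune_heads expects {layer_index: [head_indices]}.
    model.prune_heads(heads_to_prune)
    return heads_to_prune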
def main(): # region Argument parsing # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_qa", model_args, data_args, framework="tensorflow") output_dir = Path(training_args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) # endregion # region Checkpoints checkpoint = None if len(os.listdir(training_args.output_dir)) > 0 and not training_args.overwrite_output_dir: if (output_dir / CONFIG_NAME).is_file() and (output_dir / TF2_WEIGHTS_NAME).is_file(): checkpoint = output_dir logger.info( f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this" " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) else: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to continue regardless." ) # endregion # region Logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if training_args.should_log else logging.WARN) # Set the verbosity to info of the Transformers logger (on main process only): if training_args.should_log: transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.info(f"Training/evaluation parameters {training_args}") # endregion # Set seed before initializing model. set_seed(training_args.seed) # region Load Data # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. 
datasets = load_dataset( data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file extension = data_args.train_file.split(".")[-1] if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.validation_file.split(".")[-1] if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] datasets = load_dataset( extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # endregion # region Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=True, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # endregion # region Tokenizer check: this script requires a fast tokenizer. if not isinstance(tokenizer, PreTrainedTokenizerFast): raise ValueError( "This example script only works for models that have a fast tokenizer. Checkout the big table of models at" " https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet" " this requirement" ) # endregion # region Preprocessing the datasets # Preprocessing is slightly different for training and evaluation. if training_args.do_train: column_names = datasets["train"].column_names elif training_args.do_eval: column_names = datasets["validation"].column_names else: column_names = datasets["test"].column_names question_column_name = "question" if "question" in column_names else column_names[0] context_column_name = "context" if "context" in column_names else column_names[1] answer_column_name = "answers" if "answers" in column_names else column_names[2] # Padding side determines if we do (question|context) or (context|question). pad_on_right = tokenizer.padding_side == "right" if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 
) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) if data_args.pad_to_max_length or isinstance(training_args.strategy, tf.distribute.TPUStrategy): logger.info("Padding all batches to max length because argument was set or we're on TPU.") padding = "max_length" else: padding = False # Training preprocessing def prepare_train_features(examples): # Some of the questions have lots of whitespace on the left, which is not useful and will make the # truncation of the context fail (the tokenized question will take a lots of space). So we remove that # left whitespace examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]] # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. tokenized_examples = tokenizer( examples[question_column_name if pad_on_right else context_column_name], examples[context_column_name if pad_on_right else question_column_name], truncation="only_second" if pad_on_right else "only_first", max_length=max_seq_length, stride=data_args.doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True, padding=padding, ) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") # The offset mappings will give us a map from token to character position in the original context. This will # help us compute the start_positions and end_positions. offset_mapping = tokenized_examples.pop("offset_mapping") # Let's label those examples! tokenized_examples["start_positions"] = [] tokenized_examples["end_positions"] = [] for i, offsets in enumerate(offset_mapping): # We will label impossible answers with the index of the CLS token. input_ids = tokenized_examples["input_ids"][i] cls_index = input_ids.index(tokenizer.cls_token_id) # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples.sequence_ids(i) # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] answers = examples[answer_column_name][sample_index] # If no answers are given, set the cls_index as answer. if len(answers["answer_start"]) == 0: tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) else: # Start/end character index of the answer in the text. start_char = answers["answer_start"][0] end_char = start_char + len(answers["text"][0]) # Start token index of the current span in the text. token_start_index = 0 while sequence_ids[token_start_index] != (1 if pad_on_right else 0): token_start_index += 1 # End token index of the current span in the text. token_end_index = len(input_ids) - 1 while sequence_ids[token_end_index] != (1 if pad_on_right else 0): token_end_index -= 1 # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). 
if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) else: # Otherwise move the token_start_index and token_end_index to the two ends of the answer. # Note: we could go after the last offset if the answer is the last word (edge case). while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char: token_start_index += 1 tokenized_examples["start_positions"].append(token_start_index - 1) while offsets[token_end_index][1] >= end_char: token_end_index -= 1 tokenized_examples["end_positions"].append(token_end_index + 1) return tokenized_examples processed_datasets = dict() if training_args.do_train: if "train" not in datasets: raise ValueError("--do_train requires a train dataset") train_dataset = datasets["train"] if data_args.max_train_samples is not None: # We will select sample from whole data if agument is specified max_train_samples = min(len(train_dataset), data_args.max_train_samples) train_dataset = train_dataset.select(range(max_train_samples)) # Create train feature from dataset train_dataset = train_dataset.map( prepare_train_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) if data_args.max_train_samples is not None: # Number of samples might increase during Feature Creation, We select only specified max samples max_train_samples = min(len(train_dataset), data_args.max_train_samples) train_dataset = train_dataset.select(range(max_train_samples)) processed_datasets["train"] = train_dataset # Validation preprocessing def prepare_validation_features(examples): # Some of the questions have lots of whitespace on the left, which is not useful and will make the # truncation of the context fail (the tokenized question will take a lots of space). So we remove that # left whitespace examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]] # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. tokenized_examples = tokenizer( examples[question_column_name if pad_on_right else context_column_name], examples[context_column_name if pad_on_right else question_column_name], truncation="only_second" if pad_on_right else "only_first", max_length=max_seq_length, stride=data_args.doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True, padding=padding, ) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the # corresponding example_id and we will store the offset mappings. tokenized_examples["example_id"] = [] for i in range(len(tokenized_examples["input_ids"])): # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples.sequence_ids(i) context_index = 1 if pad_on_right else 0 # One example can give several spans, this is the index of the example containing this span of text. 
sample_index = sample_mapping[i] tokenized_examples["example_id"].append(examples["id"][sample_index]) # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token # position is part of the context or not. tokenized_examples["offset_mapping"][i] = [ (o if sequence_ids[k] == context_index else None) for k, o in enumerate(tokenized_examples["offset_mapping"][i]) ] return tokenized_examples if training_args.do_eval: if "validation" not in datasets: raise ValueError("--do_eval requires a validation dataset") eval_examples = datasets["validation"] if data_args.max_eval_samples is not None: # We will select sample from whole data max_eval_samples = min(len(eval_examples), data_args.max_eval_samples) eval_examples = eval_examples.select(range(max_eval_samples)) # Validation Feature Creation eval_dataset = eval_examples.map( prepare_validation_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) if data_args.max_eval_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) eval_dataset = eval_dataset.select(range(max_eval_samples)) processed_datasets["validation"] = eval_dataset if training_args.do_predict: if "test" not in datasets: raise ValueError("--do_predict requires a test dataset") predict_examples = datasets["test"] if data_args.max_predict_samples is not None: # We will select sample from whole data predict_examples = predict_examples.select(range(data_args.max_predict_samples)) # Predict Feature Creation predict_dataset = predict_examples.map( prepare_validation_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) if data_args.max_predict_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) predict_dataset = predict_dataset.select(range(max_predict_samples)) processed_datasets["test"] = predict_dataset # endregion # region Metrics and Post-processing: def post_processing_function(examples, features, predictions, stage="eval"): # Post-processing: we match the start logits and end logits to answers in the original context. predictions = postprocess_qa_predictions( examples=examples, features=features, predictions=predictions, version_2_with_negative=data_args.version_2_with_negative, n_best_size=data_args.n_best_size, max_answer_length=data_args.max_answer_length, null_score_diff_threshold=data_args.null_score_diff_threshold, output_dir=training_args.output_dir, prefix=stage, ) # Format the result to the format the metric expects. 
if data_args.version_2_with_negative: formatted_predictions = [ {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() ] else: formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples] return EvalPrediction(predictions=formatted_predictions, label_ids=references) metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad") def compute_metrics(p: EvalPrediction): return metric.compute(predictions=p.predictions, references=p.label_ids) # endregion with training_args.strategy.scope(): # region Load model if checkpoint is None: model_path = model_args.model_name_or_path else: model_path = checkpoint model = TFAutoModelForQuestionAnswering.from_pretrained( model_path, config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) optimizer = tf.keras.optimizers.Adam( learning_rate=training_args.learning_rate, beta_1=training_args.adam_beta1, beta_2=training_args.adam_beta2, epsilon=training_args.adam_epsilon, clipnorm=training_args.max_grad_norm, ) # no user-specified loss = will use the model internal loss model.compile(optimizer=optimizer) # endregion # region Training if padding: data_collator = DefaultDataCollator(return_tensors="tf") else: data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf") tensor_keys = ["attention_mask", "input_ids"] label_keys = ["start_positions", "end_positions"] if training_args.do_train: # Make a tf.data.Dataset for this training_dataset = processed_datasets["train"].to_tf_dataset( # labels are passed as input, as we will use the model's internal loss columns=tensor_keys + label_keys, shuffle=True, batch_size=training_args.per_device_train_batch_size, collate_fn=data_collator, drop_remainder=True, ) model.fit(training_dataset, epochs=int(training_args.num_train_epochs)) # endregion # region Evaluation if training_args.do_eval: logger.info("*** Evaluation ***") eval_inputs = { "input_ids": tf.ragged.constant(processed_datasets["validation"]["input_ids"]).to_tensor(), "attention_mask": tf.ragged.constant(processed_datasets["validation"]["attention_mask"]).to_tensor(), } eval_predictions = model.predict(eval_inputs) post_processed_eval = post_processing_function( datasets["validation"], processed_datasets["validation"], (eval_predictions.start_logits, eval_predictions.end_logits), ) metrics = compute_metrics(post_processed_eval) logging.info("Evaluation metrics:") for metric, value in metrics.items(): logging.info(f"{metric}: {value:.3f}") # endregion # region Prediction if training_args.do_predict: logger.info("*** Predict ***") predict_inputs = { "input_ids": tf.ragged.constant(processed_datasets["test"]["input_ids"]).to_tensor(), "attention_mask": tf.ragged.constant(processed_datasets["test"]["attention_mask"]).to_tensor(), } test_predictions = model.predict(predict_inputs) post_processed_test = post_processing_function( datasets["test"], processed_datasets["test"], (test_predictions.start_logits, test_predictions.end_logits), ) metrics = compute_metrics(post_processed_test) logging.info("Test metrics:") for metric, value in metrics.items(): logging.info(f"{metric}: {value:.3f}") # endregion if training_args.push_to_hub: model.push_to_hub()
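The heavy lifting in post_processing_function is done by postprocess_qa_predictions, which maps start/end logits back to context substrings through the offset mappings kept in prepare_validation_features. A much-simplified sketch of that idea for a single feature is below; best_answer is a hypothetical helper and assumes start_logits/end_logits are 1-D numpy arrays and that offset_mapping entries are None outside the context:

import numpy as np

def best_answer(context, start_logits, end_logits, offset_mapping, n_best_size=20, max_answer_length=30):
    # Simplified span selection: score the n-best start and end positions jointly.
    start_candidates = np.argsort(start_logits)[-n_best_size:]
    end_candidates = np.argsort(end_logits)[-n_best_size:]
    best_text, best_score = "", -np.inf
    for s in start_candidates:
        for e in end_candidates:
            # Skip spans outside the context, reversed spans, and spans that are too long.
            if (
                offset_mapping[s] is None
                or offset_mapping[e] is None
                or e < s
                or e - s + 1 > max_answer_length
            ):
                continue
            score = start_logits[s] + end_logits[e]
            if score > best_score:
                best_text = context[offset_mapping[s][0] : offset_mapping[e][1]]
                best_score = score
    return best_text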
def main(): # region Argument parsing # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TFTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_swag", model_args, data_args, framework="tensorflow") output_dir = Path(training_args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) # endregion # region Logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) transformers.utils.logging.set_verbosity(log_level) transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() # endregion # region Checkpoints checkpoint = None if len(os.listdir(training_args.output_dir) ) > 0 and not training_args.overwrite_output_dir: if (output_dir / CONFIG_NAME).is_file() and ( output_dir / TF2_WEIGHTS_NAME).is_file(): checkpoint = output_dir logger.info( f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this" " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) else: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to continue regardless.") # endregion # Set seed before initializing model. set_seed(training_args.seed) # region Load datasets # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.train_file is not None or data_args.validation_file is not None: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.train_file.split(".")[-1] raw_datasets = load_dataset( extension, data_files=data_files, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) else: # Downloading and loading the swag dataset from the hub. 
raw_datasets = load_dataset( "swag", "regular", cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # When using your own dataset or a different dataset from swag, you will probably need to change this. ending_names = [f"ending{i}" for i in range(4)] context_name = "sent1" question_header_name = "sent2" # endregion # region Load model config and tokenizer if checkpoint is not None: config_path = training_args.output_dir elif model_args.config_name: config_path = model_args.config_name else: config_path = model_args.model_name_or_path # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( config_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # endregion # region Dataset preprocessing if data_args.max_seq_length is None: max_seq_length = tokenizer.model_max_length if max_seq_length > 1024: logger.warning( f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx." ) max_seq_length = 1024 else: if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 
) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) def preprocess_function(examples): first_sentences = [[context] * 4 for context in examples[context_name]] question_headers = examples[question_header_name] second_sentences = [[ f"{header} {examples[end][i]}" for end in ending_names ] for i, header in enumerate(question_headers)] # Flatten out first_sentences = list(chain(*first_sentences)) second_sentences = list(chain(*second_sentences)) # Tokenize tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True, max_length=max_seq_length) # Un-flatten data = { k: [v[i:i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items() } return data if training_args.do_train: if "train" not in raw_datasets: raise ValueError("--do_train requires a train dataset") train_dataset = raw_datasets["train"] if data_args.max_train_samples is not None: max_train_samples = min(len(train_dataset), data_args.max_train_samples) train_dataset = train_dataset.select(range(max_train_samples)) with training_args.main_process_first( desc="train dataset map pre-processing"): train_dataset = train_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, ) if training_args.do_eval: if "validation" not in raw_datasets: raise ValueError("--do_eval requires a validation dataset") eval_dataset = raw_datasets["validation"] if data_args.max_eval_samples is not None: max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) eval_dataset = eval_dataset.select(range(max_eval_samples)) with training_args.main_process_first( desc="validation dataset map pre-processing"): eval_dataset = eval_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, ) if data_args.pad_to_max_length: data_collator = DefaultDataCollator(return_tensors="tf") else: # custom class defined above, as HF has no data collator for multiple choice data_collator = DataCollatorForMultipleChoice(tokenizer) # endregion with training_args.strategy.scope(): # region Build model if checkpoint is None: model_path = model_args.model_name_or_path else: model_path = checkpoint model = TFAutoModelForMultipleChoice.from_pretrained( model_path, config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) num_replicas = training_args.strategy.num_replicas_in_sync total_train_batch_size = training_args.per_device_train_batch_size * num_replicas total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas if training_args.do_train: num_train_steps = (len(train_dataset) // total_train_batch_size ) * int(training_args.num_train_epochs) if training_args.warmup_steps > 0: num_warmup_steps = training_args.warmup_steps elif training_args.warmup_ratio > 0: num_warmup_steps = int(num_train_steps * training_args.warmup_ratio) else: num_warmup_steps = 0 optimizer, lr_schedule = create_optimizer( init_lr=training_args.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, adam_beta1=training_args.adam_beta1, adam_beta2=training_args.adam_beta2, adam_epsilon=training_args.adam_epsilon, weight_decay_rate=training_args.weight_decay, adam_global_clipnorm=training_args.max_grad_norm, ) else: optimizer = None model.compile(optimizer=optimizer, metrics=["accuracy"], jit_compile=training_args.xla) # endregion # region 
Preparing push_to_hub and model card push_to_hub_model_id = training_args.push_to_hub_model_id model_name = model_args.model_name_or_path.split("/")[-1] if not push_to_hub_model_id: push_to_hub_model_id = f"{model_name}-finetuned-multiplechoice" model_card_kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": "multiple-choice" } if training_args.push_to_hub: callbacks = [ PushToHubCallback( output_dir=training_args.output_dir, model_id=push_to_hub_model_id, organization=training_args.push_to_hub_organization, token=training_args.push_to_hub_token, tokenizer=tokenizer, **model_card_kwargs, ) ] else: callbacks = [] # endregion # region Training eval_metrics = None if training_args.do_train: dataset_options = tf.data.Options() dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names # yourself if you use this method, whereas they are automatically inferred from the model input names when # using model.prepare_tf_dataset() # For more info see the docs: # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset tf_train_dataset = model.prepare_tf_dataset( train_dataset, shuffle=True, batch_size=total_train_batch_size, collate_fn=data_collator, ).with_options(dataset_options) if training_args.do_eval: validation_data = model.prepare_tf_dataset( eval_dataset, shuffle=False, batch_size=total_eval_batch_size, collate_fn=data_collator, drop_remainder=True, ).with_options(dataset_options) else: validation_data = None history = model.fit( tf_train_dataset, validation_data=validation_data, epochs=int(training_args.num_train_epochs), callbacks=callbacks, ) eval_metrics = { key: val[-1] for key, val in history.history.items() } # endregion # region Evaluation if training_args.do_eval and not training_args.do_train: dataset_options = tf.data.Options() dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF # Do a standalone evaluation pass tf_eval_dataset = model.prepare_tf_dataset( eval_dataset, shuffle=False, batch_size=total_eval_batch_size, collate_fn=data_collator, drop_remainder=True, ).with_options(dataset_options) eval_results = model.evaluate(tf_eval_dataset) eval_metrics = { "val_loss": eval_results[0], "val_accuracy": eval_results[1] } # endregion if eval_metrics is not None and training_args.output_dir is not None: output_eval_file = os.path.join(training_args.output_dir, "all_results.json") with open(output_eval_file, "w") as writer: writer.write(json.dumps(eval_metrics)) # region Push to hub if training_args.output_dir is not None and not training_args.push_to_hub: # If we're not pushing to hub, at least save a local copy when we're done model.save_pretrained(training_args.output_dir)
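The DataCollatorForMultipleChoice used above is defined earlier in that script; as a rough sketch of what such a collator typically does, the four tokenized (context, ending) pairs of each example are flattened, padded together, and reshaped back to (batch_size, 4, seq_len) tensors. This sketch assumes each feature holds only tokenizer outputs plus a label; the real class may differ in details:

from dataclasses import dataclass

import tensorflow as tf
from transformers import PreTrainedTokenizerBase

@dataclass
class DataCollatorForMultipleChoice:
    # Hedged sketch of a multiple-choice collator, not the exact class used above.
    tokenizer: PreTrainedTokenizerBase

    def __call__(self, features):
        label_name = "label" if "label" in features[0] else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        # Flatten (example, choice) pairs so the tokenizer can pad them as one batch.
        flattened = [
            {k: v[i] for k, v in feature.items()} for feature in features for i in range(num_choices)
        ]
        batch = self.tokenizer.pad(flattened, padding=True, return_tensors="tf")
        # Un-flatten back to (batch_size, num_choices, seq_len).
        batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()}
        batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64)
        return batch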
def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader( train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=DefaultDataCollator().collate_batch, ) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay, }, {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total ) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed(args) # Added here for reproductibility (even between python 2 and 3) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): model.train() inputs = {k: t.to(args.device) for k, t in batch.items() if k != "pairID"} outputs = model(**inputs) loss = outputs[0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: logs = {} if ( args.local_rank == -1 and args.evaluate_during_training ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): eval_key = "eval_{}".format(key) logs[eval_key] = value loss_scalar = (tr_loss - logging_loss) / args.logging_steps learning_rate_scalar = scheduler.get_lr()[0] logs["learning_rate"] = learning_rate_scalar logs["loss"] = loss_scalar logging_loss = tr_loss for key, value in logs.items(): tb_writer.add_scalar(key, value, global_step) # print(json.dumps({**logs, **{'step': global_step}})) if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
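Stripped of fp16, logging, and checkpointing, the core update pattern in the loop above is plain gradient accumulation with clipping and a scheduler step; a minimal sketch (not the exact code above):

import torch

def accumulation_loop(model, optimizer, scheduler, dataloader, accumulation_steps, max_grad_norm):
    # Loss is scaled down so the accumulated gradients match one large-batch update.
    model.zero_grad()
    for step, batch in enumerate(dataloader):
        loss = model(**batch)[0] / accumulation_steps
        loss.backward()
        if (step + 1) % accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()
            model.zero_grad()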
def evaluate(args, model, tokenizer, label_list, prefix=""): # Loop to handle MNLI double evaluation (matched, mis-matched) eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,) eval_outputs_dirs = (args.output_dir, args.output_dir + "-MM") if args.task_name == "mnli" else (args.output_dir,) results = {} for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs): eval_dataset = HansDataset( args.data_dir, tokenizer, args.task_name, args.max_seq_length, overwrite_cache=args.overwrite_cache, evaluate=True, ) if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]: os.makedirs(eval_output_dir) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(eval_dataset) eval_dataloader = DataLoader( eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=DefaultDataCollator().collate_batch, ) # multi-gpu eval if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel): model = torch.nn.DataParallel(model) # Eval! logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(eval_dataset)) logger.info(" Batch size = %d", args.eval_batch_size) eval_loss = 0.0 nb_eval_steps = 0 preds = None out_label_ids = None for batch in tqdm(eval_dataloader, desc="Evaluating"): model.eval() inputs = {k: t.to(args.device) for k, t in batch.items() if k != "pairID"} batch_pair_ids = batch.pop("pairID", None) with torch.no_grad(): outputs = model(**inputs) tmp_eval_loss, logits = outputs[:2] eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if preds is None: preds = logits.detach().cpu().numpy() out_label_ids = inputs["labels"].detach().cpu().numpy() pair_ids = batch_pair_ids.detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0) pair_ids = np.append(pair_ids, batch_pair_ids.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps if args.output_mode == "classification": preds = np.argmax(preds, axis=1) elif args.output_mode == "regression": preds = np.squeeze(preds) output_eval_file = os.path.join(eval_output_dir, "hans_predictions.txt") with open(output_eval_file, "w") as writer: writer.write("pairID,gold_label\n") for pid, pred in zip(pair_ids, preds): writer.write("ex" + str(pid) + "," + label_list[int(pred)] + "\n") return results
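The predictions file written above starts with a pairID,gold_label header followed by lines such as ex0,entailment. HANS itself is scored as a binary task, so the MNLI labels neutral and contradiction are usually collapsed into non-entailment before running the official HANS evaluation; a small, hypothetical post-processing helper for that step might look like:

def collapse_to_binary(pred_file_in, pred_file_out):
    # Hypothetical helper: map 3-way MNLI predictions to the 2-way HANS label set.
    with open(pred_file_in) as f_in, open(pred_file_out, "w") as f_out:
        f_out.write(f_in.readline())  # copy the "pairID,gold_label" header
        for line in f_in:
            pair_id, label = line.strip().split(",")
            if label in ("neutral", "contradiction"):
                label = "non-entailment"
            f_out.write(f"{pair_id},{label}\n")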
is_seq_class=config_dict["is_seq_class"], lowercase=config_dict["lowercase"], ) train_dataset = TSVClassificationDataset(mode=Split.train, **data_config) if config_dict["dataset_split"] == "train": dataset = train_dataset elif config_dict["dataset_split"] == "dev": dataset = TSVClassificationDataset(mode=Split.dev, **data_config) elif config_dict["dataset_split"] == "test": dataset = TSVClassificationDataset(mode=Split.test, **data_config) print(len(dataset.examples)) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = model.to(device) data_collator = DefaultDataCollator() if method == "lime": res = classify_lime( model=model, dataset=dataset, train_dataset=train_dataset, config_dict=config_dict, ) elif method == "soft_attention": res = classify_soft_attention(model, dataset, config_dict, data_collator) elif method == "model_attention": if len(sys.argv) != 5: print( "For model classification, required args are [config_path] [gpu_ids] [layer_id] [head_id]"