def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.warning(
        "device: %s, n_replicas: %s, 16-bits training: %s",
        training_args.device,
        training_args.n_replicas,
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    try:
        processor = processors[data_args.task_name]()
        label_list = processor.get_labels()
        num_labels = len(label_list)
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    with training_args.strategy.scope():
        model = TFAutoModelForMultipleChoice.from_pretrained(
            model_args.model_name_or_path,
            from_pt=bool(".bin" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )

    # Get datasets
    train_dataset = (
        TFMultipleChoiceDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            task=data_args.task_name,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.train,
        )
        if training_args.do_train
        else None
    )
    eval_dataset = (
        TFMultipleChoiceDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            task=data_args.task_name,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.dev,
        )
        if training_args.do_eval
        else None
    )

    def compute_metrics(p: EvalPrediction) -> Dict:
        preds = np.argmax(p.predictions, axis=1)
        return {"acc": simple_accuracy(preds, p.label_ids)}

    # Initialize our Trainer
    trainer = TFTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset.get_dataset() if train_dataset else None,
        eval_dataset=eval_dataset.get_dataset() if eval_dataset else None,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train()
        trainer.save_model()
        tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        result = trainer.evaluate()
        trainer.log_metrics("eval", result)
        trainer.save_metrics("eval", result)
        results.update(result)

    return results
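
# The `compute_metrics` function above calls `simple_accuracy`, which is imported from the example's
# utilities and not reproduced in this excerpt. The sketch below is an illustration only (a hypothetical
# helper, not the script's actual implementation) of what plain accuracy over argmax predictions looks like;
# it assumes `numpy` is available as `np`, as it is elsewhere in this file.
def _simple_accuracy_sketch(preds: np.ndarray, labels: np.ndarray) -> float:
    # Fraction of predictions that exactly match the gold labels.
    return float((preds == labels).mean())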
def main():
    # region Argument parsing
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    output_dir = Path(training_args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # endregion

    # region Logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()
    # endregion

    # region Checkpoints
    checkpoint = None
    if len(os.listdir(training_args.output_dir)) > 0 and not training_args.overwrite_output_dir:
        if (output_dir / CONFIG_NAME).is_file() and (output_dir / TF2_WEIGHTS_NAME).is_file():
            checkpoint = output_dir
            logger.info(
                f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this"
                " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )
        else:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to continue regardless."
            )
    # endregion

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # region Load datasets
    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.train_file is not None or data_args.validation_file is not None:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
    else:
        # Downloading and loading the swag dataset from the hub.
        raw_datasets = load_dataset("swag", "regular", cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # When using your own dataset or a different dataset from swag, you will probably need to change this.
    ending_names = [f"ending{i}" for i in range(4)]
    context_name = "sent1"
    question_header_name = "sent2"
    # endregion

    # region Load model config and tokenizer
    if checkpoint is not None:
        config_path = training_args.output_dir
    elif model_args.config_name:
        config_path = model_args.config_name
    else:
        config_path = model_args.model_name_or_path

    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        config_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    # endregion

    # region Dataset preprocessing
    if data_args.max_seq_length is None:
        max_seq_length = tokenizer.model_max_length
        if max_seq_length > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
            )
            max_seq_length = 1024
    else:
        if data_args.max_seq_length > tokenizer.model_max_length:
            logger.warning(
                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    def preprocess_function(examples):
        first_sentences = [[context] * 4 for context in examples[context_name]]
        question_headers = examples[question_header_name]
        second_sentences = [
            [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers)
        ]

        # Flatten out
        first_sentences = list(chain(*first_sentences))
        second_sentences = list(chain(*second_sentences))

        # Tokenize
        tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True, max_length=max_seq_length)

        # Un-flatten
        data = {k: [v[i:i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}
        return data

    if training_args.do_train:
        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = raw_datasets["train"]
        non_label_columns = [feature for feature in train_dataset.features if feature not in ("label", "labels")]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
        with training_args.main_process_first(desc="train dataset map pre-processing"):
            train_dataset = train_dataset.map(
                preprocess_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                load_from_cache_file=not data_args.overwrite_cache,
            )

    if training_args.do_eval:
        if "validation" not in raw_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = raw_datasets["validation"]
        if not training_args.do_train:
            non_label_columns = [feature for feature in eval_dataset.features if feature not in ("label", "labels")]
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
        with training_args.main_process_first(desc="validation dataset map pre-processing"):
            eval_dataset = eval_dataset.map(
                preprocess_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                load_from_cache_file=not data_args.overwrite_cache,
            )

    if data_args.pad_to_max_length:
        data_collator = DefaultDataCollator(return_tensors="tf")
    else:
        # custom class defined above, as HF has no data collator for multiple choice
        data_collator = DataCollatorForMultipleChoice(tokenizer)
    # endregion

    with training_args.strategy.scope():
        # region Build model
        if checkpoint is None:
            model_path = model_args.model_name_or_path
        else:
            model_path = checkpoint
        model = TFAutoModelForMultipleChoice.from_pretrained(
            model_path,
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )

        num_replicas = training_args.strategy.num_replicas_in_sync
        total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
        total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
        if training_args.do_train:
            total_train_steps = (len(train_dataset) // total_train_batch_size) * int(training_args.num_train_epochs)
            optimizer, lr_schedule = create_optimizer(
                init_lr=training_args.learning_rate, num_train_steps=int(total_train_steps), num_warmup_steps=0
            )
        else:
            optimizer = "adam"  # Just put anything in here, since we're not using it anyway
        model.compile(
            optimizer=optimizer,
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")],
        )
        # endregion

        # region Training
        if training_args.do_train:
            dataset_exclude_cols = set(non_label_columns + ["label"])
            tf_train_dataset = train_dataset.to_tf_dataset(
                columns=[col for col in train_dataset.column_names if col not in dataset_exclude_cols],
                shuffle=True,
                batch_size=total_train_batch_size,
                collate_fn=data_collator,
                drop_remainder=True,
                # `label_cols` is needed for user-defined losses, such as in this example
                label_cols="label" if "label" in train_dataset.column_names else None,
            )

            if training_args.do_eval:
                validation_data = eval_dataset.to_tf_dataset(
                    columns=[col for col in eval_dataset.column_names if col not in dataset_exclude_cols],
                    shuffle=False,
                    batch_size=total_eval_batch_size,
                    collate_fn=data_collator,
                    drop_remainder=True,
                    # `label_cols` is needed for user-defined losses, such as in this example
                    label_cols="label" if "label" in eval_dataset.column_names else None,
                )
            else:
                validation_data = None
            model.fit(
                tf_train_dataset,
                validation_data=validation_data,
                epochs=int(training_args.num_train_epochs),
                callbacks=[SavePretrainedCallback(output_dir=training_args.output_dir)],
            )
        # endregion

        # region Evaluation
        if training_args.do_eval and not training_args.do_train:
            dataset_exclude_cols = set(non_label_columns + ["label"])
            # Do a standalone evaluation pass
            tf_eval_dataset = eval_dataset.to_tf_dataset(
                columns=[col for col in eval_dataset.column_names if col not in dataset_exclude_cols],
                shuffle=False,
                batch_size=total_eval_batch_size,
                collate_fn=data_collator,
                drop_remainder=True,
                # `label_cols` is needed for user-defined losses, such as in this example
                label_cols="label" if "label" in eval_dataset.column_names else None,
            )
            model.evaluate(tf_eval_dataset)
        # endregion

        # region Push to hub
        if training_args.push_to_hub:
            model.push_to_hub(
                finetuned_from=model_args.model_name_or_path,
                tasks="multiple-choice",
                dataset_tags="swag",
                dataset_args="regular",
                dataset="SWAG",
                language="en",
            )
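
# The `SavePretrainedCallback` passed to `model.fit()` above is defined elsewhere in this example and not
# shown in this excerpt. As a hedged sketch only (a hypothetical stand-in, not necessarily the exact class
# used here), such a callback just calls `save_pretrained()` at the end of each epoch so intermediate
# checkpoints can be reloaded later with `from_pretrained()`.
class _SavePretrainedCallbackSketch(tf.keras.callbacks.Callback):
    def __init__(self, output_dir, **kwargs):
        super().__init__()
        self.output_dir = output_dir

    def on_epoch_end(self, epoch, logs=None):
        # Keras sets `self.model` to the model being trained; save it in Hugging Face format.
        self.model.save_pretrained(self.output_dir)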
def main():
    # region Argument parsing
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_swag", model_args, data_args, framework="tensorflow")

    output_dir = Path(training_args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # endregion

    # region Logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()
    # endregion

    # region Checkpoints
    checkpoint = None
    if len(os.listdir(training_args.output_dir)) > 0 and not training_args.overwrite_output_dir:
        if (output_dir / CONFIG_NAME).is_file() and (output_dir / TF2_WEIGHTS_NAME).is_file():
            checkpoint = output_dir
            logger.info(
                f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this"
                " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )
        else:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to continue regardless."
            )
    # endregion

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # region Load datasets
    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.train_file is not None or data_args.validation_file is not None:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        raw_datasets = load_dataset(
            extension,
            data_files=data_files,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        # Downloading and loading the swag dataset from the hub.
        raw_datasets = load_dataset(
            "swag",
            "regular",
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # When using your own dataset or a different dataset from swag, you will probably need to change this.
    ending_names = [f"ending{i}" for i in range(4)]
    context_name = "sent1"
    question_header_name = "sent2"
    # endregion

    # region Load model config and tokenizer
    if checkpoint is not None:
        config_path = training_args.output_dir
    elif model_args.config_name:
        config_path = model_args.config_name
    else:
        config_path = model_args.model_name_or_path

    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        config_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    # endregion

    # region Dataset preprocessing
    if data_args.max_seq_length is None:
        max_seq_length = tokenizer.model_max_length
        if max_seq_length > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
            )
            max_seq_length = 1024
    else:
        if data_args.max_seq_length > tokenizer.model_max_length:
            logger.warning(
                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    def preprocess_function(examples):
        first_sentences = [[context] * 4 for context in examples[context_name]]
        question_headers = examples[question_header_name]
        second_sentences = [
            [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers)
        ]

        # Flatten out
        first_sentences = list(chain(*first_sentences))
        second_sentences = list(chain(*second_sentences))

        # Tokenize
        tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True, max_length=max_seq_length)

        # Un-flatten
        data = {k: [v[i:i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}
        return data

    if training_args.do_train:
        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = raw_datasets["train"]
        if data_args.max_train_samples is not None:
            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
            train_dataset = train_dataset.select(range(max_train_samples))
        with training_args.main_process_first(desc="train dataset map pre-processing"):
            train_dataset = train_dataset.map(
                preprocess_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                load_from_cache_file=not data_args.overwrite_cache,
            )

    if training_args.do_eval:
        if "validation" not in raw_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = raw_datasets["validation"]
        if data_args.max_eval_samples is not None:
            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
            eval_dataset = eval_dataset.select(range(max_eval_samples))
        with training_args.main_process_first(desc="validation dataset map pre-processing"):
            eval_dataset = eval_dataset.map(
                preprocess_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                load_from_cache_file=not data_args.overwrite_cache,
            )

    if data_args.pad_to_max_length:
        data_collator = DefaultDataCollator(return_tensors="tf")
    else:
        # custom class defined above, as HF has no data collator for multiple choice
        data_collator = DataCollatorForMultipleChoice(tokenizer)
    # endregion

    with training_args.strategy.scope():
        # region Build model
        if checkpoint is None:
            model_path = model_args.model_name_or_path
        else:
            model_path = checkpoint
        model = TFAutoModelForMultipleChoice.from_pretrained(
            model_path,
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )

        num_replicas = training_args.strategy.num_replicas_in_sync
        total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
        total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
        if training_args.do_train:
            num_train_steps = (len(train_dataset) // total_train_batch_size) * int(training_args.num_train_epochs)
            if training_args.warmup_steps > 0:
                num_warmup_steps = training_args.warmup_steps
            elif training_args.warmup_ratio > 0:
                num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
            else:
                num_warmup_steps = 0
            optimizer, lr_schedule = create_optimizer(
                init_lr=training_args.learning_rate,
                num_train_steps=num_train_steps,
                num_warmup_steps=num_warmup_steps,
                adam_beta1=training_args.adam_beta1,
                adam_beta2=training_args.adam_beta2,
                adam_epsilon=training_args.adam_epsilon,
                weight_decay_rate=training_args.weight_decay,
                adam_global_clipnorm=training_args.max_grad_norm,
            )
        else:
            optimizer = None
        model.compile(optimizer=optimizer, metrics=["accuracy"], jit_compile=training_args.xla)
        # endregion

        # region Preparing push_to_hub and model card
        push_to_hub_model_id = training_args.push_to_hub_model_id
        model_name = model_args.model_name_or_path.split("/")[-1]
        if not push_to_hub_model_id:
            push_to_hub_model_id = f"{model_name}-finetuned-multiplechoice"

        model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "multiple-choice"}

        if training_args.push_to_hub:
            callbacks = [
                PushToHubCallback(
                    output_dir=training_args.output_dir,
                    model_id=push_to_hub_model_id,
                    organization=training_args.push_to_hub_organization,
                    token=training_args.push_to_hub_token,
                    tokenizer=tokenizer,
                    **model_card_kwargs,
                )
            ]
        else:
            callbacks = []
        # endregion

        # region Training
        eval_metrics = None
        if training_args.do_train:
            dataset_options = tf.data.Options()
            dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF

            # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
            # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
            # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
            # yourself if you use this method, whereas they are automatically inferred from the model input names when
            # using model.prepare_tf_dataset()
            # For more info see the docs:
            # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
            # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset

            tf_train_dataset = model.prepare_tf_dataset(
                train_dataset,
                shuffle=True,
                batch_size=total_train_batch_size,
                collate_fn=data_collator,
            ).with_options(dataset_options)

            if training_args.do_eval:
                validation_data = model.prepare_tf_dataset(
                    eval_dataset,
                    shuffle=False,
                    batch_size=total_eval_batch_size,
                    collate_fn=data_collator,
                    drop_remainder=True,
                ).with_options(dataset_options)
            else:
                validation_data = None
            history = model.fit(
                tf_train_dataset,
                validation_data=validation_data,
                epochs=int(training_args.num_train_epochs),
                callbacks=callbacks,
            )
            eval_metrics = {key: val[-1] for key, val in history.history.items()}
        # endregion

        # region Evaluation
        if training_args.do_eval and not training_args.do_train:
            dataset_options = tf.data.Options()
            dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF

            # Do a standalone evaluation pass
            tf_eval_dataset = model.prepare_tf_dataset(
                eval_dataset,
                shuffle=False,
                batch_size=total_eval_batch_size,
                collate_fn=data_collator,
                drop_remainder=True,
            ).with_options(dataset_options)
            eval_results = model.evaluate(tf_eval_dataset)
            eval_metrics = {"val_loss": eval_results[0], "val_accuracy": eval_results[1]}
        # endregion

        if eval_metrics is not None and training_args.output_dir is not None:
            output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
            with open(output_eval_file, "w") as writer:
                writer.write(json.dumps(eval_metrics))

        # region Push to hub
        if training_args.output_dir is not None and not training_args.push_to_hub:
            # If we're not pushing to hub, at least save a local copy when we're done
            model.save_pretrained(training_args.output_dir)
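
# Both Keras-based variants above rely on a `DataCollatorForMultipleChoice` that their comments describe as
# "defined above" but which is not reproduced in this excerpt. The hedged sketch below (a hypothetical
# stand-in, not necessarily the class the example actually defines) shows one way such a collator can work:
# flatten the per-example choice dimension, pad with the tokenizer, then reshape back to
# (batch_size, num_choices, seq_len) and re-attach the labels. It assumes `tf` (TensorFlow) is imported at
# the top of this file, as it is used elsewhere here.
from dataclasses import dataclass
from typing import Optional, Union

from transformers import PreTrainedTokenizerBase


@dataclass
class _DataCollatorForMultipleChoiceSketch:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        # Flatten: one entry per (example, choice) pair so the tokenizer can pad them together.
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="tf",
        )

        # Un-flatten back to (batch_size, num_choices, seq_len) and re-attach the labels.
        batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()}
        batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64)
        return batch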