def test_model_for_encoder_decoder_lm(self):
    for model_name in TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
        config = AutoConfig.from_pretrained(model_name)
        self.assertIsNotNone(config)
        self.assertIsInstance(config, T5Config)

        model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)
        model, loading_info = TFAutoModelForSeq2SeqLM.from_pretrained(model_name, output_loading_info=True)
        self.assertIsNotNone(model)
        self.assertIsInstance(model, TFT5ForConditionalGeneration)
def model(self):
    warnings.simplefilter("error")
    model: TFMarianMTModel = TFAutoModelForSeq2SeqLM.from_pretrained(self.model_name, from_pt=True)
    assert isinstance(model, TFMarianMTModel)
    c = model.config
    self.assertListEqual(c.bad_words_ids, [[c.pad_token_id]])
    self.assertEqual(c.max_length, 512)
    self.assertEqual(c.decoder_start_token_id, c.pad_token_id)
    return model
def test_rag_sequence_from_pretrained(self):
    load_weight_prefix = "tf_rag_model_1"
    rag_config = self.get_rag_config()
    rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
    rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
        "facebook/dpr-question_encoder-single-nq-base"
    )
    rag_retriever = RagRetriever(
        rag_config,
        question_encoder_tokenizer=rag_question_encoder_tokenizer,
        generator_tokenizer=rag_decoder_tokenizer,
    )

    input_ids = rag_question_encoder_tokenizer(
        "who sings does he love me with reba", return_tensors="tf"
    ).input_ids
    decoder_input_ids = rag_decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids

    with tempfile.TemporaryDirectory() as tmp_dirname:
        rag_sequence = TFRagSequenceForGeneration.from_pretrained_question_encoder_generator(
            "facebook/dpr-question_encoder-single-nq-base",
            "facebook/bart-large-cnn",
            retriever=rag_retriever,
            config=rag_config,
        )
        # check that the from pretrained methods work
        rag_sequence.save_pretrained(tmp_dirname)
        rag_sequence.from_pretrained(tmp_dirname, retriever=rag_retriever)

        output = rag_sequence(input_ids, labels=decoder_input_ids)
        loss_pretrained = output.loss
        del rag_sequence

        question_encoder = TFAutoModel.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
        generator = TFAutoModelForSeq2SeqLM.from_pretrained(
            "facebook/bart-large-cnn", load_weight_prefix=load_weight_prefix, name="generator"
        )
        rag_sequence = TFRagSequenceForGeneration(
            config=rag_config,
            question_encoder=question_encoder,
            generator=generator,
            retriever=rag_retriever,
        )

        output = rag_sequence(input_ids, labels=decoder_input_ids)
        loss_init = output.loss

    self.assertAlmostEqual(loss_pretrained, loss_init, places=4)
def test_small_integration_test(self):
    """
    For comparison run:
    >>> import t5  # pip install t5==0.7.1
    >>> from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary

    >>> path_to_mtf_small_mt5_checkpoint = '<fill_in>'
    >>> path_to_mtf_small_mt5_spm_model_path = '<fill_in>'
    >>> t5_model = t5.models.MtfModel(model_dir=path_to_mtf_small_mt5_checkpoint, batch_size=1, tpu=None)
    >>> vocab = SentencePieceVocabulary(path_to_mtf_small_mt5_spm_model_path, extra_ids=100)
    >>> score = t5_model.score(inputs=["Hello there"], targets=["Hi I am"], vocabulary=vocab)
    """
    model = TFAutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")
    tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")

    input_ids = tokenizer("Hello there", return_tensors="tf").input_ids
    labels = tokenizer("Hi I am", return_tensors="tf").input_ids

    loss = model(input_ids, labels=labels).loss
    mtf_score = -tf.math.reduce_sum(loss).numpy()

    EXPECTED_SCORE = -84.9127
    self.assertTrue(abs(mtf_score - EXPECTED_SCORE) < 2e-4)
def model(self):
    model = TFAutoModelForSeq2SeqLM.from_pretrained(self.model_name)
    return model
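# A minimal, self-contained sketch of the TFAutoModelForSeq2SeqLM API exercised by the
# fixtures above. The "t5-small" checkpoint is an illustrative assumption, not one of the
# checkpoints used in these tests.
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small")  # resolves to TFT5ForConditionalGeneration

inputs = tokenizer("translate English to German: Hello, how are you?", return_tensors="tf")
# generate() returns token ids; decode them back to text.
generated = model.generate(inputs.input_ids, max_length=32)
print(tokenizer.batch_decode(generated, skip_special_tokens=True))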
def main():
    # region Argument parsing
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    # endregion

    # region Logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO)
    datasets.utils.logging.set_verbosity(logging.INFO)
    transformers.utils.logging.set_verbosity(logging.INFO)

    # Log on each process the small summary:
    logger.info(f"Training/evaluation parameters {training_args}")
    # endregion

    # region T5 special-casing
    if data_args.source_prefix is None and model_args.model_name_or_path in [
        "t5-small",
        "t5-base",
        "t5-large",
        "t5-3b",
        "t5-11b",
    ]:
        logger.warning(
            "You're running a t5 model but didn't provide a source prefix, which is expected for this model, "
            "e.g. `--source_prefix 'summarize: '`"
        )
    # endregion

    # region Detecting last checkpoint
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )
    # endregion

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # region Load datasets
    # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files this script will use the first column for the full texts and the second column for the
    # summaries (unless you specify column names for this with the `text_column` and `summary_column` arguments).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(
            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
        )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
            extension = data_args.train_file.split(".")[-1]
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
            extension = data_args.validation_file.split(".")[-1]
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
            extension = data_args.test_file.split(".")[-1]
        raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
    # endregion

    # region Load model config and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    prefix = data_args.source_prefix if data_args.source_prefix is not None else ""
    # endregion

    # region Dataset preprocessing
    # We need to tokenize inputs and targets.
    if training_args.do_train:
        column_names = raw_datasets["train"].column_names
    elif training_args.do_eval:
        column_names = raw_datasets["validation"].column_names
    else:
        logger.info("There is nothing to do. Please pass `do_train`, and/or `do_eval`.")
        return

    # Get the column names for input/target.
    dataset_columns = summarization_name_mapping.get(data_args.dataset_name, None)
    if data_args.text_column is None:
        text_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
    else:
        text_column = data_args.text_column
        if text_column not in column_names:
            raise ValueError(
                f"`--text_column` value '{data_args.text_column}' needs to be one of: {', '.join(column_names)}"
            )
    if data_args.summary_column is None:
        summary_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
    else:
        summary_column = data_args.summary_column
        if summary_column not in column_names:
            raise ValueError(
                f"`--summary_column` value '{data_args.summary_column}' needs to be one of: {', '.join(column_names)}"
            )

    # Temporarily set max_target_length for training.
    max_target_length = data_args.max_target_length
    padding = "max_length" if data_args.pad_to_max_length else False

    def preprocess_function(examples):
        inputs = examples[text_column]
        targets = examples[summary_column]
        inputs = [prefix + inp for inp in inputs]
        model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True)

        # Setup the tokenizer for targets
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)

        # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
        # padding in the loss.
        if padding == "max_length" and data_args.ignore_pad_token_for_loss:
            labels["input_ids"] = [
                [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
            ]

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    if training_args.do_train:
        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = raw_datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
        with training_args.main_process_first(desc="train dataset map pre-processing"):
            train_dataset = train_dataset.map(
                preprocess_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on train dataset",
            )
    else:
        train_dataset = None

    if training_args.do_eval:
        max_target_length = data_args.val_max_target_length
        if "validation" not in raw_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = raw_datasets["validation"]
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
        with training_args.main_process_first(desc="validation dataset map pre-processing"):
            eval_dataset = eval_dataset.map(
                preprocess_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on validation dataset",
            )
    else:
        eval_dataset = None
    # endregion

    # region Text preprocessing
    def postprocess_text(preds, labels):
        preds = [pred.strip() for pred in preds]
        labels = [label.strip() for label in labels]

        # rougeLSum expects newline after each sentence
        preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
        labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

        return preds, labels
    # endregion

    with training_args.strategy.scope():
        # region Prepare model
        model = TFAutoModelForSeq2SeqLM.from_pretrained(
            model_args.model_name_or_path,
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
        model.resize_token_embeddings(len(tokenizer))
        # endregion

        # region Prepare TF Dataset objects
        if model.config.decoder_start_token_id is None:
            raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")

        num_replicas = training_args.strategy.num_replicas_in_sync
        total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
        total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas

        tf_train_dataset = dataset_to_tf(
            train_dataset,
            model,
            tokenizer,
            total_batch_size=total_train_batch_size,
            num_epochs=training_args.num_train_epochs,
            shuffle=True,
        )
        tf_eval_dataset = dataset_to_tf(
            eval_dataset,
            model,
            tokenizer,
            total_eval_batch_size,
            num_epochs=1,
            shuffle=False,
        )
        # endregion

        # region Optimizer, loss and LR scheduling
        # Scheduler and math around the number of training steps.
        num_update_steps_per_epoch = len(train_dataset) // total_train_batch_size
        num_train_steps = training_args.num_train_epochs * num_update_steps_per_epoch
        optimizer, lr_schedule = create_optimizer(
            init_lr=training_args.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=0
        )

        def masked_sparse_categorical_crossentropy(y_true, y_pred):
            # We clip the negative labels to 0 to avoid NaNs appearing in the output and
            # fouling up everything that comes afterwards.
            # The loss values corresponding to clipped values will be masked later anyway,
            # but even masked NaNs seem to cause overflows for some reason.
            # 1e6 is chosen as a reasonable upper bound for the number of token indices - in the unlikely
            # event that you have more than 1 million tokens in your vocabulary, consider increasing this value.
            # More pragmatically, consider redesigning your tokenizer.
            losses = tf.keras.losses.sparse_categorical_crossentropy(
                tf.clip_by_value(y_true, 0, int(1e6)), y_pred, from_logits=True
            )
            # Compute the per-sample loss only over the unmasked tokens
            losses = tf.ragged.boolean_mask(losses, y_true != -100)
            losses = tf.reduce_mean(losses, axis=-1)
            return losses
        # endregion

        # region Metric
        metric = load_metric("rouge")
        # endregion

        # region Training
        model.compile(loss={"logits": masked_sparse_categorical_crossentropy}, optimizer=optimizer)

        if training_args.do_train:
            logger.info("***** Running training *****")
            logger.info(f"  Num examples = {len(train_dataset)}")
            logger.info(f"  Num Epochs = {training_args.num_train_epochs}")
            logger.info(f"  Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
            logger.info(f"  Total train batch size = {total_train_batch_size}")
            logger.info(f"  Total optimization steps = {num_train_steps}")

            model.fit(
                tf_train_dataset,
                epochs=int(training_args.num_train_epochs),
                steps_per_epoch=num_update_steps_per_epoch,
            )
        # endregion

        # region Validation
        if data_args.val_max_target_length is None:
            data_args.val_max_target_length = data_args.max_target_length

        gen_kwargs = {
            "max_length": data_args.val_max_target_length if data_args is not None else config.max_length,
            "num_beams": data_args.num_beams,
        }
        if training_args.do_eval:
            logger.info("Evaluation...")
            for batch, labels in tqdm(
                tf_eval_dataset, total=len(eval_dataset) // training_args.per_device_eval_batch_size
            ):
                batch.update(gen_kwargs)
                generated_tokens = model.generate(**batch)
                if isinstance(generated_tokens, tuple):
                    generated_tokens = generated_tokens[0]
                decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
                labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
                decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
                decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

                metric.add_batch(predictions=decoded_preds, references=decoded_labels)

            result = metric.compute(use_stemmer=True)
            # Extract a few results from ROUGE
            result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
            result = {k: round(v, 4) for k, v in result.items()}

            logger.info(result)
        # endregion

    if training_args.output_dir is not None:
        model.save_pretrained(training_args.output_dir)
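# A toy illustration (assumed values, not part of the script above) of how the -100 label
# convention interacts with tf.ragged.boolean_mask in masked_sparse_categorical_crossentropy:
# masked positions are dropped before the mean, so padding never dilutes the per-sample loss.
import tensorflow as tf

y_true = tf.constant([[5, 7, -100, -100]])
per_token_loss = tf.constant([[0.4, 0.2, 9.9, 9.9]])  # pretend per-token losses
kept = tf.ragged.boolean_mask(per_token_loss, y_true != -100)  # drops the two masked positions
print(tf.reduce_mean(kept, axis=-1).numpy())  # [0.3]: only the two real tokens count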
def copy_model_files(self, force=False):
    modified = False
    src_path = self.checkpoint_path
    d = None

    try:
        if force or not (self.git_path / "tf_model.h5").exists() or not (self.git_path / "pytorch_model.bin").exists():
            d = TemporaryDirectory()
            if self.task in self.QA_TASKS:
                model = QASparseXP.compile_model(src_path, dest_path=d.name)
            elif self.task in self.GLUE_TASKS:
                model = GlueSparseXP.compile_model(src_path, dest_path=d.name)
            elif self.task in self.SUMMARIZATION_TASKS:
                model = SummarizationSparseXP.compile_model(src_path, dest_path=d.name)
            else:
                raise Exception(f"Unknown task {self.task}")

            model = optimize_model(model, "heads")
            model.save_pretrained(d.name)
            src_path = d.name

        if force or not (self.git_path / "tf_model.h5").exists():
            with TemporaryDirectory() as d2:
                if self.task in self.QA_TASKS:
                    QASparseXP.final_fine_tune_bertarize(src_path, d2, remove_head_pruning=True)
                    tf_model = TFAutoModelForQuestionAnswering.from_pretrained(d2, from_pt=True)
                elif self.task in self.GLUE_TASKS:
                    GlueSparseXP.final_fine_tune_bertarize(src_path, d2, remove_head_pruning=True)
                    tf_model = TFAutoModelForSequenceClassification.from_pretrained(d2, from_pt=True)
                elif self.task in self.SUMMARIZATION_TASKS:
                    SummarizationSparseXP.final_fine_tune_bertarize(src_path, d2, remove_head_pruning=True)
                    tf_model = TFAutoModelForSeq2SeqLM.from_pretrained(d2, from_pt=True)
                else:
                    raise Exception(f"Unknown task {self.task}")

                tf_model.save_pretrained(self.git_path)
                modified = True

        if force or not (self.git_path / "pytorch_model.bin").exists():
            if self.task in self.QA_TASKS:
                model = AutoModelForQuestionAnswering.from_pretrained(src_path)
            elif self.task in self.GLUE_TASKS:
                model = AutoModelForSequenceClassification.from_pretrained(src_path)
            elif self.task in self.SUMMARIZATION_TASKS:
                model = AutoModelForSeq2SeqLM.from_pretrained(src_path)
            else:
                raise Exception(f"Unknown task {self.task}")
            model.save_pretrained(self.git_path)
            modified = True

        src_path = Path(src_path)
        to_copy = self.get_copy_list()

        for files, dest in to_copy:
            dest.mkdir(exist_ok=True)
            for file in files:
                if force or not (dest / file).exists():
                    shutil.copyfile(str(src_path / file), str(dest / file))
                    modified = True
    finally:
        if d is not None:
            d.cleanup()

    # Reload the config, as it may have been changed by compilation / optimization
    # (pruned_heads, gelu_patch, layer_norm_patch).
    with (self.git_path / "config.json").open() as f:
        self.checkpoint_info["config"] = json.load(f)

    return modified
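# The PyTorch -> TensorFlow conversion step in copy_model_files, reduced to a minimal
# standalone sketch (both paths are hypothetical placeholders): from_pt=True loads PyTorch
# weights into the TF model class, and save_pretrained then writes a tf_model.h5 alongside
# the config so the target repo can serve both frameworks.
from transformers import TFAutoModelForSeq2SeqLM

tf_model = TFAutoModelForSeq2SeqLM.from_pretrained("path/to/pytorch_checkpoint", from_pt=True)
tf_model.save_pretrained("path/to/model_repo")  # writes tf_model.h5 and config.json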
def main():
    # region Argument parsing
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_translation", model_args, data_args, framework="tensorflow")
    # endregion

    # region Logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO)
    datasets.utils.logging.set_verbosity(logging.INFO)
    transformers.utils.logging.set_verbosity(logging.INFO)

    # Log on each process the small summary:
    logger.info(f"Training/evaluation parameters {training_args}")
    # endregion

    # region Detecting last checkpoint
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )
    # endregion

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # region Load datasets
    # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files this script will use the first column for the full texts and the second column for the
    # summaries (unless you specify column names for this with the `text_column` and `summary_column` arguments).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
            extension = data_args.train_file.split(".")[-1]
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
            extension = data_args.validation_file.split(".")[-1]
        raw_datasets = load_dataset(
            extension,
            data_files=data_files,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
    # endregion

    # region Load model config and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    prefix = data_args.source_prefix if data_args.source_prefix is not None else ""
    # endregion

    # region Dataset preprocessing
    # We need to tokenize inputs and targets.
    if training_args.do_train:
        column_names = raw_datasets["train"].column_names
    elif training_args.do_eval:
        column_names = raw_datasets["validation"].column_names
    else:
        logger.info("There is nothing to do. Please pass `do_train`, and/or `do_eval`.")
        return

    # For translation we set the codes of our source and target languages (only useful for mBART, the others will
    # ignore those attributes).
    if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)):
        assert data_args.target_lang is not None and data_args.source_lang is not None, (
            f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --source_lang and "
            "--target_lang arguments."
        )
        tokenizer.src_lang = data_args.source_lang
        tokenizer.tgt_lang = data_args.target_lang
        forced_bos_token_id = (
            tokenizer.lang_code_to_id[data_args.forced_bos_token] if data_args.forced_bos_token is not None else None
        )

    # Get the language codes for input/target.
    source_lang = data_args.source_lang.split("_")[0]
    target_lang = data_args.target_lang.split("_")[0]

    # Temporarily set max_target_length for training.
    max_target_length = data_args.max_target_length
    padding = "max_length" if data_args.pad_to_max_length else False

    def preprocess_function(examples):
        inputs = [ex[source_lang] for ex in examples["translation"]]
        targets = [ex[target_lang] for ex in examples["translation"]]
        inputs = [prefix + inp for inp in inputs]
        model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True)

        # Tokenize targets with the `text_target` keyword argument
        labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True)

        # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
        # padding in the loss.
        if padding == "max_length" and data_args.ignore_pad_token_for_loss:
            labels["input_ids"] = [
                [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
            ]
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    if training_args.do_train:
        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = raw_datasets["train"]
        if data_args.max_train_samples is not None:
            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
            train_dataset = train_dataset.select(range(max_train_samples))
        with training_args.main_process_first(desc="train dataset map pre-processing"):
            train_dataset = train_dataset.map(
                preprocess_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on train dataset",
            )
    else:
        train_dataset = None

    if training_args.do_eval:
        max_target_length = data_args.val_max_target_length
        if "validation" not in raw_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = raw_datasets["validation"]
        if data_args.max_eval_samples is not None:
            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
            eval_dataset = eval_dataset.select(range(max_eval_samples))
        with training_args.main_process_first(desc="validation dataset map pre-processing"):
            eval_dataset = eval_dataset.map(
                preprocess_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on validation dataset",
            )
    else:
        eval_dataset = None
    # endregion

    with training_args.strategy.scope():
        # region Prepare model
        model = TFAutoModelForSeq2SeqLM.from_pretrained(
            model_args.model_name_or_path,
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
        model.resize_token_embeddings(len(tokenizer))
        if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)):
            model.config.forced_bos_token_id = forced_bos_token_id
        # endregion

        # region Set decoder_start_token_id
        if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
            assert (
                data_args.target_lang is not None and data_args.source_lang is not None
            ), "mBart requires --target_lang and --source_lang"
            if isinstance(tokenizer, MBartTokenizer):
                model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.target_lang]
            else:
                model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(data_args.target_lang)

        if model.config.decoder_start_token_id is None:
            raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")
        # endregion

        # region Prepare TF Dataset objects
        label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
        data_collator = DataCollatorForSeq2Seq(
            tokenizer,
            model=model,
            label_pad_token_id=label_pad_token_id,
            pad_to_multiple_of=64,  # Reduce the number of unique shapes for XLA, especially for generation
            return_tensors="tf",
        )

        num_replicas = training_args.strategy.num_replicas_in_sync
        total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
        total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas

        dataset_options = tf.data.Options()
        dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF

        # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
        # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
        # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
        # yourself if you use this method, whereas they are automatically inferred from the model input names when
        # using model.prepare_tf_dataset().
        # For more info see the docs:
        # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
        # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
        tf_train_dataset = model.prepare_tf_dataset(
            train_dataset,
            collate_fn=data_collator,
            batch_size=total_train_batch_size,
            shuffle=True,
        ).with_options(dataset_options)
        tf_eval_dataset = model.prepare_tf_dataset(
            eval_dataset,
            collate_fn=data_collator,
            batch_size=total_eval_batch_size,
            shuffle=False,
        ).with_options(dataset_options)
        # endregion

        # region Optimizer and LR scheduling
        num_train_steps = int(len(tf_train_dataset) * training_args.num_train_epochs)
        if training_args.warmup_steps > 0:
            num_warmup_steps = training_args.warmup_steps
        elif training_args.warmup_ratio > 0:
            num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
        else:
            num_warmup_steps = 0
        if training_args.do_train:
            optimizer, lr_schedule = create_optimizer(
                init_lr=training_args.learning_rate,
                num_train_steps=num_train_steps,
                num_warmup_steps=num_warmup_steps,
                adam_beta1=training_args.adam_beta1,
                adam_beta2=training_args.adam_beta2,
                adam_epsilon=training_args.adam_epsilon,
                weight_decay_rate=training_args.weight_decay,
                adam_global_clipnorm=training_args.max_grad_norm,
            )
        else:
            optimizer = None
        # endregion

        # region Metric and postprocessing
        if training_args.do_eval:
            metric = evaluate.load("sacrebleu")

            if data_args.val_max_target_length is None:
                data_args.val_max_target_length = data_args.max_target_length

            gen_kwargs = {
                "max_length": data_args.val_max_target_length,
                "num_beams": data_args.num_beams,
                "no_repeat_ngram_size": 0,  # Not supported under XLA right now, and some models set it by default
            }

            def postprocess_text(preds, labels):
                preds = [pred.strip() for pred in preds]
                labels = [[label.strip()] for label in labels]
                return preds, labels

            def compute_metrics(preds):
                predictions, labels = preds
                if isinstance(predictions, tuple):
                    predictions = predictions[0]
                decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
                labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
                decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
                decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
                metrics = metric.compute(predictions=decoded_preds, references=decoded_labels)
                return {"bleu": metrics["score"]}
{"bleu": metrics["score"]} # The KerasMetricCallback allows metrics that are too complex to write as standard Keras metrics # to be computed each epoch. Any Python code can be included in the metric_fn. This is especially # useful for metrics like BLEU and ROUGE that perform string comparisons on decoded model outputs. # For more information, see the docs at # https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.KerasMetricCallback metric_callback = KerasMetricCallback( metric_fn=compute_metrics, eval_dataset=tf_eval_dataset, predict_with_generate=True, use_xla_generation=True, generate_kwargs=gen_kwargs, ) callbacks = [metric_callback] else: callbacks = [] # endregion # region Preparing push_to_hub and model card push_to_hub_model_id = training_args.push_to_hub_model_id model_name = model_args.model_name_or_path.split("/")[-1] if not push_to_hub_model_id: push_to_hub_model_id = f"{model_name}-finetuned-{data_args.source_lang}-{data_args.target_lang}" model_card_kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": "translation" } if data_args.dataset_name is not None: model_card_kwargs["dataset_tags"] = data_args.dataset_name if data_args.dataset_config_name is not None: model_card_kwargs[ "dataset_args"] = data_args.dataset_config_name model_card_kwargs[ "dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" else: model_card_kwargs["dataset"] = data_args.dataset_name languages = [ l for l in [data_args.source_lang, data_args.target_lang] if l is not None ] if len(languages) > 0: model_card_kwargs["language"] = languages if training_args.push_to_hub: # Because this training can be quite long, we save once per epoch. callbacks.append( PushToHubCallback( output_dir=training_args.output_dir, model_id=push_to_hub_model_id, organization=training_args.push_to_hub_organization, token=training_args.push_to_hub_token, tokenizer=tokenizer, **model_card_kwargs, )) # endregion # region Training eval_metrics = None model.compile(optimizer=optimizer, jit_compile=training_args.xla) if training_args.do_train: logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {training_args.num_train_epochs}") logger.info( f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}" ) logger.info(f" Total train batch size = {total_train_batch_size}") logger.info(f" Total optimization steps = {num_train_steps}") if training_args.xla and not data_args.pad_to_max_length: logger.warning( "XLA training may be slow at first when --pad_to_max_length is not set " "until all possible shapes have been compiled.") history = model.fit(tf_train_dataset, epochs=int(training_args.num_train_epochs), callbacks=callbacks) eval_metrics = { key: val[-1] for key, val in history.history.items() } # endregion # region Validation if training_args.do_eval and not training_args.do_train: # Compiling generation with XLA yields enormous speedups, see https://huggingface.co/blog/tf-xla-generate @tf.function(jit_compile=True) def generate(**kwargs): return model.generate(**kwargs) if training_args.do_eval: logger.info("Evaluation...") for batch, labels in tf_eval_dataset: batch.update(gen_kwargs) generated_tokens = generate(**batch) if isinstance(generated_tokens, tuple): generated_tokens = generated_tokens[0] decoded_preds = tokenizer.batch_decode( generated_tokens, skip_special_tokens=True) labels = np.where(labels != -100, labels, tokenizer.pad_token_id) decoded_labels = 
                    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
                    metric.add_batch(predictions=decoded_preds, references=decoded_labels)

                eval_metrics = metric.compute()
                logger.info({"bleu": eval_metrics["score"]})
        # endregion

    if training_args.output_dir is not None and eval_metrics is not None:
        output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
        with open(output_eval_file, "w") as writer:
            writer.write(json.dumps(eval_metrics))

    if training_args.output_dir is not None and not training_args.push_to_hub:
        # If we're not pushing to hub, at least save a local copy when we're done
        model.save_pretrained(training_args.output_dir)
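# A standalone sketch of the XLA-compiled generation pattern the script above relies on.
# The "t5-small" checkpoint and the fixed max_length=64 padding are illustrative assumptions;
# jit_compile=True compiles generation once per input shape, so padding to a fixed length
# avoids recompiling for every new sentence length.
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small")

xla_generate = tf.function(model.generate, jit_compile=True)

inputs = tokenizer(
    ["translate English to German: Hello, how are you?"],
    return_tensors="tf",
    padding="max_length",
    max_length=64,
)
# First call triggers XLA compilation; subsequent same-shape calls are much faster.
tokens = xla_generate(**inputs, max_new_tokens=32)
print(tokenizer.batch_decode(tokens, skip_special_tokens=True))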