def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() logger.info("Training/evaluation parameters %s", training_args) # Set seed before initializing model. set_seed(training_args.seed) # 1. First, let's load the dataset raw_datasets = DatasetDict() task_name = data_args.task lang_id = data_args.language if task_name is None: raise ValueError("Set --task should be set to '<xtreme_s_task>' " "(e.g. 'fleurs-asr', 'mls', 'covost2', 'minds14') ") if lang_id is None: raise ValueError( "Set --language should be set to the language id of the sub dataset " "config to be used (e.g. 'pl', 'en.tr', 'fr-FR') or 'all'" " for multi-lingual fine-tuning.") if data_args.target_column_name is None: target_column_name = TASK_TO_TARGET_COLUMN_NAME[task_name] else: target_column_name = data_args.target_column_name # here we differentiate between tasks with text as the target and classification tasks is_text_target = target_column_name in ("transcription", "translation") config_name = ".".join([task_name.split("-")[0], lang_id]) if training_args.do_train: raw_datasets["train"] = load_dataset( data_args.dataset_name, config_name, split=data_args.train_split_name, use_auth_token=data_args.use_auth_token, cache_dir=model_args.cache_dir, ) if data_args.audio_column_name not in raw_datasets[ "train"].column_names: raise ValueError( f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--audio_column_name` to the correct audio column - one of " f"{', '.join(raw_datasets['train'].column_names)}.") if target_column_name not in raw_datasets["train"].column_names: raise ValueError( f"--target_column_name {target_column_name} not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--target_column_name` to the correct text column - one of " f"{', '.join(raw_datasets['train'].column_names)}.") if data_args.max_train_samples is not None: raw_datasets["train"] = raw_datasets["train"].select( range(data_args.max_train_samples)) if not is_text_target: label_list = raw_datasets["train"].features[ target_column_name].names num_labels = len(label_list) if training_args.do_eval: raw_datasets["eval"] = load_dataset( data_args.dataset_name, config_name, split=data_args.eval_split_name, use_auth_token=data_args.use_auth_token, cache_dir=model_args.cache_dir, ) if data_args.max_eval_samples is not None: raw_datasets["eval"] = raw_datasets["eval"].select( range(data_args.max_eval_samples)) if training_args.do_predict: raw_datasets["predict"] = load_dataset( data_args.dataset_name, config_name, split=data_args.predict_split_name, use_auth_token=data_args.use_auth_token, cache_dir=model_args.cache_dir, ) if data_args.max_predict_samples is not None: raw_datasets["predict"] = raw_datasets["predict"].select( range(data_args.max_predict_samples)) # 2. We remove some special characters from the datasets # that make training complicated and do not help in transcribing the speech # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic # that could be easily picked up by the model chars_to_ignore_regex = (f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None) def remove_special_characters(batch): if chars_to_ignore_regex is not None: batch["target_text"] = re.sub( chars_to_ignore_regex, "", batch[target_column_name]).lower() + " " else: batch["target_text"] = batch[target_column_name].lower() + " " return batch if is_text_target: with training_args.main_process_first( desc="dataset map special characters removal"): raw_datasets = raw_datasets.map( remove_special_characters, remove_columns=[target_column_name], desc="remove special characters from datasets", ) # save special tokens for tokenizer word_delimiter_token = data_args.word_delimiter_token unk_token = data_args.unk_token pad_token = data_args.pad_token # 3. Next, let's load the config as we might need it to create # the tokenizer config = AutoConfig.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token) if is_text_target: # 4. (Optional, for ASR and translation) If no tokenizer file is defined, # we create the vocabulary of the model by extracting all unique characters from # the training and evaluation datasets # We need to make sure that only first rank saves vocabulary # make sure all processes wait until vocab is created tokenizer_name_or_path = model_args.tokenizer_name_or_path tokenizer_kwargs = {} if tokenizer_name_or_path is None: # save vocab in training output dir tokenizer_name_or_path = training_args.output_dir vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json") with training_args.main_process_first(): if training_args.overwrite_output_dir and os.path.isfile( vocab_file): os.remove(vocab_file) with training_args.main_process_first( desc="dataset map vocabulary creation"): if not os.path.isfile(vocab_file): os.makedirs(tokenizer_name_or_path, exist_ok=True) vocab_dict = create_vocabulary_from_data( raw_datasets, word_delimiter_token=word_delimiter_token, unk_token=unk_token, pad_token=pad_token, ) # save vocab dict to be loaded into tokenizer with open(vocab_file, "w") as file: json.dump(vocab_dict, file) # if tokenizer has just been created # it is defined by `tokenizer_class` if present in config else by `model_type` if not config.is_encoder_decoder: tokenizer_kwargs = { "config": config if config.tokenizer_class is not None else None, "tokenizer_type": config.model_type if config.tokenizer_class is None else None, "unk_token": unk_token, "pad_token": pad_token, "word_delimiter_token": word_delimiter_token, } else: tokenizer_kwargs = {} # 5. Now we can instantiate the feature extractor, tokenizer and model # Note for distributed training, the .from_pretrained methods guarantee that only # one local process can concurrently download model & vocab. # load feature_extractor and tokenizer if is_text_target: tokenizer = AutoTokenizer.from_pretrained( tokenizer_name_or_path, use_auth_token=data_args.use_auth_token, **tokenizer_kwargs, ) feature_extractor = AutoFeatureExtractor.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token) # adapt config config.update({ "feat_proj_dropout": model_args.feat_proj_dropout, "attention_dropout": model_args.attention_dropout, "hidden_dropout": model_args.hidden_dropout, "final_dropout": model_args.final_dropout, "mask_time_prob": model_args.mask_time_prob, "mask_time_length": model_args.mask_time_length, "mask_feature_prob": model_args.mask_feature_prob, "mask_feature_length": model_args.mask_feature_length, "gradient_checkpointing": training_args.gradient_checkpointing, "layerdrop": model_args.layerdrop, "ctc_loss_reduction": model_args.ctc_loss_reduction, "activation_dropout": model_args.activation_dropout, }) if training_args.do_train: if is_text_target: config.pad_token_id = tokenizer.pad_token_id config.vocab_size = len(tokenizer) else: label_to_id = {v: i for i, v in enumerate(label_list)} config.label2id = label_to_id config.id2label = {id: label for label, id in label_to_id.items()} config.num_labels = num_labels # create model if target_column_name == "transcription": model = AutoModelForCTC.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, config=config, use_auth_token=data_args.use_auth_token, ) elif config.is_encoder_decoder: model = AutoModelForSpeechSeq2Seq.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, config=config, use_auth_token=data_args.use_auth_token, ) if model.config.decoder_start_token_id is None: raise ValueError( "Make sure that `config.decoder_start_token_id` is correctly defined" ) else: model = AutoModelForAudioClassification.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, config=config, use_auth_token=data_args.use_auth_token, ) # freeze encoder if model_args.freeze_feature_encoder: model.freeze_feature_encoder() # 6. Now we preprocess the datasets including loading the audio, resampling and normalization # Thankfully, `datasets` takes care of automatically loading and resampling the audio, # so that we just need to set the correct target sampling rate and normalize the input # via the `feature_extractor` # make sure that dataset decodes audio with correct sampling rate dataset_sampling_rate = next(iter(raw_datasets.values())).features[ data_args.audio_column_name].sampling_rate if dataset_sampling_rate != feature_extractor.sampling_rate: raw_datasets = raw_datasets.cast_column( data_args.audio_column_name, datasets.features.Audio( sampling_rate=feature_extractor.sampling_rate)) # derive max & min input length for sample rate & max duration max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate audio_column_name = data_args.audio_column_name num_workers = data_args.preprocessing_num_workers # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification phoneme_language = data_args.phoneme_language # Preprocessing the datasets. # We need to read the audio files as arrays and tokenize the targets. def prepare_dataset(batch): # load audio sample = batch[audio_column_name] inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"]) batch["input_values"] = inputs.input_values[0] batch["length"] = len(batch["input_values"]) # encode targets additional_kwargs = {} if phoneme_language is not None: additional_kwargs["phonemizer_lang"] = phoneme_language if is_text_target: batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids else: batch["labels"] = batch[target_column_name] return batch with training_args.main_process_first(desc="dataset map preprocessing"): vectorized_datasets = raw_datasets.map( prepare_dataset, remove_columns=next(iter(raw_datasets.values())).column_names, num_proc=num_workers, desc="preprocess datasets", ) if training_args.do_train: def is_audio_in_length_range(length): return length > min_input_length and length < max_input_length # filter data that is shorter than min_input_length vectorized_datasets["train"] = vectorized_datasets["train"].filter( is_audio_in_length_range, num_proc=num_workers, input_columns=["length"], ) # 7. Next, we can prepare for the training step. # Let's use the appropriate XTREME-S evaluation metric, # instantiate a data collator and the trainer # Define evaluation metrics during training, *i.e.* word error rate, character error rate eval_metric = load_metric("xtreme_s", task_name) # for large datasets it is advised to run the preprocessing on a # single machine first with ``args.preprocessing_only`` since there will mostly likely # be a timeout when running the script in distributed mode. # In a second step ``args.preprocessing_only`` can then be set to `False` to load the # cached dataset if data_args.preprocessing_only: logger.info( f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}" ) return def compute_asr_metric(pred): pred_logits = pred.predictions pred_ids = np.argmax(pred_logits, axis=-1) pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id pred_str = tokenizer.batch_decode(pred_ids) # we do not want to group tokens when computing the metrics label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False) metric = eval_metric.compute(predictions=pred_str, references=label_str) return metric def compute_classification_metric(pred): pred_ids = np.argmax(pred.predictions, axis=1) metric = eval_metric.compute(predictions=pred_ids, references=pred.label_ids) return metric # Now save everything to be able to create a single processor later if is_main_process(training_args.local_rank): # save feature extractor, tokenizer and config feature_extractor.save_pretrained(training_args.output_dir) if is_text_target: tokenizer.save_pretrained(training_args.output_dir) config.save_pretrained(training_args.output_dir) # wait until configs are saved in the main process before loading the processor torch.distributed.barrier() if is_text_target: processor = AutoProcessor.from_pretrained(training_args.output_dir) else: processor = AutoFeatureExtractor.from_pretrained( training_args.output_dir) # Instantiate custom data collator data_collator = SpeechDataCollatorWithPadding(processor=processor, pad_labels=is_text_target) # Initialize Trainer if target_column_name == "translation": trainer = Seq2SeqTrainer( model=model, data_collator=data_collator, args=training_args, compute_metrics=compute_asr_metric if training_args.predict_with_generate else None, train_dataset=vectorized_datasets["train"] if training_args.do_train else None, eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, tokenizer=feature_extractor, ) else: trainer = Trainer( model=model, data_collator=data_collator, args=training_args, compute_metrics=compute_asr_metric if is_text_target else compute_classification_metric, train_dataset=vectorized_datasets["train"] if training_args.do_train else None, eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, tokenizer=feature_extractor, ) # 8. Finally, we can start training # Training if training_args.do_train: # use last checkpoint if exist if last_checkpoint is not None: checkpoint = last_checkpoint elif os.path.isdir(model_args.model_name_or_path): checkpoint = model_args.model_name_or_path else: checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(vectorized_datasets["train"])) metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"])) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation on the test set results = {} if training_args.do_predict: logger.info( f"*** Evaluating on the `{data_args.predict_split_name}` set ***") metrics = trainer.evaluate(vectorized_datasets["predict"]) max_predict_samples = (data_args.max_predict_samples if data_args.max_predict_samples is not None else len(vectorized_datasets["predict"])) metrics["predict_samples"] = min(max_predict_samples, len(vectorized_datasets["predict"])) trainer.log_metrics("predict", metrics) trainer.save_metrics("predict", metrics) # Write model card and (optionally) push to hub kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": task_name, "tags": [task_name, data_args.dataset_name], "dataset_args": f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}, Predict split: {data_args.predict_split_name}", "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}", "language": data_args.language, } if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: trainer.create_model_card(**kwargs) return results
def main(): # 1. Parse input arguments # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_speech_recognition_seq2seq", model_args, data_args) # 2. Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) transformers.utils.logging.set_verbosity(log_level) transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() logger.info("Training/evaluation parameters %s", training_args) # 3. Detecting last checkpoint and eventualy continue from last checkpoint last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Set seed before initializing model. set_seed(training_args.seed) # 4. Load dataset raw_datasets = DatasetDict() if training_args.do_train: raw_datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=data_args.train_split_name, use_auth_token=True if model_args.use_auth_token else None, ) if training_args.do_eval: raw_datasets["eval"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=data_args.eval_split_name, use_auth_token=True if model_args.use_auth_token else None, ) if data_args.audio_column_name not in next(iter( raw_datasets.values())).column_names: raise ValueError( f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--audio_column_name` to the correct audio column - one of " f"{', '.join(next(iter(raw_datasets.values())).column_names)}.") if data_args.text_column_name not in next(iter( raw_datasets.values())).column_names: raise ValueError( f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--text_column_name` to the correct text column - one of " f"{', '.join(next(iter(raw_datasets.values())).column_names)}.") # 5. Load pretrained model, tokenizer, and feature extractor # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) feature_extractor = AutoFeatureExtractor.from_pretrained( model_args.feature_extractor_name if model_args.feature_extractor_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = AutoModelForSpeechSeq2Seq.from_pretrained( model_args.model_name_or_path, config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) if model.config.decoder_start_token_id is None: raise ValueError( "Make sure that `config.decoder_start_token_id` is correctly defined" ) if model_args.freeze_feature_encoder: model.freeze_feature_encoder() # 6. Resample speech dataset if necassary dataset_sampling_rate = next(iter(raw_datasets.values())).features[ data_args.audio_column_name].sampling_rate if dataset_sampling_rate != feature_extractor.sampling_rate: raw_datasets = raw_datasets.cast_column( data_args.audio_column_name, datasets.features.Audio( sampling_rate=feature_extractor.sampling_rate)) # 7. Preprocessing the datasets. # We need to read the audio files as arrays and tokenize the targets. max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate audio_column_name = data_args.audio_column_name num_workers = data_args.preprocessing_num_workers text_column_name = data_args.text_column_name model_input_name = feature_extractor.model_input_names[0] do_lower_case = data_args.do_lower_case if data_args.max_train_samples is not None: raw_datasets["train"] = raw_datasets["train"].select( range(data_args.max_train_samples)) if data_args.max_eval_samples is not None: raw_datasets["eval"] = raw_datasets["eval"].select( range(data_args.max_eval_samples)) def prepare_dataset(batch): # process audio sample = batch[audio_column_name] inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"]) # process audio length batch[model_input_name] = inputs.input_values[0] batch["input_length"] = len(batch["input_values"]) # process targets input_str = batch[text_column_name].lower( ) if do_lower_case else batch[text_column_name] batch["labels"] = tokenizer(input_str).input_ids return batch with training_args.main_process_first(desc="dataset map pre-processing"): vectorized_datasets = raw_datasets.map( prepare_dataset, remove_columns=next(iter(raw_datasets.values())).column_names, num_proc=data_args.preprocessing_num_workers, desc="preprocess train dataset", ) # filter data that is shorter than min_input_length or longer than # max_input_length def is_audio_in_length_range(length): return length > min_input_length and length < max_input_length vectorized_datasets = vectorized_datasets.filter( is_audio_in_length_range, num_proc=num_workers, input_columns=["input_length"], ) # for large datasets it is advised to run the preprocessing on a # single machine first with `args.preprocessing_only` since there will mostly likely # be a timeout when running the script in distributed mode. # In a second step `args.preprocessing_only` can then be set to `False` to load the # cached dataset if data_args.preprocessing_only: cache = {k: v.cache_files for k, v in vectorized_datasets.items()} logger.info(f"Data preprocessing finished. Files cached at {cache}.") return # 8. Load Metric metric = evaluate.load("wer") def compute_metrics(pred): pred_ids = pred.predictions pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True) # we do not want to group tokens when computing the metrics label_str = tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True) wer = metric.compute(predictions=pred_str, references=label_str) return {"wer": wer} # 9. Create a single speech processor if is_main_process(training_args.local_rank): # save feature extractor, tokenizer and config feature_extractor.save_pretrained(training_args.output_dir) tokenizer.save_pretrained(training_args.output_dir) config.save_pretrained(training_args.output_dir) processor = AutoProcessor.from_pretrained(training_args.output_dir) # 10. Define data collator data_collator = DataCollatorSpeechSeq2SeqWithPadding( processor=processor, decoder_start_token_id=model.config.decoder_start_token_id) # 11. Initialize Trainer trainer = Seq2SeqTrainer( model=model, args=training_args, train_dataset=vectorized_datasets["train"] if training_args.do_train else None, eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, tokenizer=feature_extractor, data_collator=data_collator, compute_metrics=compute_metrics if training_args.predict_with_generate else None, ) # 12. Training if training_args.do_train: checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the feature extractor too for easy upload metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(vectorized_datasets["train"])) metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"])) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # 13. Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate(metric_key_prefix="eval", max_length=model.config.max_length, num_beams=model.config.num_beams) max_eval_samples = (data_args.max_eval_samples if data_args.max_eval_samples is not None else len( vectorized_datasets["eval"])) metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"])) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # 14. Write Training Stats kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": "speech recognition" } if data_args.dataset_name is not None: kwargs["dataset_tags"] = data_args.dataset_name if data_args.dataset_config_name is not None: kwargs["dataset_args"] = data_args.dataset_config_name kwargs[ "dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" else: kwargs["dataset"] = data_args.dataset_name if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: trainer.create_model_card(**kwargs) return results
def main(): raw_datasets = DatasetDict() if training_args.do_train: raw_datasets["train"] = load_dataset(data_args.dataset_name, data_args.dataset_config, split=data_args.train_split_name) if training_args.do_eval: raw_datasets["eval"] = load_dataset(data_args.dataset_name, data_args.dataset_config, split=data_args.eval_split_name) if data_args.audio_column not in next(iter( raw_datasets.values())).column_names: raise ValueError( f"--audio_column '{data_args.audio_column}' not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--audio_column` to the correct audio column - one of " f"{', '.join(next(iter(raw_datasets.values())).column_names)}.") if data_args.text_column not in next(iter( raw_datasets.values())).column_names: raise ValueError( f"--text_column {data_args.text_column} not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--text_column` to the correct text column - one of " f"{', '.join(next(iter(raw_datasets.values())).column_names)}.") # 5. Load pretrained model, tokenizer, and feature extractor # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name, cache_dir=model_args.cache_dir, revision=model_args.model_version, use_auth_token=True if model_args.use_auth_token else None, ) feature_extractor = AutoFeatureExtractor.from_pretrained( model_args.feature_extractor if model_args.feature_extractor else model_args.model_name, cache_dir=model_args.cache_dir, revision=model_args.model_version, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_version, use_auth_token=True if model_args.use_auth_token else None, ) model = AutoModelForSpeechSeq2Seq.from_pretrained( model_args.model_name, config=config, cache_dir=model_args.cache_dir, revision=model_args.model_version, use_auth_token=True if model_args.use_auth_token else None, ) if model.config.dec_START is None: raise ValueError( "Make sure that `config.dec_START` is correctly defined") if model_args.freeze_feature_encoder: model.freeze_feature_encoder() # 6. Resample speech dataset if necassary dataset_sampling_rate = (next(iter( raw_datasets.values())).features[data_args.audio_column].sampling_rate) if dataset_sampling_rate != feature_extractor.sampling_rate: raw_datasets = raw_datasets.cast_column( data_args.audio_column, datasets.features.Audio( sampling_rate=feature_extractor.sampling_rate), ) # 7. Preprocessing the datasets. # We need to read the audio files as arrays and tokenize the targets. max_input_length = data_args.max_duration * feature_extractor.sampling_rate min_input_length = data_args.min_duration * feature_extractor.sampling_rate audio_column = data_args.audio_column num_workers = data_args.num_workers text_column = data_args.text_column model_input_name = feature_extractor.model_input_names[0] lower_case = data_args.lower_case if data_args.max_train_samples is not None: raw_datasets["train"] = raw_datasets["train"].select( range(data_args.max_train_samples)) if data_args.max_eval_samples is not None: raw_datasets["eval"] = raw_datasets["eval"].select( range(data_args.max_eval_samples)) def prepare_dataset(batch): # process audio sample = batch[audio_column] inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"]) # process audio length batch[model_input_name] = inputs.input_values[0] batch["input_length"] = len(batch["input_values"]) # process targets input_str = batch[text_column].lower( ) if lower_case else batch[text_column] batch["labels"] = tokenizer(input_str).input_ids return batch with training_args.main_process_first(desc="dataset map pre-processing"): vectorized_datasets = raw_datasets.map( prepare_dataset, remove_columns=next(iter(raw_datasets.values())).column_names, num_proc=data_args.num_workers, desc="preprocess train dataset", ) # filter data that is shorter than min_input_length or longer than # max_input_length def is_audio_in_length_range(length): return length > min_input_length and length < max_input_length vectorized_datasets = vectorized_datasets.filter( is_audio_in_length_range, num_proc=num_workers, input_columns=["input_length"], ) # for large datasets it is advised to run the preprocessing on a # single machine first with `args.preprocessing_only` since there will mostly likely # be a timeout when running the script in distributed mode. # In a second step `args.preprocessing_only` can then be set to `False` to load the # cached dataset if data_args.preprocessing_only: cache = {k: v.cache_files for k, v in vectorized_datasets.items()} logger.info(f"Data preprocessing finished. Files cached at {cache}.") return # 8. Load Metric metric = load_metric("wer") def compute_metrics(pred): pred_ids = pred.predictions pred.label_ids[pred.label_ids == -100] = tokenizer.PAD pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True) # we do not want to group tokens when computing the metrics label_str = tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True) wer = metric.compute(predictions=pred_str, references=label_str) return {"wer": wer} # 9. Create a single speech processor if is_main_process(training_args.local_rank): # save feature extractor, tokenizer and config feature_extractor.save_pretrained(training_args.out_dir) tokenizer.save_pretrained(training_args.out_dir) config.save_pretrained(training_args.out_dir) processor = AutoProcessor.from_pretrained(training_args.out_dir) # 10. Define data collator data_collator = DataCollatorSpeechSeq2SeqWithPadding( processor=processor, dec_START=model.config.dec_START) # 11. Initialize Trainer trainer = Seq2SeqTrainer( model=model, args=training_args, train_dataset=vectorized_datasets["train"] if training_args.do_train else None, eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, tokenizer=feature_extractor, data_collator=data_collator, compute_metrics=compute_metrics if training_args.test_with_gen else None, ) # 12. Training if training_args.do_train: checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the feature extractor too for easy upload metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(vectorized_datasets["train"])) metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"])) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # 13. Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate( metric_key_prefix="eval", max_len=model.config.max_len, n_beams=model.config.n_beams, ) max_eval_samples = (data_args.max_eval_samples if data_args.max_eval_samples is not None else len( vectorized_datasets["eval"])) metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"])) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # 14. Write Training Stats kw = { "finetuned_from": model_args.model_name, "tasks": "speech recognition" } if data_args.dataset_name is not None: kw["dataset_tags"] = data_args.dataset_name if data_args.dataset_config is not None: kw["dataset_args"] = data_args.dataset_config kw["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config}" else: kw["dataset"] = data_args.dataset_name if training_args.push_to_hub: trainer.push_to_hub(**kw) else: trainer.create_model_card(**kw) return results