def _load_model(self): self.model = AutoModelForCTC.from_pretrained(self.model_path) self.model.to(self.device) try: self.processor = Wav2Vec2Processor.from_pretrained(self.model_path) self.token_set = TokenSet.from_processor(self.processor) except Exception: logger.warning("Not fine-tuned model! You'll need to fine-tune it before use this model for audio transcription") self.processor = None self.token_set = None
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() logger.info("Training/evaluation parameters %s", training_args) # Set seed before initializing model. set_seed(training_args.seed) # 1. First, let's load the dataset raw_datasets = DatasetDict() task_name = data_args.task lang_id = data_args.language if task_name is None: raise ValueError("Set --task should be set to '<xtreme_s_task>' " "(e.g. 'fleurs-asr', 'mls', 'covost2', 'minds14') ") if lang_id is None: raise ValueError( "Set --language should be set to the language id of the sub dataset " "config to be used (e.g. 'pl', 'en.tr', 'fr-FR') or 'all'" " for multi-lingual fine-tuning.") if data_args.target_column_name is None: target_column_name = TASK_TO_TARGET_COLUMN_NAME[task_name] else: target_column_name = data_args.target_column_name # here we differentiate between tasks with text as the target and classification tasks is_text_target = target_column_name in ("transcription", "translation") config_name = ".".join([task_name.split("-")[0], lang_id]) if training_args.do_train: raw_datasets["train"] = load_dataset( data_args.dataset_name, config_name, split=data_args.train_split_name, use_auth_token=data_args.use_auth_token, cache_dir=model_args.cache_dir, ) if data_args.audio_column_name not in raw_datasets[ "train"].column_names: raise ValueError( f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--audio_column_name` to the correct audio column - one of " f"{', '.join(raw_datasets['train'].column_names)}.") if target_column_name not in raw_datasets["train"].column_names: raise ValueError( f"--target_column_name {target_column_name} not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--target_column_name` to the correct text column - one of " f"{', '.join(raw_datasets['train'].column_names)}.") if data_args.max_train_samples is not None: raw_datasets["train"] = raw_datasets["train"].select( range(data_args.max_train_samples)) if not is_text_target: label_list = raw_datasets["train"].features[ target_column_name].names num_labels = len(label_list) if training_args.do_eval: raw_datasets["eval"] = load_dataset( data_args.dataset_name, config_name, split=data_args.eval_split_name, use_auth_token=data_args.use_auth_token, cache_dir=model_args.cache_dir, ) if data_args.max_eval_samples is not None: raw_datasets["eval"] = raw_datasets["eval"].select( range(data_args.max_eval_samples)) if training_args.do_predict: raw_datasets["predict"] = load_dataset( data_args.dataset_name, config_name, split=data_args.predict_split_name, use_auth_token=data_args.use_auth_token, cache_dir=model_args.cache_dir, ) if data_args.max_predict_samples is not None: raw_datasets["predict"] = raw_datasets["predict"].select( range(data_args.max_predict_samples)) # 2. We remove some special characters from the datasets # that make training complicated and do not help in transcribing the speech # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic # that could be easily picked up by the model chars_to_ignore_regex = (f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None) def remove_special_characters(batch): if chars_to_ignore_regex is not None: batch["target_text"] = re.sub( chars_to_ignore_regex, "", batch[target_column_name]).lower() + " " else: batch["target_text"] = batch[target_column_name].lower() + " " return batch if is_text_target: with training_args.main_process_first( desc="dataset map special characters removal"): raw_datasets = raw_datasets.map( remove_special_characters, remove_columns=[target_column_name], desc="remove special characters from datasets", ) # save special tokens for tokenizer word_delimiter_token = data_args.word_delimiter_token unk_token = data_args.unk_token pad_token = data_args.pad_token # 3. Next, let's load the config as we might need it to create # the tokenizer config = AutoConfig.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token) if is_text_target: # 4. (Optional, for ASR and translation) If no tokenizer file is defined, # we create the vocabulary of the model by extracting all unique characters from # the training and evaluation datasets # We need to make sure that only first rank saves vocabulary # make sure all processes wait until vocab is created tokenizer_name_or_path = model_args.tokenizer_name_or_path tokenizer_kwargs = {} if tokenizer_name_or_path is None: # save vocab in training output dir tokenizer_name_or_path = training_args.output_dir vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json") with training_args.main_process_first(): if training_args.overwrite_output_dir and os.path.isfile( vocab_file): os.remove(vocab_file) with training_args.main_process_first( desc="dataset map vocabulary creation"): if not os.path.isfile(vocab_file): os.makedirs(tokenizer_name_or_path, exist_ok=True) vocab_dict = create_vocabulary_from_data( raw_datasets, word_delimiter_token=word_delimiter_token, unk_token=unk_token, pad_token=pad_token, ) # save vocab dict to be loaded into tokenizer with open(vocab_file, "w") as file: json.dump(vocab_dict, file) # if tokenizer has just been created # it is defined by `tokenizer_class` if present in config else by `model_type` if not config.is_encoder_decoder: tokenizer_kwargs = { "config": config if config.tokenizer_class is not None else None, "tokenizer_type": config.model_type if config.tokenizer_class is None else None, "unk_token": unk_token, "pad_token": pad_token, "word_delimiter_token": word_delimiter_token, } else: tokenizer_kwargs = {} # 5. Now we can instantiate the feature extractor, tokenizer and model # Note for distributed training, the .from_pretrained methods guarantee that only # one local process can concurrently download model & vocab. # load feature_extractor and tokenizer if is_text_target: tokenizer = AutoTokenizer.from_pretrained( tokenizer_name_or_path, use_auth_token=data_args.use_auth_token, **tokenizer_kwargs, ) feature_extractor = AutoFeatureExtractor.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token) # adapt config config.update({ "feat_proj_dropout": model_args.feat_proj_dropout, "attention_dropout": model_args.attention_dropout, "hidden_dropout": model_args.hidden_dropout, "final_dropout": model_args.final_dropout, "mask_time_prob": model_args.mask_time_prob, "mask_time_length": model_args.mask_time_length, "mask_feature_prob": model_args.mask_feature_prob, "mask_feature_length": model_args.mask_feature_length, "gradient_checkpointing": training_args.gradient_checkpointing, "layerdrop": model_args.layerdrop, "ctc_loss_reduction": model_args.ctc_loss_reduction, "activation_dropout": model_args.activation_dropout, }) if training_args.do_train: if is_text_target: config.pad_token_id = tokenizer.pad_token_id config.vocab_size = len(tokenizer) else: label_to_id = {v: i for i, v in enumerate(label_list)} config.label2id = label_to_id config.id2label = {id: label for label, id in label_to_id.items()} config.num_labels = num_labels # create model if target_column_name == "transcription": model = AutoModelForCTC.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, config=config, use_auth_token=data_args.use_auth_token, ) elif config.is_encoder_decoder: model = AutoModelForSpeechSeq2Seq.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, config=config, use_auth_token=data_args.use_auth_token, ) if model.config.decoder_start_token_id is None: raise ValueError( "Make sure that `config.decoder_start_token_id` is correctly defined" ) else: model = AutoModelForAudioClassification.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, config=config, use_auth_token=data_args.use_auth_token, ) # freeze encoder if model_args.freeze_feature_encoder: model.freeze_feature_encoder() # 6. Now we preprocess the datasets including loading the audio, resampling and normalization # Thankfully, `datasets` takes care of automatically loading and resampling the audio, # so that we just need to set the correct target sampling rate and normalize the input # via the `feature_extractor` # make sure that dataset decodes audio with correct sampling rate dataset_sampling_rate = next(iter(raw_datasets.values())).features[ data_args.audio_column_name].sampling_rate if dataset_sampling_rate != feature_extractor.sampling_rate: raw_datasets = raw_datasets.cast_column( data_args.audio_column_name, datasets.features.Audio( sampling_rate=feature_extractor.sampling_rate)) # derive max & min input length for sample rate & max duration max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate audio_column_name = data_args.audio_column_name num_workers = data_args.preprocessing_num_workers # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification phoneme_language = data_args.phoneme_language # Preprocessing the datasets. # We need to read the audio files as arrays and tokenize the targets. def prepare_dataset(batch): # load audio sample = batch[audio_column_name] inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"]) batch["input_values"] = inputs.input_values[0] batch["length"] = len(batch["input_values"]) # encode targets additional_kwargs = {} if phoneme_language is not None: additional_kwargs["phonemizer_lang"] = phoneme_language if is_text_target: batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids else: batch["labels"] = batch[target_column_name] return batch with training_args.main_process_first(desc="dataset map preprocessing"): vectorized_datasets = raw_datasets.map( prepare_dataset, remove_columns=next(iter(raw_datasets.values())).column_names, num_proc=num_workers, desc="preprocess datasets", ) if training_args.do_train: def is_audio_in_length_range(length): return length > min_input_length and length < max_input_length # filter data that is shorter than min_input_length vectorized_datasets["train"] = vectorized_datasets["train"].filter( is_audio_in_length_range, num_proc=num_workers, input_columns=["length"], ) # 7. Next, we can prepare for the training step. # Let's use the appropriate XTREME-S evaluation metric, # instantiate a data collator and the trainer # Define evaluation metrics during training, *i.e.* word error rate, character error rate eval_metric = load_metric("xtreme_s", task_name) # for large datasets it is advised to run the preprocessing on a # single machine first with ``args.preprocessing_only`` since there will mostly likely # be a timeout when running the script in distributed mode. # In a second step ``args.preprocessing_only`` can then be set to `False` to load the # cached dataset if data_args.preprocessing_only: logger.info( f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}" ) return def compute_asr_metric(pred): pred_logits = pred.predictions pred_ids = np.argmax(pred_logits, axis=-1) pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id pred_str = tokenizer.batch_decode(pred_ids) # we do not want to group tokens when computing the metrics label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False) metric = eval_metric.compute(predictions=pred_str, references=label_str) return metric def compute_classification_metric(pred): pred_ids = np.argmax(pred.predictions, axis=1) metric = eval_metric.compute(predictions=pred_ids, references=pred.label_ids) return metric # Now save everything to be able to create a single processor later if is_main_process(training_args.local_rank): # save feature extractor, tokenizer and config feature_extractor.save_pretrained(training_args.output_dir) if is_text_target: tokenizer.save_pretrained(training_args.output_dir) config.save_pretrained(training_args.output_dir) # wait until configs are saved in the main process before loading the processor torch.distributed.barrier() if is_text_target: processor = AutoProcessor.from_pretrained(training_args.output_dir) else: processor = AutoFeatureExtractor.from_pretrained( training_args.output_dir) # Instantiate custom data collator data_collator = SpeechDataCollatorWithPadding(processor=processor, pad_labels=is_text_target) # Initialize Trainer if target_column_name == "translation": trainer = Seq2SeqTrainer( model=model, data_collator=data_collator, args=training_args, compute_metrics=compute_asr_metric if training_args.predict_with_generate else None, train_dataset=vectorized_datasets["train"] if training_args.do_train else None, eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, tokenizer=feature_extractor, ) else: trainer = Trainer( model=model, data_collator=data_collator, args=training_args, compute_metrics=compute_asr_metric if is_text_target else compute_classification_metric, train_dataset=vectorized_datasets["train"] if training_args.do_train else None, eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, tokenizer=feature_extractor, ) # 8. Finally, we can start training # Training if training_args.do_train: # use last checkpoint if exist if last_checkpoint is not None: checkpoint = last_checkpoint elif os.path.isdir(model_args.model_name_or_path): checkpoint = model_args.model_name_or_path else: checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(vectorized_datasets["train"])) metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"])) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation on the test set results = {} if training_args.do_predict: logger.info( f"*** Evaluating on the `{data_args.predict_split_name}` set ***") metrics = trainer.evaluate(vectorized_datasets["predict"]) max_predict_samples = (data_args.max_predict_samples if data_args.max_predict_samples is not None else len(vectorized_datasets["predict"])) metrics["predict_samples"] = min(max_predict_samples, len(vectorized_datasets["predict"])) trainer.log_metrics("predict", metrics) trainer.save_metrics("predict", metrics) # Write model card and (optionally) push to hub kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": task_name, "tags": [task_name, data_args.dataset_name], "dataset_args": f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}, Predict split: {data_args.predict_split_name}", "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}", "language": data_args.language, } if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: trainer.create_model_card(**kwargs) return results
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() logger.info("Training/evaluation parameters %s", training_args) # Set seed before initializing model. set_seed(training_args.seed) # 1. First, let's load the dataset raw_datasets = DatasetDict() raw_datasets["train"] = load_dataset(data_args.dataset_name, data_args.dataset_config_name, split=data_args.train_split_name) raw_datasets["eval"] = load_dataset(data_args.dataset_name, data_args.dataset_config_name, split=data_args.eval_split_name) if data_args.audio_column_name not in raw_datasets["train"].column_names: raise ValueError( f"--audio_column_name {data_args.audio_column_name} not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--audio_column_name` to the correct audio column - one of " f"{', '.join(raw_datasets['train'].column_names)}.") if data_args.text_column_name not in raw_datasets["train"].column_names: raise ValueError( f"--text_column_name {data_args.audio_column_name} not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--text_column_name` to the correct text column - one of " f"{', '.join(raw_datasets['train'].column_names)}.") # prepare dataset if data_args.max_train_samples is not None: raw_datasets["train"] = raw_datasets["train"].select( range(data_args.max_train_samples)) if data_args.max_eval_samples is not None: raw_datasets["eval"] = raw_datasets["eval"].select( range(data_args.max_eval_samples)) # 2. We remove some special characters from the datasets # that make training complicated and do not help in transcribing the speech # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic # that could be easily picked up by the model chars_to_ignore_regex = (f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None) def remove_special_characters(batch): if chars_to_ignore_regex is not None: batch["target_text"] = re.sub( chars_to_ignore_regex, "", batch[data_args.text_column_name]).lower() + " " else: batch["target_text"] = batch[ data_args.text_column_name].lower() + " " return batch with training_args.main_process_first( desc="dataset map special characters removal"): raw_datasets = raw_datasets.map( remove_special_characters, remove_columns=[data_args.text_column_name], desc="remove special characters from datasets", ) # 3. Next, we create the vocabulary of the model by extracting all unique characters from # the training and evaluation datasets # We need to make sure that only first rank saves vocabulary # make sure all processes wait until vocab is created with training_args.main_process_first( desc="dataset map vocabulary creation"): vocab_dict = create_vocabulary_from_data(raw_datasets) vocab_file = os.path.join(training_args.output_dir, "vocab.json") # save vocab dict to be loaded into tokenizer os.makedirs(training_args.output_dir, exist_ok=True) if training_args.overwrite_output_dir and os.path.isfile(vocab_file): os.remove(vocab_file) if not os.path.isfile(vocab_file): with open(vocab_file, "w") as vocab_file: json.dump(vocab_dict, vocab_file) # 4. Now we can instantiate the configuration, feature extractor, tokenizer and model # Note for distributed training, the .from_pretrained methods guarantee that only # one local process can concurrently download model & vocab. # load config config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir) # load feature_extractor, tokenizer and create processor tokenizer = AutoTokenizer.from_pretrained( training_args.output_dir, tokenizer_type=config.model_type, unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|", ) feature_extractor = AutoFeatureExtractor.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir) processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) # adapt config config.update({ "feat_proj_dropout": model_args.feat_proj_dropout, "attention_dropout": model_args.attention_dropout, "hidden_dropout": model_args.hidden_dropout, "final_dropout": model_args.final_dropout, "mask_time_prob": model_args.mask_time_prob, "gradient_checkpointing": training_args.gradient_checkpointing, "layerdrop": model_args.layerdrop, "ctc_loss_reduction": model_args.ctc_loss_reduction, "pad_token_id": processor.tokenizer.pad_token_id, "vocab_size": len(processor.tokenizer), "activation_dropout": model_args.activation_dropout, }) # create model model = AutoModelForCTC.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir, config=config) # freeze encoder if model_args.freeze_feature_extractor: model.freeze_feature_extractor() # 5. Now we preprocess the datasets which includes loading the audio, resampling and padding # The following code should be cleaned up as soon as # https://github.com/huggingface/datasets/pull/2324 is merged # Preprocessing the datasets. # We need to read the audio files as arrays and tokenize the targets. # derive max & min input length for sample rate & max duration max_input_length = data_args.max_duration_in_seconds * processor.feature_extractor.sampling_rate min_input_length = data_args.min_duration_in_seconds * processor.feature_extractor.sampling_rate resampler = None if raw_datasets["train"][data_args.audio_column_name][0].split( ".")[-1] == "mp3": # TODO(PVP) - remove hard-coded 48_000 after audio feature is merged resampler = torchaudio.transforms.Resample( 48_000, processor.feature_extractor.sampling_rate) # Preprocessing the datasets. # We need to read the audio files as arrays and tokenize the targets. def prepare_dataset(batch): # load audio speech_array, sampling_rate = torchaudio.load( batch[data_args.audio_column_name]) speech_array = speech_array.squeeze() # if necessary resample audio if resampler is not None: # TODO(PVP) - remove hard-coded 48_000 after audio feature is merged speech_array = resampler(speech_array) sampling_rate = resampler.new_freq speech_array = speech_array.numpy() batch["input_values"] = processor( speech_array, sampling_rate=sampling_rate, truncate=True, max_length=max_input_length).input_values[0] # Setup the processor for targets with processor.as_target_processor(): batch["labels"] = processor(batch["target_text"]).input_ids return batch with training_args.main_process_first(desc="dataset map preprocessing"): vectorized_datasets = raw_datasets.map( prepare_dataset, remove_columns=raw_datasets["train"].column_names, num_proc=data_args.preprocessing_num_workers, desc="preprocess datasets", ) if min_input_length > 0.0: # filter data that is shorter than min_input_length vectorized_datasets = vectorized_datasets.filter( lambda data: len(data["input_values"]) > min_input_length, num_proc=data_args.preprocessing_num_workers, ) # 6. Next, we can prepare the training. # Let's use word error rate (WER) as our evaluation metric, # instantiate a data collator and the trainer # Define Metric during training wer_metric = load_metric("wer") if data_args.only_data_preprocessing: logger.info("Data preprocessing finished.") return def compute_metrics(pred): pred_logits = pred.predictions pred_ids = np.argmax(pred_logits, axis=-1) pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id pred_str = processor.batch_decode(pred_ids) # we do not want to group tokens when computing the metrics label_str = processor.batch_decode(pred.label_ids, group_tokens=False) wer = wer_metric.compute(predictions=pred_str, references=label_str) return {"wer": wer} # Instantiate custom data collator data_collator = DataCollatorCTCWithPadding(processor=processor) # Initialize Trainer trainer = Trainer( model=model, data_collator=data_collator, args=training_args, compute_metrics=compute_metrics, train_dataset=vectorized_datasets["train"] if training_args.do_train else None, eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, tokenizer=processor.feature_extractor, ) # 7. Finally, we can start training # Training if training_args.do_train: # use last checkpoint if exist if last_checkpoint is not None: checkpoint = last_checkpoint elif os.path.isdir(model_args.model_name_or_path): checkpoint = model_args.model_name_or_path else: checkpoint = None # Save the feature_extractor and the tokenizer if is_main_process(training_args.local_rank): processor.save_pretrained(training_args.output_dir) train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(vectorized_datasets["train"])) metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"])) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate() max_eval_samples = (data_args.max_eval_samples if data_args.max_eval_samples is not None else len( vectorized_datasets["eval"])) metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"])) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # Write model card and (optionally) push to hub kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": "speech-recognition", "tags": ["automatic-speech-recognition", data_args.dataset_name], "dataset_args": f"Config: {data_args.dataset_config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}", "dataset": f"{data_args.dataset_name.upper()} - {data_args.dataset_config_name.upper()}", } if "common_voice" in data_args.dataset_name: kwargs["language"] = data_args.dataset_config_name if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: trainer.create_model_card(**kwargs) return results
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() logger.info("Training/evaluation parameters %s", training_args) # Set seed before initializing model. set_seed(training_args.seed) # 1. First, let's load the dataset raw_datasets = IterableDatasetDict() raw_column_names = {} def load_streaming_dataset(split, sampling_rate, **kwargs): if "+" in split: dataset_splits = [ load_dataset(split=split_name, **kwargs) for split_name in split.split("+") ] # `features` and `cast_column` won't be available after interleaving, so we'll use them here features = dataset_splits[0].features # make sure that the dataset decodes audio with a correct sampling rate dataset_splits = [ dataset.cast_column( data_args.audio_column_name, datasets.features.Audio(sampling_rate=sampling_rate)) for dataset in dataset_splits ] interleaved_dataset = interleave_datasets(dataset_splits) return interleaved_dataset, features else: dataset = load_dataset(split=split, **kwargs) features = dataset.features # make sure that the dataset decodes audio with a correct sampling rate dataset = dataset.cast_column( data_args.audio_column_name, datasets.features.Audio(sampling_rate=sampling_rate)) return dataset, features # `datasets` takes care of automatically loading and resampling the audio, # so we just need to set the correct target sampling rate and normalize the input # via the `feature_extractor` feature_extractor = AutoFeatureExtractor.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token) if training_args.do_train: raw_datasets["train"], train_features = load_streaming_dataset( path=data_args.dataset_name, name=data_args.dataset_config_name, split=data_args.train_split_name, use_auth_token=data_args.use_auth_token, streaming=True, sampling_rate=feature_extractor.sampling_rate, ) raw_column_names["train"] = list(train_features.keys()) if data_args.audio_column_name not in raw_column_names["train"]: raise ValueError( f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'." " Make sure to set `--audio_column_name` to the correct audio column - one of" f" {', '.join(raw_column_names['train'])}.") if data_args.text_column_name not in raw_column_names["train"]: raise ValueError( f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--text_column_name` to the correct text column - one of " f"{', '.join(raw_column_names['train'])}.") if data_args.max_train_samples is not None: raw_datasets["train"] = raw_datasets["train"].take( range(data_args.max_train_samples)) if training_args.do_eval: raw_datasets["eval"], eval_features = load_streaming_dataset( path=data_args.dataset_name, name=data_args.dataset_config_name, split=data_args.eval_split_name, use_auth_token=data_args.use_auth_token, streaming=True, sampling_rate=feature_extractor.sampling_rate, ) raw_column_names["eval"] = list(eval_features.keys()) if data_args.max_eval_samples is not None: raw_datasets["eval"] = raw_datasets["eval"].take( range(data_args.max_eval_samples)) # 2. We remove some special characters from the datasets # that make training complicated and do not help in transcribing the speech # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic # that could be easily picked up by the model chars_to_ignore_regex = (f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None) text_column_name = data_args.text_column_name def remove_special_characters(batch): if chars_to_ignore_regex is not None: batch["target_text"] = re.sub( chars_to_ignore_regex, "", batch[text_column_name]).lower() + " " else: batch["target_text"] = batch[text_column_name].lower() + " " return batch with training_args.main_process_first( desc="dataset map special characters removal"): for split, dataset in raw_datasets.items(): raw_datasets[split] = dataset.map( remove_special_characters, ).remove_columns([text_column_name]) # 3. Next, let's load the config as we might need it to create # the tokenizer config = AutoConfig.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token) # 4. Now we can instantiate the tokenizer and model # Note for distributed training, the .from_pretrained methods guarantee that only # one local process can concurrently download model & vocab. tokenizer_name_or_path = model_args.tokenizer_name_or_path if tokenizer_name_or_path is None: raise ValueError( "Tokenizer has to be created before training in streaming mode. Please specify --tokenizer_name_or_path" ) # load feature_extractor and tokenizer tokenizer = AutoTokenizer.from_pretrained( tokenizer_name_or_path, config=config, use_auth_token=data_args.use_auth_token, ) # adapt config config.update({ "feat_proj_dropout": model_args.feat_proj_dropout, "attention_dropout": model_args.attention_dropout, "hidden_dropout": model_args.hidden_dropout, "final_dropout": model_args.final_dropout, "mask_time_prob": model_args.mask_time_prob, "mask_time_length": model_args.mask_time_length, "mask_feature_prob": model_args.mask_feature_prob, "mask_feature_length": model_args.mask_feature_length, "gradient_checkpointing": training_args.gradient_checkpointing, "layerdrop": model_args.layerdrop, "ctc_loss_reduction": model_args.ctc_loss_reduction, "pad_token_id": tokenizer.pad_token_id, "vocab_size": len(tokenizer), "activation_dropout": model_args.activation_dropout, }) # create model model = AutoModelForCTC.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, config=config, use_auth_token=data_args.use_auth_token, ) # freeze encoder if model_args.freeze_feature_encoder: model.freeze_feature_encoder() # 5. Now we preprocess the datasets including loading the audio, resampling and normalization audio_column_name = data_args.audio_column_name # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification phoneme_language = data_args.phoneme_language # Preprocessing the datasets. # We need to read the audio files as arrays and tokenize the targets. def prepare_dataset(batch): # load audio sample = batch[audio_column_name] inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"]) batch["input_values"] = inputs.input_values[0] batch["input_length"] = len(batch["input_values"]) # encode targets additional_kwargs = {} if phoneme_language is not None: additional_kwargs["phonemizer_lang"] = phoneme_language batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids return batch vectorized_datasets = IterableDatasetDict() with training_args.main_process_first(desc="dataset map preprocessing"): for split, dataset in raw_datasets.items(): vectorized_datasets[split] = ( dataset.map(prepare_dataset).remove_columns( raw_column_names[split] + ["target_text"]).with_format("torch")) if split == "train": vectorized_datasets[split] = vectorized_datasets[ split].shuffle( buffer_size=data_args.shuffle_buffer_size, seed=training_args.seed, ) # 6. Next, we can prepare the training. # Let's use word error rate (WER) as our evaluation metric, # instantiate a data collator and the trainer # Define evaluation metrics during training, *i.e.* word error rate, character error rate eval_metrics = { metric: load_metric(metric) for metric in data_args.eval_metrics } def compute_metrics(pred): pred_logits = pred.predictions pred_ids = np.argmax(pred_logits, axis=-1) pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id pred_str = tokenizer.batch_decode(pred_ids) # we do not want to group tokens when computing the metrics label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False) metrics = { k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items() } return metrics # Now save everything to be able to create a single processor later if is_main_process(training_args.local_rank): # save feature extractor, tokenizer and config feature_extractor.save_pretrained(training_args.output_dir) tokenizer.save_pretrained(training_args.output_dir) config.save_pretrained(training_args.output_dir) try: processor = AutoProcessor.from_pretrained(training_args.output_dir) except (OSError, KeyError): warnings.warn( "Loading a processor from a feature extractor config that does not" " include a `processor_class` attribute is deprecated and will be removed in v5. Please add the following " " attribute to your `preprocessor_config.json` file to suppress this warning: " " `'processor_class': 'Wav2Vec2Processor'`", FutureWarning, ) processor = Wav2Vec2Processor.from_pretrained(training_args.output_dir) # Instantiate custom data collator max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate data_collator = DataCollatorCTCWithPadding(processor=processor, max_length=max_input_length) # trainer callback to reinitialize and reshuffle the streamable datasets at the beginning of each epoch class ShuffleCallback(TrainerCallback): def on_epoch_begin(self, args, state, control, train_dataloader, **kwargs): if isinstance(train_dataloader.dataset, IterableDatasetShard): pass # set_epoch() is handled by the Trainer elif isinstance(train_dataloader.dataset, IterableDataset): train_dataloader.dataset.set_epoch( train_dataloader.dataset._epoch + 1) # Initialize Trainer trainer = Trainer( model=model, data_collator=data_collator, args=training_args, compute_metrics=compute_metrics, train_dataset=vectorized_datasets["train"] if training_args.do_train else None, eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, tokenizer=processor, callbacks=[ShuffleCallback()], ) # 7. Finally, we can start training # Training if training_args.do_train: # use last checkpoint if exist if last_checkpoint is not None: checkpoint = last_checkpoint elif os.path.isdir(model_args.model_name_or_path): checkpoint = model_args.model_name_or_path else: checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() metrics = train_result.metrics if data_args.max_train_samples: metrics["train_samples"] = data_args.max_train_samples trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate() if data_args.max_eval_samples: metrics["eval_samples"] = data_args.max_eval_samples trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # Write model card and (optionally) push to hub config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na" kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": "speech-recognition", "tags": ["automatic-speech-recognition", data_args.dataset_name], "dataset_args": (f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split:" f" {data_args.eval_split_name}"), "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}", } if "common_voice" in data_args.dataset_name: kwargs["language"] = config_name if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: trainer.create_model_card(**kwargs) return results
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_speech_recognition_ctc", model_args, data_args) # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() logger.info("Training/evaluation parameters %s", training_args) # Set seed before initializing model. set_seed(training_args.seed) # 1. First, let's load the dataset raw_datasets = DatasetDict() if training_args.do_train: raw_datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=data_args.train_split_name, use_auth_token=data_args.use_auth_token, ) if data_args.audio_column_name not in raw_datasets[ "train"].column_names: raise ValueError( f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'." " Make sure to set `--audio_column_name` to the correct audio column - one of" f" {', '.join(raw_datasets['train'].column_names)}.") if data_args.text_column_name not in raw_datasets[ "train"].column_names: raise ValueError( f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--text_column_name` to the correct text column - one of " f"{', '.join(raw_datasets['train'].column_names)}.") if data_args.max_train_samples is not None: raw_datasets["train"] = raw_datasets["train"].select( range(data_args.max_train_samples)) if training_args.do_eval: raw_datasets["eval"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=data_args.eval_split_name, use_auth_token=data_args.use_auth_token, ) if data_args.max_eval_samples is not None: raw_datasets["eval"] = raw_datasets["eval"].select( range(data_args.max_eval_samples)) # 2. We remove some special characters from the datasets # that make training complicated and do not help in transcribing the speech # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic # that could be easily picked up by the model chars_to_ignore_regex = (f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None) text_column_name = data_args.text_column_name def remove_special_characters(batch): if chars_to_ignore_regex is not None: batch["target_text"] = re.sub( chars_to_ignore_regex, "", batch[text_column_name]).lower() + " " else: batch["target_text"] = batch[text_column_name].lower() + " " return batch with training_args.main_process_first( desc="dataset map special characters removal"): raw_datasets = raw_datasets.map( remove_special_characters, remove_columns=[text_column_name], desc="remove special characters from datasets", ) # save special tokens for tokenizer word_delimiter_token = data_args.word_delimiter_token unk_token = data_args.unk_token pad_token = data_args.pad_token # 3. Next, let's load the config as we might need it to create # the tokenizer # load config config = AutoConfig.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token) # 4. Next, if no tokenizer file is defined, # we create the vocabulary of the model by extracting all unique characters from # the training and evaluation datasets # We need to make sure that only first rank saves vocabulary # make sure all processes wait until vocab is created tokenizer_name_or_path = model_args.tokenizer_name_or_path tokenizer_kwargs = {} if tokenizer_name_or_path is None: # save vocab in training output dir tokenizer_name_or_path = training_args.output_dir vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json") with training_args.main_process_first(): if training_args.overwrite_output_dir and os.path.isfile( vocab_file): try: os.remove(vocab_file) except OSError: # in shared file-systems it might be the case that # two processes try to delete the vocab file at the some time pass with training_args.main_process_first( desc="dataset map vocabulary creation"): if not os.path.isfile(vocab_file): os.makedirs(tokenizer_name_or_path, exist_ok=True) vocab_dict = create_vocabulary_from_data( raw_datasets, word_delimiter_token=word_delimiter_token, unk_token=unk_token, pad_token=pad_token, ) # save vocab dict to be loaded into tokenizer with open(vocab_file, "w") as file: json.dump(vocab_dict, file) # if tokenizer has just been created # it is defined by `tokenizer_class` if present in config else by `model_type` tokenizer_kwargs = { "config": config if config.tokenizer_class is not None else None, "tokenizer_type": config.model_type if config.tokenizer_class is None else None, "unk_token": unk_token, "pad_token": pad_token, "word_delimiter_token": word_delimiter_token, } # 5. Now we can instantiate the feature extractor, tokenizer and model # Note for distributed training, the .from_pretrained methods guarantee that only # one local process can concurrently download model & vocab. # load feature_extractor and tokenizer tokenizer = AutoTokenizer.from_pretrained( tokenizer_name_or_path, use_auth_token=data_args.use_auth_token, **tokenizer_kwargs, ) feature_extractor = AutoFeatureExtractor.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token) # adapt config config.update({ "feat_proj_dropout": model_args.feat_proj_dropout, "attention_dropout": model_args.attention_dropout, "hidden_dropout": model_args.hidden_dropout, "final_dropout": model_args.final_dropout, "mask_time_prob": model_args.mask_time_prob, "mask_time_length": model_args.mask_time_length, "mask_feature_prob": model_args.mask_feature_prob, "mask_feature_length": model_args.mask_feature_length, "gradient_checkpointing": training_args.gradient_checkpointing, "layerdrop": model_args.layerdrop, "ctc_loss_reduction": model_args.ctc_loss_reduction, "pad_token_id": tokenizer.pad_token_id, "vocab_size": len(tokenizer), "activation_dropout": model_args.activation_dropout, }) # create model model = AutoModelForCTC.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, config=config, use_auth_token=data_args.use_auth_token, ) # freeze encoder if model_args.freeze_feature_encoder: model.freeze_feature_encoder() # 6. Now we preprocess the datasets including loading the audio, resampling and normalization # Thankfully, `datasets` takes care of automatically loading and resampling the audio, # so that we just need to set the correct target sampling rate and normalize the input # via the `feature_extractor` # make sure that dataset decodes audio with correct sampling rate dataset_sampling_rate = next(iter(raw_datasets.values())).features[ data_args.audio_column_name].sampling_rate if dataset_sampling_rate != feature_extractor.sampling_rate: raw_datasets = raw_datasets.cast_column( data_args.audio_column_name, datasets.features.Audio( sampling_rate=feature_extractor.sampling_rate)) # derive max & min input length for sample rate & max duration max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate audio_column_name = data_args.audio_column_name num_workers = data_args.preprocessing_num_workers # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification phoneme_language = data_args.phoneme_language # Preprocessing the datasets. # We need to read the audio files as arrays and tokenize the targets. def prepare_dataset(batch): # load audio sample = batch[audio_column_name] inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"]) batch["input_values"] = inputs.input_values[0] batch["input_length"] = len(batch["input_values"]) # encode targets additional_kwargs = {} if phoneme_language is not None: additional_kwargs["phonemizer_lang"] = phoneme_language batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids return batch with training_args.main_process_first(desc="dataset map preprocessing"): vectorized_datasets = raw_datasets.map( prepare_dataset, remove_columns=next(iter(raw_datasets.values())).column_names, num_proc=num_workers, desc="preprocess datasets", ) def is_audio_in_length_range(length): return length > min_input_length and length < max_input_length # filter data that is shorter than min_input_length vectorized_datasets = vectorized_datasets.filter( is_audio_in_length_range, num_proc=num_workers, input_columns=["input_length"], ) # 7. Next, we can prepare the training. # Let's use word error rate (WER) as our evaluation metric, # instantiate a data collator and the trainer # Define evaluation metrics during training, *i.e.* word error rate, character error rate eval_metrics = { metric: evaluate.load(metric) for metric in data_args.eval_metrics } # for large datasets it is advised to run the preprocessing on a # single machine first with ``args.preprocessing_only`` since there will mostly likely # be a timeout when running the script in distributed mode. # In a second step ``args.preprocessing_only`` can then be set to `False` to load the # cached dataset if data_args.preprocessing_only: logger.info( f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}" ) return def compute_metrics(pred): pred_logits = pred.predictions pred_ids = np.argmax(pred_logits, axis=-1) pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id pred_str = tokenizer.batch_decode(pred_ids) # we do not want to group tokens when computing the metrics label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False) metrics = { k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items() } return metrics # Now save everything to be able to create a single processor later if is_main_process(training_args.local_rank): # save feature extractor, tokenizer and config feature_extractor.save_pretrained(training_args.output_dir) tokenizer.save_pretrained(training_args.output_dir) config.save_pretrained(training_args.output_dir) try: processor = AutoProcessor.from_pretrained(training_args.output_dir) except (OSError, KeyError): warnings.warn( "Loading a processor from a feature extractor config that does not" " include a `processor_class` attribute is deprecated and will be removed in v5. Please add the following " " attribute to your `preprocessor_config.json` file to suppress this warning: " " `'processor_class': 'Wav2Vec2Processor'`", FutureWarning, ) processor = Wav2Vec2Processor.from_pretrained(training_args.output_dir) # Instantiate custom data collator data_collator = DataCollatorCTCWithPadding(processor=processor) # Initialize Trainer trainer = Trainer( model=model, data_collator=data_collator, args=training_args, compute_metrics=compute_metrics, train_dataset=vectorized_datasets["train"] if training_args.do_train else None, eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, tokenizer=feature_extractor, ) # 8. Finally, we can start training # Training if training_args.do_train: # use last checkpoint if exist if last_checkpoint is not None: checkpoint = last_checkpoint elif os.path.isdir(model_args.model_name_or_path): checkpoint = model_args.model_name_or_path else: checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(vectorized_datasets["train"])) metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"])) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate() max_eval_samples = (data_args.max_eval_samples if data_args.max_eval_samples is not None else len( vectorized_datasets["eval"])) metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"])) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # Write model card and (optionally) push to hub config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na" kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": "speech-recognition", "tags": ["automatic-speech-recognition", data_args.dataset_name], "dataset_args": (f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split:" f" {data_args.eval_split_name}"), "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}", } if "common_voice" in data_args.dataset_name: kwargs["language"] = config_name if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: trainer.create_model_card(**kwargs) return results
def main(): raw_datasets = DatasetDict() if training_args.do_train: raw_datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config, split=data_args.train_split_name, use_auth_token=data_args.use_auth_token, ) if data_args.audio_column not in raw_datasets["train"].column_names: raise ValueError( f"--audio_column '{data_args.audio_column}' not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--audio_column` to the correct audio column - one of " f"{', '.join(raw_datasets['train'].column_names)}." ) if data_args.text_column not in raw_datasets["train"].column_names: raise ValueError( f"--text_column {data_args.text_column} not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--text_column` to the correct text column - one of " f"{', '.join(raw_datasets['train'].column_names)}." ) if data_args.max_train_samples is not None: raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples)) if training_args.do_eval: raw_datasets["eval"] = load_dataset( data_args.dataset_name, data_args.dataset_config, split=data_args.eval_split_name, use_auth_token=data_args.use_auth_token, ) if data_args.max_eval_samples is not None: raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples)) # 2. We remove some special characters from the datasets # that make training complicated and do not help in transcribing the speech # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic # that could be easily picked up by the model chars_to_ignore_regex = ( f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None ) text_column = data_args.text_column def remove_special_characters(batch): if chars_to_ignore_regex is not None: batch["target_text"] = ( re.sub(chars_to_ignore_regex, "", batch[text_column]).lower() + " " ) else: batch["target_text"] = batch[text_column].lower() + " " return batch with training_args.main_process_first(desc="dataset map special characters removal"): raw_datasets = raw_datasets.map( remove_special_characters, remove_columns=[text_column], desc="remove special characters from datasets", ) # save special tokens for tokenizer word_delimiter_token = data_args.word_delimiter_token unk = data_args.unk pad = data_args.pad # 3. Next, let's load the config as we might need it to create # the tokenizer # load config config = AutoConfig.from_pretrained( model_args.model_name, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token, ) # 4. Next, if no tokenizer file is defined, # we create the vocabulary of the model by extracting all unique characters from # the training and evaluation datasets # We need to make sure that only first rank saves vocabulary # make sure all processes wait until vocab is created tokenizer_name_or_path = model_args.tokenizer_name_or_path tokenizer_kw = {} if tokenizer_name_or_path is None: # save vocab in training output dir tokenizer_name_or_path = training_args.out_dir vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json") with training_args.main_process_first(): if training_args.overwrite_out_dir and os.path.isfile(vocab_file): os.remove(vocab_file) with training_args.main_process_first(desc="dataset map vocabulary creation"): if not os.path.isfile(vocab_file): os.makedirs(tokenizer_name_or_path, exist_ok=True) vocab_dict = create_vocabulary_from_data( raw_datasets, word_delimiter_token=word_delimiter_token, unk=unk, pad=pad, ) # save vocab dict to be loaded into tokenizer with open(vocab_file, "w") as file: json.dump(vocab_dict, file) # if tokenizer has just been created # it is defined by `tokenizer_class` if present in config else by `model_type` tokenizer_kw = { "config": config if config.tokenizer_class is not None else None, "tokenizer_type": config.model_type if config.tokenizer_class is None else None, "unk": unk, "pad": pad, "word_delimiter_token": word_delimiter_token, } tokenizer = AutoTokenizer.from_pretrained( tokenizer_name_or_path, use_auth_token=data_args.use_auth_token, **tokenizer_kw, ) feature_extractor = AutoFeatureExtractor.from_pretrained( model_args.model_name, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token, ) # adapt config config.update( { "feat_proj_dropout": model_args.feat_proj_dropout, "drop_attn": model_args.drop_attn, "drop": model_args.drop, "final_dropout": model_args.final_dropout, "mask_time_prob": model_args.mask_time_prob, "mask_time_length": model_args.mask_time_length, "mask_feature_prob": model_args.mask_feature_prob, "mask_feature_length": model_args.mask_feature_length, "grad_checkpoint": training_args.grad_checkpoint, "layerdrop": model_args.layerdrop, "ctc_loss_reduction": model_args.ctc_loss_reduction, "PAD": tokenizer.PAD, "s_vocab": len(tokenizer), "drop_act": model_args.drop_act, } ) # create model model = AutoModelForCTC.from_pretrained( model_args.model_name, cache_dir=model_args.cache_dir, config=config, use_auth_token=data_args.use_auth_token, ) # freeze encoder if model_args.freeze_feature_encoder: model.freeze_feature_encoder() # 6. Now we preprocess the datasets including loading the audio, resampling and normalization # Thankfully, `datasets` takes care of automatically loading and resampling the audio, # so that we just need to set the correct target sampling rate and normalize the input # via the `feature_extractor` # make sure that dataset decodes audio with correct sampling rate dataset_sampling_rate = ( next(iter(raw_datasets.values())).features[data_args.audio_column].sampling_rate ) if dataset_sampling_rate != feature_extractor.sampling_rate: raw_datasets = raw_datasets.cast_column( data_args.audio_column, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate), ) # derive max & min input length for sample rate & max duration max_input_length = data_args.max_duration * feature_extractor.sampling_rate min_input_length = data_args.min_duration * feature_extractor.sampling_rate audio_column = data_args.audio_column num_workers = data_args.num_workers # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification phoneme_language = data_args.phoneme_language # Preprocessing the datasets. # We need to read the audio files as arrays and tokenize the targets. def prepare_dataset(batch): # load audio sample = batch[audio_column] inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"]) batch["input_values"] = inputs.input_values[0] batch["input_length"] = len(batch["input_values"]) # encode targets additional_kw = {} if phoneme_language is not None: additional_kw["phonemizer_lang"] = phoneme_language batch["labels"] = tokenizer(batch["target_text"], **additional_kw).input_ids return batch with training_args.main_process_first(desc="dataset map preprocessing"): vectorized_datasets = raw_datasets.map( prepare_dataset, remove_columns=next(iter(raw_datasets.values())).column_names, num_proc=num_workers, desc="preprocess datasets", ) def is_audio_in_length_range(length): return length > min_input_length and length < max_input_length # filter data that is shorter than min_input_length vectorized_datasets = vectorized_datasets.filter( is_audio_in_length_range, num_proc=num_workers, input_columns=["input_length"], ) # 7. Next, we can prepare the training. # Let's use word error rate (WER) as our evaluation metric, # instantiate a data collator and the trainer # Define evaluation metrics during training, *i.e.* word error rate, character error rate eval_metrics = {metric: load_metric(metric) for metric in data_args.eval_metrics} # for large datasets it is advised to run the preprocessing on a # single machine first with ``args.preprocessing_only`` since there will mostly likely # be a timeout when running the script in distributed mode. # In a second step ``args.preprocessing_only`` can then be set to `False` to load the # cached dataset if data_args.preprocessing_only: logger.info( f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}" ) return def compute_metrics(pred): pred_logits = pred.predictions pred_ids = np.argmax(pred_logits, axis=-1) pred.label_ids[pred.label_ids == -100] = tokenizer.PAD pred_str = tokenizer.batch_decode(pred_ids) # we do not want to group tokens when computing the metrics label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False) metrics = { k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items() } return metrics # Now save everything to be able to create a single processor later if is_main_process(training_args.local_rank): # save feature extractor, tokenizer and config feature_extractor.save_pretrained(training_args.out_dir) tokenizer.save_pretrained(training_args.out_dir) config.save_pretrained(training_args.out_dir) try: processor = AutoProcessor.from_pretrained(training_args.out_dir) except (OSError, KeyError): warnings.warn( "Loading a processor from a feature extractor config that does not" " include a `processor_class` attribute is deprecated and will be removed in v5. Please add the following " " attribute to your `preprocessor_config.json` file to suppress this warning: " " `'processor_class': 'Wav2Vec2Processor'`", FutureWarning, ) processor = Wav2Vec2Processor.from_pretrained(training_args.out_dir) # Instantiate custom data collator data_collator = DataCollatorCTCWithPadding(processor=processor) # Initialize Trainer trainer = Trainer( model=model, data_collator=data_collator, args=training_args, compute_metrics=compute_metrics, train_dataset=vectorized_datasets["train"] if training_args.do_train else None, eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, tokenizer=feature_extractor, ) # 8. Finally, we can start training # Training if training_args.do_train: # use last checkpoint if exist if last_checkpoint is not None: checkpoint = last_checkpoint elif os.path.isdir(model_args.model_name): checkpoint = model_args.model_name else: checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() metrics = train_result.metrics max_train_samples = ( data_args.max_train_samples if data_args.max_train_samples is not None else len(vectorized_datasets["train"]) ) metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"])) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate() max_eval_samples = ( data_args.max_eval_samples if data_args.max_eval_samples is not None else len(vectorized_datasets["eval"]) ) metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"])) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # Write model card and (optionally) push to hub config_name = data_args.dataset_config if data_args.dataset_config is not None else "na" kw = { "finetuned_from": model_args.model_name, "tasks": "speech-recognition", "tags": ["automatic-speech-recognition", data_args.dataset_name], "dataset_args": f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}", "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}", } if "common_voice" in data_args.dataset_name: kw["language"] = config_name if training_args.push_to_hub: trainer.push_to_hub(**kw) else: trainer.create_model_card(**kw) return results
def __init__(self, model_name, hotwords=[]): self.processor = AutoProcessor.from_pretrained(model_name) self.model = AutoModelForCTC.from_pretrained(model_name) self.hotwords = hotwords