def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() logger.info("Training/evaluation parameters %s", training_args) # Set seed before initializing model. set_seed(training_args.seed) # 1. First, let's load the dataset raw_datasets = DatasetDict() raw_datasets["train"] = load_dataset(data_args.dataset_name, data_args.dataset_config_name, split=data_args.train_split_name) raw_datasets["eval"] = load_dataset(data_args.dataset_name, data_args.dataset_config_name, split=data_args.eval_split_name) if data_args.audio_column_name not in raw_datasets["train"].column_names: raise ValueError( f"--audio_column_name {data_args.audio_column_name} not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--audio_column_name` to the correct audio column - one of " f"{', '.join(raw_datasets['train'].column_names)}.") if data_args.text_column_name not in raw_datasets["train"].column_names: raise ValueError( f"--text_column_name {data_args.audio_column_name} not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--text_column_name` to the correct text column - one of " f"{', '.join(raw_datasets['train'].column_names)}.") # prepare dataset if data_args.max_train_samples is not None: raw_datasets["train"] = raw_datasets["train"].select( range(data_args.max_train_samples)) if data_args.max_eval_samples is not None: raw_datasets["eval"] = raw_datasets["eval"].select( range(data_args.max_eval_samples)) # 2. We remove some special characters from the datasets # that make training complicated and do not help in transcribing the speech # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic # that could be easily picked up by the model chars_to_ignore_regex = (f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None) def remove_special_characters(batch): if chars_to_ignore_regex is not None: batch["target_text"] = re.sub( chars_to_ignore_regex, "", batch[data_args.text_column_name]).lower() + " " else: batch["target_text"] = batch[ data_args.text_column_name].lower() + " " return batch with training_args.main_process_first( desc="dataset map special characters removal"): raw_datasets = raw_datasets.map( remove_special_characters, remove_columns=[data_args.text_column_name], desc="remove special characters from datasets", ) # 3. Next, we create the vocabulary of the model by extracting all unique characters from # the training and evaluation datasets # We need to make sure that only first rank saves vocabulary # make sure all processes wait until vocab is created with training_args.main_process_first( desc="dataset map vocabulary creation"): vocab_dict = create_vocabulary_from_data(raw_datasets) vocab_file = os.path.join(training_args.output_dir, "vocab.json") # save vocab dict to be loaded into tokenizer os.makedirs(training_args.output_dir, exist_ok=True) if training_args.overwrite_output_dir and os.path.isfile(vocab_file): os.remove(vocab_file) if not os.path.isfile(vocab_file): with open(vocab_file, "w") as vocab_file: json.dump(vocab_dict, vocab_file) # 4. Now we can instantiate the configuration, feature extractor, tokenizer and model # Note for distributed training, the .from_pretrained methods guarantee that only # one local process can concurrently download model & vocab. # load config config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir) # load feature_extractor, tokenizer and create processor tokenizer = AutoTokenizer.from_pretrained( training_args.output_dir, tokenizer_type=config.model_type, unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|", ) feature_extractor = AutoFeatureExtractor.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir) processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) # adapt config config.update({ "feat_proj_dropout": model_args.feat_proj_dropout, "attention_dropout": model_args.attention_dropout, "hidden_dropout": model_args.hidden_dropout, "final_dropout": model_args.final_dropout, "mask_time_prob": model_args.mask_time_prob, "gradient_checkpointing": training_args.gradient_checkpointing, "layerdrop": model_args.layerdrop, "ctc_loss_reduction": model_args.ctc_loss_reduction, "pad_token_id": processor.tokenizer.pad_token_id, "vocab_size": len(processor.tokenizer), "activation_dropout": model_args.activation_dropout, }) # create model model = AutoModelForCTC.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir, config=config) # freeze encoder if model_args.freeze_feature_extractor: model.freeze_feature_extractor() # 5. Now we preprocess the datasets which includes loading the audio, resampling and padding # The following code should be cleaned up as soon as # https://github.com/huggingface/datasets/pull/2324 is merged # Preprocessing the datasets. # We need to read the audio files as arrays and tokenize the targets. # derive max & min input length for sample rate & max duration max_input_length = data_args.max_duration_in_seconds * processor.feature_extractor.sampling_rate min_input_length = data_args.min_duration_in_seconds * processor.feature_extractor.sampling_rate resampler = None if raw_datasets["train"][data_args.audio_column_name][0].split( ".")[-1] == "mp3": # TODO(PVP) - remove hard-coded 48_000 after audio feature is merged resampler = torchaudio.transforms.Resample( 48_000, processor.feature_extractor.sampling_rate) # Preprocessing the datasets. # We need to read the audio files as arrays and tokenize the targets. def prepare_dataset(batch): # load audio speech_array, sampling_rate = torchaudio.load( batch[data_args.audio_column_name]) speech_array = speech_array.squeeze() # if necessary resample audio if resampler is not None: # TODO(PVP) - remove hard-coded 48_000 after audio feature is merged speech_array = resampler(speech_array) sampling_rate = resampler.new_freq speech_array = speech_array.numpy() batch["input_values"] = processor( speech_array, sampling_rate=sampling_rate, truncate=True, max_length=max_input_length).input_values[0] # Setup the processor for targets with processor.as_target_processor(): batch["labels"] = processor(batch["target_text"]).input_ids return batch with training_args.main_process_first(desc="dataset map preprocessing"): vectorized_datasets = raw_datasets.map( prepare_dataset, remove_columns=raw_datasets["train"].column_names, num_proc=data_args.preprocessing_num_workers, desc="preprocess datasets", ) if min_input_length > 0.0: # filter data that is shorter than min_input_length vectorized_datasets = vectorized_datasets.filter( lambda data: len(data["input_values"]) > min_input_length, num_proc=data_args.preprocessing_num_workers, ) # 6. Next, we can prepare the training. # Let's use word error rate (WER) as our evaluation metric, # instantiate a data collator and the trainer # Define Metric during training wer_metric = load_metric("wer") if data_args.only_data_preprocessing: logger.info("Data preprocessing finished.") return def compute_metrics(pred): pred_logits = pred.predictions pred_ids = np.argmax(pred_logits, axis=-1) pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id pred_str = processor.batch_decode(pred_ids) # we do not want to group tokens when computing the metrics label_str = processor.batch_decode(pred.label_ids, group_tokens=False) wer = wer_metric.compute(predictions=pred_str, references=label_str) return {"wer": wer} # Instantiate custom data collator data_collator = DataCollatorCTCWithPadding(processor=processor) # Initialize Trainer trainer = Trainer( model=model, data_collator=data_collator, args=training_args, compute_metrics=compute_metrics, train_dataset=vectorized_datasets["train"] if training_args.do_train else None, eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, tokenizer=processor.feature_extractor, ) # 7. Finally, we can start training # Training if training_args.do_train: # use last checkpoint if exist if last_checkpoint is not None: checkpoint = last_checkpoint elif os.path.isdir(model_args.model_name_or_path): checkpoint = model_args.model_name_or_path else: checkpoint = None # Save the feature_extractor and the tokenizer if is_main_process(training_args.local_rank): processor.save_pretrained(training_args.output_dir) train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(vectorized_datasets["train"])) metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"])) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate() max_eval_samples = (data_args.max_eval_samples if data_args.max_eval_samples is not None else len( vectorized_datasets["eval"])) metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"])) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # Write model card and (optionally) push to hub kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": "speech-recognition", "tags": ["automatic-speech-recognition", data_args.dataset_name], "dataset_args": f"Config: {data_args.dataset_config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}", "dataset": f"{data_args.dataset_name.upper()} - {data_args.dataset_config_name.upper()}", } if "common_voice" in data_args.dataset_name: kwargs["language"] = data_args.dataset_config_name if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: trainer.create_model_card(**kwargs) return results
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Detecting last checkpoint. last_checkpoint = None if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome." ) elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() logger.info("Training/evaluation parameters %s", training_args) # Set seed before initializing model. set_seed(training_args.seed) # 1. First, let's load the dataset raw_datasets = DatasetDict() if training_args.do_train: raw_datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=data_args.train_split_name ) if data_args.audio_column_name not in raw_datasets["train"].column_names: raise ValueError( f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--audio_column_name` to the correct audio column - one of " f"{', '.join(raw_datasets['train'].column_names)}." ) if data_args.text_column_name not in raw_datasets["train"].column_names: raise ValueError( f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--text_column_name` to the correct text column - one of " f"{', '.join(raw_datasets['train'].column_names)}." ) if data_args.max_train_samples is not None: raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples)) if training_args.do_eval: raw_datasets["eval"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=data_args.eval_split_name ) if data_args.max_eval_samples is not None: raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples)) # 2. We remove some special characters from the datasets # that make training complicated and do not help in transcribing the speech # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic # that could be easily picked up by the model chars_to_ignore_regex = ( f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None ) text_column_name = data_args.text_column_name def remove_special_characters(batch): if chars_to_ignore_regex is not None: batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower() + " " else: batch["target_text"] = batch[text_column_name].lower() + " " return batch with training_args.main_process_first(desc="dataset map special characters removal"): raw_datasets = raw_datasets.map( remove_special_characters, remove_columns=[text_column_name], desc="remove special characters from datasets", ) # save special tokens for tokenizer word_delimiter_token = data_args.word_delimiter_token unk_token = data_args.unk_token pad_token = data_args.pad_token # 3. Next, let's load the config as we might need it to create # the tokenizer # load config config = AutoConfig.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token ) # 4. Next, if no tokenizer file is defined, # we create the vocabulary of the model by extracting all unique characters from # the training and evaluation datasets # We need to make sure that only first rank saves vocabulary # make sure all processes wait until vocab is created tokenizer_name_or_path = model_args.tokenizer_name_or_path tokenizer_kwargs = {} if tokenizer_name_or_path is None: # save vocab in training output dir tokenizer_name_or_path = training_args.output_dir vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json") with training_args.main_process_first(): if training_args.overwrite_output_dir and os.path.isfile(vocab_file): os.remove(vocab_file) with training_args.main_process_first(desc="dataset map vocabulary creation"): if not os.path.isfile(vocab_file): os.makedirs(tokenizer_name_or_path, exist_ok=True) vocab_dict = create_vocabulary_from_data( raw_datasets, word_delimiter_token=word_delimiter_token, unk_token=unk_token, pad_token=pad_token, ) # save vocab dict to be loaded into tokenizer with open(vocab_file, "w") as file: json.dump(vocab_dict, file) # if tokenizer has just been created # it is defined by `tokenizer_class` if present in config else by `model_type` tokenizer_kwargs = { "config": config if config.tokenizer_class is not None else None, "tokenizer_type": config.model_type if config.tokenizer_class is None else None, "unk_token": unk_token, "pad_token": pad_token, "word_delimiter_token": word_delimiter_token, } # 5. Now we can instantiate the feature extractor, tokenizer and model # Note for distributed training, the .from_pretrained methods guarantee that only # one local process can concurrently download model & vocab. # load feature_extractor and tokenizer tokenizer = AutoTokenizer.from_pretrained( tokenizer_name_or_path, use_auth_token=data_args.use_auth_token, **tokenizer_kwargs, ) feature_extractor = AutoFeatureExtractor.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token ) # adapt config config.update( { "feat_proj_dropout": model_args.feat_proj_dropout, "attention_dropout": model_args.attention_dropout, "hidden_dropout": model_args.hidden_dropout, "final_dropout": model_args.final_dropout, "mask_time_prob": model_args.mask_time_prob, "mask_time_length": model_args.mask_time_length, "mask_feature_prob": model_args.mask_feature_prob, "mask_feature_length": model_args.mask_feature_length, "gradient_checkpointing": training_args.gradient_checkpointing, "layerdrop": model_args.layerdrop, "ctc_loss_reduction": model_args.ctc_loss_reduction, "pad_token_id": tokenizer.pad_token_id, "vocab_size": len(tokenizer), "activation_dropout": model_args.activation_dropout, } ) # create model model = AutoModelForCTC.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, config=config, use_auth_token=data_args.use_auth_token, ) # freeze encoder if model_args.freeze_feature_extractor: model.freeze_feature_extractor() # 6. Now we preprocess the datasets including loading the audio, resampling and normalization # Thankfully, `datasets` takes care of automatically loading and resampling the audio, # so that we just need to set the correct target sampling rate and normalize the input # via the `feature_extractor` # make sure that dataset decodes audio with correct sampling rate dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate if dataset_sampling_rate != feature_extractor.sampling_rate: raw_datasets = raw_datasets.cast_column( data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate) ) # derive max & min input length for sample rate & max duration max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate audio_column_name = data_args.audio_column_name num_workers = data_args.preprocessing_num_workers # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification phoneme_language = data_args.phoneme_language # Preprocessing the datasets. # We need to read the audio files as arrays and tokenize the targets. def prepare_dataset(batch): # load audio sample = batch[audio_column_name] inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"]) batch["input_values"] = inputs.input_values[0] batch["input_length"] = len(batch["input_values"]) # encode targets additional_kwargs = {} if phoneme_language is not None: additional_kwargs["phonemizer_lang"] = phoneme_language batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids return batch with training_args.main_process_first(desc="dataset map preprocessing"): vectorized_datasets = raw_datasets.map( prepare_dataset, remove_columns=next(iter(raw_datasets.values())).column_names, num_proc=num_workers, desc="preprocess datasets", ) def is_audio_in_length_range(length): return length > min_input_length and length < max_input_length # filter data that is shorter than min_input_length vectorized_datasets = vectorized_datasets.filter( is_audio_in_length_range, num_proc=num_workers, input_columns=["input_length"], ) # 7. Next, we can prepare the training. # Let's use word error rate (WER) as our evaluation metric, # instantiate a data collator and the trainer # Define evaluation metrics during training, *i.e.* word error rate, character error rate eval_metrics = {metric: load_metric(metric) for metric in data_args.eval_metrics} # for large datasets it is advised to run the preprocessing on a # single machine first with ``args.preprocessing_only`` since there will mostly likely # be a timeout when running the script in distributed mode. # In a second step ``args.preprocessing_only`` can then be set to `False` to load the # cached dataset if data_args.preprocessing_only: logger.info(f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}") return def compute_metrics(pred): pred_logits = pred.predictions pred_ids = np.argmax(pred_logits, axis=-1) pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id pred_str = tokenizer.batch_decode(pred_ids) # we do not want to group tokens when computing the metrics label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False) metrics = {k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items()} return metrics # Now create a single processor if is_main_process(training_args.local_rank): # save feature extractor, tokenizer and config feature_extractor.save_pretrained(training_args.output_dir) tokenizer.save_pretrained(training_args.output_dir) config.save_pretrained(training_args.output_dir) # load processor try: processor = AutoProcessor.from_pretrained(training_args.output_dir) except (OSError, KeyError): warnings.warn( "Loading a processor from a feature extractor config that does not" " include a `processor_class` attribute is deprecated and will be removed in v5. Please add the following " " attribute to your `preprocessor_config.json` file to suppress this warning: " " `'processor_class': 'Wav2Vec2Processor'`", FutureWarning, ) processor = Wav2Vec2Processor.from_pretrained(training_args.output_dir) # Instantiate custom data collator data_collator = DataCollatorCTCWithPadding(processor=processor) # Initialize Trainer trainer = Trainer( model=model, data_collator=data_collator, args=training_args, compute_metrics=compute_metrics, train_dataset=vectorized_datasets["train"] if training_args.do_train else None, eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, tokenizer=feature_extractor, ) # 8. Finally, we can start training # Training if training_args.do_train: # use last checkpoint if exist if last_checkpoint is not None: checkpoint = last_checkpoint elif os.path.isdir(model_args.model_name_or_path): checkpoint = model_args.model_name_or_path else: checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() metrics = train_result.metrics max_train_samples = ( data_args.max_train_samples if data_args.max_train_samples is not None else len(vectorized_datasets["train"]) ) metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"])) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate() max_eval_samples = ( data_args.max_eval_samples if data_args.max_eval_samples is not None else len(vectorized_datasets["eval"]) ) metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"])) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # Write model card and (optionally) push to hub config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na" kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": "speech-recognition", "tags": ["automatic-speech-recognition", data_args.dataset_name], "dataset_args": f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}", "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}", } if "common_voice" in data_args.dataset_name: kwargs["language"] = config_name if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: trainer.create_model_card(**kwargs) return results
def preprocess_dataset( dataset: datasets.DatasetDict, tokenizer: PreTrainedTokenizer, label_list: List[str], data_args, pad_token_id=-1, ): label_map = {label: i for i, label in enumerate(label_list)} def encode_batch(examples): features = defaultdict(list) for words, heads, deprels in zip(examples["tokens"], examples["head"], examples["deprel"]): # clean up i = 0 while i < len(heads): if heads[i] == "None": del words[i] del heads[i] del deprels[i] i += 1 tokens = [tokenizer.tokenize(w) for w in words] word_lengths = [len(w) for w in tokens] tokens_merged = [] list(map(tokens_merged.extend, tokens)) if 0 in word_lengths: continue # Filter out sequences that are too long if len(tokens_merged) >= (data_args.max_seq_length - 2): continue encoding = tokenizer( words, add_special_tokens=True, padding="max_length", truncation=True, max_length=data_args.max_seq_length, is_split_into_words=True, return_token_type_ids=True, return_attention_mask=True, ) input_ids = encoding["input_ids"] token_type_ids = encoding["token_type_ids"] attention_mask = encoding["attention_mask"] pad_item = [pad_token_id] # pad or truncate arc labels labels_arcs = [int(h) for h in heads] labels_arcs = labels_arcs + (data_args.max_seq_length - len(labels_arcs)) * pad_item # convert rel labels from map, pad or truncate if necessary labels_rels = [label_map[i.split(":")[0]] for i in deprels] labels_rels = labels_rels + (data_args.max_seq_length - len(labels_rels)) * pad_item # determine start indices of words, pad or truncate if necessary word_starts = np.cumsum([1] + word_lengths).tolist() word_starts = word_starts + (data_args.max_seq_length + 1 - len(word_starts)) * pad_item # sanity check lengths assert len(input_ids) == data_args.max_seq_length assert len(attention_mask) == data_args.max_seq_length assert len(token_type_ids) == data_args.max_seq_length assert len(labels_arcs) == data_args.max_seq_length assert len(labels_rels) == data_args.max_seq_length assert len(word_starts) == data_args.max_seq_length + 1 features["input_ids"].append(input_ids) features["attention_mask"].append(attention_mask) features["token_type_ids"].append(token_type_ids) features["word_starts"].append(word_starts) features["labels_arcs"].append(labels_arcs) features["labels_rels"].append(labels_rels) return dict(features) # Expects columns in all splits to be identical remove_columns = dataset.column_names["train"] dataset = dataset.map( encode_batch, batched=True, load_from_cache_file=not data_args.overwrite_cache, remove_columns=remove_columns, ) return dataset
def main(): # See all possible arguments in src/transformers/args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. args = parse_args() # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. accelerator = Accelerator() logger.info(accelerator.state, main_process_only=False) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() # set up weights and biases if available if is_wandb_available(): import wandb wandb.init(project=args.output_dir.split("/")[-1]) else: datasets.utils.logging.set_verbosity_error() transformers.utils.logging.set_verbosity_error() # If passed along, set the training seed now. if args.seed is not None: set_seed(args.seed) # Handle the repository creation if accelerator.is_main_process: if args.push_to_hub and not args.preprocessing_only: if args.hub_model_id is None: repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) else: repo_name = args.hub_model_id repo = Repository(args.output_dir, clone_from=repo_name) elif args.output_dir is not None: os.makedirs(args.output_dir, exist_ok=True) accelerator.wait_for_everyone() # 1. Download and create train, validation dataset # We load all dataset configuration and datset split pairs passed in # ``args.dataset_config_names`` and ``args.dataset_split_names`` datasets_splits = [] for dataset_config_name, train_split_name in zip(args.dataset_config_names, args.dataset_split_names): # load dataset dataset_split = load_dataset( args.dataset_name, dataset_config_name, split=train_split_name, cache_dir=args.cache_dir, ) datasets_splits.append(dataset_split) # Next, we concatenate all configurations and splits into a single training dataset raw_datasets = DatasetDict() if len(datasets_splits) > 1: raw_datasets["train"] = concatenate_datasets(datasets_splits).shuffle(seed=args.seed) else: raw_datasets["train"] = datasets_splits[0] # Take ``args.validation_split_percentage`` from the training dataset for the validation_split_percentage num_validation_samples = raw_datasets["train"].num_rows * args.validation_split_percentage // 100 if num_validation_samples == 0: raise ValueError( "`args.validation_split_percentage` is less than a single sample " f"for {len(raw_datasets['train'])} training samples. Increase " "`args.num_validation_split_percentage`. " ) raw_datasets["validation"] = raw_datasets["train"].select(range(num_validation_samples)) raw_datasets["train"] = raw_datasets["train"].select(range(num_validation_samples, raw_datasets["train"].num_rows)) # 2. Now we preprocess the datasets including loading the audio, resampling and normalization # Thankfully, `datasets` takes care of automatically loading and resampling the audio, # so that we just need to set the correct target sampling rate and normalize the input # via the `feature_extractor` feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(args.model_name_or_path) # make sure that dataset decodes audio with correct sampling rate raw_datasets = raw_datasets.cast_column( args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate) ) # only normalized-inputs-training is supported if not feature_extractor.do_normalize: raise ValueError( "Training is only supported for normalized inputs. Make sure ``feature_extractor.do_normalize == True``" ) # set max & min audio length in number of samples max_length = int(args.max_duration_in_seconds * feature_extractor.sampling_rate) min_length = int(args.min_duration_in_seconds * feature_extractor.sampling_rate) def prepare_dataset(batch): sample = batch[args.audio_column_name] inputs = feature_extractor( sample["array"], sampling_rate=sample["sampling_rate"], max_length=max_length, truncation=True ) batch["input_values"] = inputs.input_values[0] batch["input_length"] = len(inputs.input_values[0]) return batch # load via mapped files via path cache_file_names = None if args.train_cache_file_name is not None: cache_file_names = {"train": args.train_cache_file_name, "validation": args.validation_cache_file_name} # load audio files into numpy arrays with accelerator.main_process_first(): vectorized_datasets = raw_datasets.map( prepare_dataset, num_proc=args.preprocessing_num_workers, remove_columns=raw_datasets["train"].column_names, cache_file_names=cache_file_names, ) if min_length > 0.0: vectorized_datasets = vectorized_datasets.filter( lambda x: x > min_length, num_proc=args.preprocessing_num_workers, input_columns=["input_length"], ) vectorized_datasets = vectorized_datasets.remove_columns("input_length") # for large datasets it is advised to run the preprocessing on a # single machine first with ``args.preprocessing_only`` since there will mostly likely # be a timeout when running the script in distributed mode. # In a second step ``args.preprocessing_only`` can then be set to `False` to load the # cached dataset if args.preprocessing_only: return # 3. Load model config = Wav2Vec2Config.from_pretrained(args.model_name_or_path) # pretraining is only supported for "newer" stable layer norm architecture # apply_spec_augment has to be True, mask_feature_prob has to be 0.0 if not config.do_stable_layer_norm or config.feat_extract_norm != "layer": raise ValueError( "PreTraining is only supported for ``config.do_stable_layer_norm=True`` and" " ``config.feat_extract_norm='layer'" ) # initialize random model model = Wav2Vec2ForPreTraining(config) # Activate gradient checkpointing if needed if args.gradient_checkpointing: model.gradient_checkpointing_enable() # 4. Define data collator, optimizer and scheduler data_collator = DataCollatorForWav2Vec2Pretraining( model=model, feature_extractor=feature_extractor, pad_to_multiple_of=args.pad_to_multiple_of ) train_dataloader = DataLoader( vectorized_datasets["train"], shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size, ) eval_dataloader = DataLoader( vectorized_datasets["validation"], collate_fn=data_collator, batch_size=args.per_device_eval_batch_size ) # Optimizer optimizer = AdamW( list(model.parameters()), lr=args.learning_rate, betas=[args.adam_beta1, args.adam_beta2], eps=args.adam_epsilon, ) # Prepare everything with our `accelerator`. model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( model, optimizer, train_dataloader, eval_dataloader ) # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch else: args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps, num_training_steps=args.max_train_steps, ) # 5. Train total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(vectorized_datasets['train'])}") logger.info(f" Num Epochs = {args.num_train_epochs}") logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}") logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") completed_steps = 0 starting_epoch = 0 # Only show the progress bar once on each machine. progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 starting_epoch = 0 for epoch in range(starting_epoch, args.num_train_epochs): model.train() for step, batch in enumerate(train_dataloader): # compute num of losses num_losses = batch["mask_time_indices"].sum() sub_attention_mask = batch.pop("sub_attention_mask", None) sub_attention_mask = ( sub_attention_mask if sub_attention_mask is not None else torch.ones_like(batch["mask_time_indices"]) ) percent_masked = num_losses / sub_attention_mask.sum() # forward outputs = model(**batch) # divide loss by gradient accumulation steps since gradients # are accumulated for multiple backward passes in PyTorch loss = outputs.loss / args.gradient_accumulation_steps accelerator.backward(loss) # make sure that `num_losses` is summed for distributed training # and average gradients over losses of all devices if accelerator.state.num_processes > 1: num_losses = accelerator.gather(num_losses).sum() gradient_multiplier = accelerator.state.num_processes / num_losses multiply_grads(model.module.parameters(), gradient_multiplier) else: multiply_grads(model.parameters(), 1 / num_losses) # update step if (step + 1) % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: # compute grad norm for monitoring scale = ( accelerator.scaler._scale.item() if hasattr(accelerator, "scaler") and accelerator.scaler is not None else 1 ) if accelerator.state.num_processes > 1: grad_norm = get_grad_norm(model.module.parameters(), scale) else: grad_norm = get_grad_norm(model.parameters(), scale) # update parameters optimizer.step() optimizer.zero_grad() if not accelerator.optimizer_step_was_skipped: lr_scheduler.step() elif accelerator.is_local_main_process: progress_bar.write( f"Gradients have overflown - skipping update step... Updating gradient scale to {scale}..." ) # update gumbel temperature gumbel_temperature = max( args.max_gumbel_temperature * args.gumbel_temperature_decay**completed_steps, args.min_gumbel_temperature, ) if hasattr(model, "module"): model.module.set_gumbel_temperature(gumbel_temperature) else: model.set_gumbel_temperature(gumbel_temperature) progress_bar.update(1) completed_steps += 1 # 6. Log all results if (step + 1) % (args.gradient_accumulation_steps * args.logging_steps) == 0: loss.detach() outputs.contrastive_loss.detach() outputs.diversity_loss.detach() if accelerator.state.num_processes > 1: loss = accelerator.gather(loss).sum() outputs.contrastive_loss = accelerator.gather(outputs.contrastive_loss).sum() outputs.diversity_loss = accelerator.gather(outputs.diversity_loss).sum() percent_masked = accelerator.gather(percent_masked).sum() train_logs = { "loss": (loss * args.gradient_accumulation_steps) / num_losses, "constrast_loss": outputs.contrastive_loss / num_losses, "div_loss": outputs.diversity_loss / num_losses, "%_mask_idx": percent_masked / accelerator.num_processes, "ppl": outputs.codevector_perplexity, "lr": torch.tensor(optimizer.param_groups[0]["lr"]), "temp": torch.tensor(gumbel_temperature), "grad_norm": torch.tensor(grad_norm), } log_str = "" for k, v in train_logs.items(): log_str += "| {}: {:.3e}".format(k, v.item()) if accelerator.is_local_main_process: progress_bar.write(log_str) if is_wandb_available(): wandb.log(train_logs) # save model every `args.saving_steps` steps if (step + 1) % (args.gradient_accumulation_steps * args.saving_steps) == 0: if (args.push_to_hub and epoch < args.num_train_epochs - 1) or args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained( args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save ) if (args.push_to_hub and epoch < args.num_train_epochs - 1) and accelerator.is_main_process: repo.push_to_hub( commit_message=f"Training in progress step {completed_steps}", blocking=False, auto_lfs_prune=True, ) # if completed steps > `args.max_train_steps` stop if completed_steps >= args.max_train_steps: break # 7. Validate! model.eval() # init logs val_logs = { "val_loss": 0, "val_contrastive_loss": 0, "val_diversity_loss": 0, "val_num_losses": 0, } for step, batch in enumerate(eval_dataloader): with torch.no_grad(): batch.pop("sub_attention_mask", None) outputs = model(**batch) val_logs["val_loss"] += outputs.loss val_logs["val_contrastive_loss"] += outputs.contrastive_loss val_logs["val_diversity_loss"] += outputs.diversity_loss val_logs["val_num_losses"] += batch["mask_time_indices"].sum() # sum over devices in multi-processing if accelerator.num_processes > 1: val_logs = {k: accelerator.gather(v).sum() for k, v in val_logs.items()} val_logs = {k: v / val_logs["val_num_losses"] for k, v in val_logs.items()} log_str = "" for k, v in val_logs.items(): log_str += "| {}: {:.3e}".format(k, v.item()) if accelerator.is_local_main_process: progress_bar.write(log_str) if is_wandb_available(): wandb.log(val_logs) if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained( args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save ) if accelerator.is_main_process: if args.push_to_hub: repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() configure_logger(model_args, training_args) # Downloading and loading a dataset from the hub. datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) if "validation" not in datasets.keys(): # make sure only "validation" and "train" keys remain" datasets = DatasetDict() datasets["validation"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"{data_args.train_split_name}[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, ) datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"{data_args.train_split_name}[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, ) else: # make sure only "validation" and "train" keys remain" datasets = DatasetDict() datasets["validation"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split="validation", cache_dir=model_args.cache_dir, ) datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"{data_args.train_split_name}", cache_dir=model_args.cache_dir, ) # only normalized-inputs-training is supported feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, do_normalize=True ) def prepare_dataset(batch): # check that all files have the correct sampling rate batch["speech"], _ = librosa.load(batch[data_args.speech_file_column], sr=feature_extractor.sampling_rate) return batch # load audio files into numpy arrays vectorized_datasets = datasets.map( prepare_dataset, num_proc=data_args.preprocessing_num_workers, remove_columns=datasets["train"].column_names ) # filter audio files that are too long vectorized_datasets = vectorized_datasets.filter( lambda data: len(data["speech"]) < int(data_args.max_duration_in_seconds * feature_extractor.sampling_rate) ) def normalize(batch): return feature_extractor(batch["speech"], sampling_rate=feature_extractor.sampling_rate) # normalize and transform to `BatchFeatures` vectorized_datasets = vectorized_datasets.map( normalize, batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, remove_columns=vectorized_datasets["train"].column_names, ) # pretraining is only supported for "newer" stable layer norm architecture # apply_spec_augment has to be True, mask_feature_prob has to be 0.0 config = Wav2Vec2Config.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, gradient_checkpointing=model_args.gradient_checkpointing, ) if not config.do_stable_layer_norm or config.feat_extract_norm != "layer": raise ValueError( "PreTraining is only supported for ``config.do_stable_layer_norm=True`` and ``config.feat_extract_norm='layer'" ) model = FlaxWav2Vec2ForPreTraining( config, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype) ) data_collator = FlaxDataCollatorForWav2Vec2Pretraining( model=model, feature_extractor=feature_extractor, pad_to_multiple_of=data_args.pad_to_multiple_of ) # Enable tensorboard only on the master node has_tensorboard = is_tensorboard_available() if has_tensorboard and jax.process_index() == 0: try: from flax.metrics.tensorboard import SummaryWriter summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir)) except ImportError as ie: has_tensorboard = False logger.warning( f"Unable to display metrics through TensorBoard because some package are not installed: {ie}" ) else: logger.warning( "Unable to display metrics through TensorBoard because the package is not installed: " "Please run pip install tensorboard to enable." ) # Initialize our training rng = jax.random.PRNGKey(training_args.seed) dropout_rngs = jax.random.split(rng, jax.local_device_count()) gumbel_rngs = jax.random.split(rng, jax.local_device_count()) num_epochs = int(training_args.num_train_epochs) train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count() eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count() num_train_steps = len(vectorized_datasets["train"]) // train_batch_size * num_epochs # Create learning rate schedule warmup_fn = optax.linear_schedule( init_value=0.0, end_value=training_args.learning_rate, transition_steps=training_args.warmup_steps ) decay_fn = optax.linear_schedule( init_value=training_args.learning_rate, end_value=0, transition_steps=num_train_steps - training_args.warmup_steps, ) linear_decay_lr_schedule_fn = optax.join_schedules( schedules=[warmup_fn, decay_fn], boundaries=[training_args.warmup_steps] ) # We use Optax's "masking" functionality to not apply weight decay # to bias and LayerNorm scale parameters. decay_mask_fn returns a # mask boolean with the same structure as the parameters. # The mask is True for parameters that should be decayed. def decay_mask_fn(params): flat_params = traverse_util.flatten_dict(params) flat_mask = { path: (path[-1] != "bias" and path[-2:] not in [("layer_norm", "scale"), ("final_layer_norm", "scale")]) for path in flat_params } return traverse_util.unflatten_dict(flat_mask) # create adam optimizer adamw = optax.adamw( learning_rate=linear_decay_lr_schedule_fn, b1=training_args.adam_beta1, b2=training_args.adam_beta2, eps=training_args.adam_epsilon, weight_decay=training_args.weight_decay, mask=decay_mask_fn, ) # Setup train state and define training hyper-parameters state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=adamw) num_negatives = model.config.num_negatives contrastive_logits_temperature = model.config.contrastive_logits_temperature num_codevectors = model.config.num_codevectors_per_group * model.config.num_codevector_groups diversity_loss_weight = model.config.diversity_loss_weight # Define gradient update step fn def train_step(state, batch, dropout_rng, gumbel_rng): dropout_rng, new_dropout_rng = jax.random.split(dropout_rng) gumbel_rng, new_gumbel_rng = jax.random.split(gumbel_rng) def loss_fn(params): negative_indices = batch.pop("sampled_negative_indices") gumbel_temperature = jnp.clip( model_args.max_gumbel_temperature * model_args.gumbel_temperature_decay ** state.step, a_min=model_args.min_gumbel_temperature, ) outputs = state.apply_fn( **batch, gumbel_temperature=gumbel_temperature, params=params, dropout_rng=dropout_rng, gumbel_rng=gumbel_rng, train=True, ) contrastive_loss = compute_contrastive_loss( outputs.projected_quantized_states, outputs.projected_states, negative_indices, batch["mask_time_indices"], contrastive_logits_temperature, num_negatives, ) diversity_loss = (num_codevectors - outputs.codevector_perplexity) / num_codevectors loss = contrastive_loss + diversity_loss_weight * diversity_loss return loss grad_fn = jax.value_and_grad(loss_fn) loss, grad = grad_fn(state.params) grad = jax.lax.pmean(grad, "batch") new_state = state.apply_gradients(grads=grad) metrics = jax.lax.pmean( {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)}, axis_name="batch" ) return new_state, metrics, new_dropout_rng, new_gumbel_rng # Create parallel version of the train step p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,)) # Define eval fn def eval_step(params, batch): negative_indices = batch.pop("sampled_negative_indices") outputs = model(**batch, params=params, train=False) contrastive_loss = compute_contrastive_loss( outputs.projected_quantized_states, outputs.projected_states, negative_indices, batch["mask_time_indices"], contrastive_logits_temperature, num_negatives, ) diversity_loss = (num_codevectors - outputs.codevector_perplexity) / num_codevectors loss = contrastive_loss + diversity_loss_weight * diversity_loss # summarize metrics metrics = {"loss": loss.mean(), "codevector_perplexity": outputs.codevector_perplexity} metrics = jax.lax.pmean(metrics, axis_name="batch") return metrics p_eval_step = jax.pmap(eval_step, "batch", donate_argnums=(0,)) # Replicate the train state on each device state = jax_utils.replicate(state) train_time = 0 epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0) for epoch in epochs: # ======================== Training ================================ train_start = time.time() train_metrics = [] # Create sampling rng rng, input_rng = jax.random.split(rng) # Generate an epoch by shuffling sampling indices from the train dataset num_train_samples = len(vectorized_datasets["train"]) train_samples_idx = jax.random.permutation(input_rng, jnp.arange(num_train_samples)) train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size) # Gather the indexes for creating the batch and do a training step for i, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1)): samples = [vectorized_datasets["train"][int(idx)] for idx in batch_idx] model_inputs = data_collator(samples) model_inputs = shard(model_inputs.data) # Model forward state, train_metric, dropout_rngs, gumbel_rngs = p_train_step( state, model_inputs, dropout_rngs, gumbel_rngs ) train_metrics.append(train_metric) train_time += time.time() - train_start epochs.write( f"Epoch... ({epoch + 1}/{num_epochs} | Loss: {train_metric['loss'].mean()}, Learning Rate: {train_metric['learning_rate'].mean()})" ) # ======================== Evaluating ============================== num_eval_samples = len(vectorized_datasets["validation"]) eval_samples_idx = jnp.arange(num_eval_samples) eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size) eval_metrics = [] for i, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)): samples = [vectorized_datasets["validation"][int(idx)] for idx in batch_idx] model_inputs = data_collator(samples) # Model forward model_inputs = shard(model_inputs.data) metrics = p_eval_step(state.params, model_inputs) eval_metrics.append(metrics) # get eval metrics eval_metrics = get_metrics(eval_metrics) eval_metrics = jax.tree_map(jnp.mean, eval_metrics) # Update progress bar epochs.write( f"Epoch... ({epoch + 1}/{num_epochs} | Loss: {eval_metrics['loss']}, Perplexity: {eval_metrics['codevector_perplexity']})" ) # Save metrics if has_tensorboard and jax.process_index() == 0: cur_step = epoch * (len(vectorized_datasets["train"]) // train_batch_size) write_metric(summary_writer, train_metrics, eval_metrics, train_time, cur_step) # save checkpoint after each epoch and push checkpoint to the hub if jax.process_index() == 0: params = jax.device_get(jax.tree_map(lambda x: x[0], state.params)) model.save_pretrained(training_args.output_dir, params=params, push_to_hub=training_args.push_to_hub)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() logger.info("Training/evaluation parameters %s", training_args) # Set seed before initializing model. set_seed(training_args.seed) # 1. First, let's load the dataset raw_datasets = DatasetDict() task_name = data_args.task lang_id = data_args.language if task_name is None: raise ValueError("Set --task should be set to '<xtreme_s_task>' " "(e.g. 'fleurs-asr', 'mls', 'covost2', 'minds14') ") if lang_id is None: raise ValueError( "Set --language should be set to the language id of the sub dataset " "config to be used (e.g. 'pl', 'en.tr', 'fr-FR') or 'all'" " for multi-lingual fine-tuning.") if data_args.language_group is not None: if data_args.task != "fleurs-asr": raise ValueError( "--language_group should only be used with --task=fleurs-asr") if data_args.language != "all": raise ValueError( "--language_group should only be used with --language=all") if data_args.target_column_name is None: target_column_name = TASK_TO_TARGET_COLUMN_NAME[task_name] else: target_column_name = data_args.target_column_name # here we differentiate between tasks with text as the target and classification tasks is_text_target = target_column_name in ("transcription", "translation") config_name = ".".join([task_name.split("-")[0], lang_id]) if training_args.do_train: raw_datasets["train"] = load_dataset( data_args.dataset_name, config_name, split=data_args.train_split_name, use_auth_token=data_args.use_auth_token, cache_dir=model_args.cache_dir, ) if data_args.audio_column_name not in raw_datasets[ "train"].column_names: raise ValueError( f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--audio_column_name` to the correct audio column - one of " f"{', '.join(raw_datasets['train'].column_names)}.") if target_column_name not in raw_datasets["train"].column_names: raise ValueError( f"--target_column_name {target_column_name} not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--target_column_name` to the correct text column - one of " f"{', '.join(raw_datasets['train'].column_names)}.") if data_args.max_train_samples is not None: raw_datasets["train"] = raw_datasets["train"].select( range(data_args.max_train_samples)) if training_args.do_eval: raw_datasets["eval"] = load_dataset( data_args.dataset_name, config_name, split=data_args.eval_split_name, use_auth_token=data_args.use_auth_token, cache_dir=model_args.cache_dir, ) if data_args.max_eval_samples is not None: raw_datasets["eval"] = raw_datasets["eval"].select( range(data_args.max_eval_samples)) if training_args.do_predict: raw_datasets["predict"] = load_dataset( data_args.dataset_name, config_name, split=data_args.predict_split_name, use_auth_token=data_args.use_auth_token, cache_dir=model_args.cache_dir, ) if data_args.max_predict_samples is not None: raw_datasets["predict"] = raw_datasets["predict"].select( range(data_args.max_predict_samples)) lang_list = next(iter(raw_datasets.values())).features["lang_id"].names if not is_text_target: label_list = next(iter( raw_datasets.values())).features[target_column_name].names num_labels = len(label_list) num_workers = data_args.preprocessing_num_workers lang_group = data_args.language_group if lang_group is not None: with training_args.main_process_first(desc="language group filter"): lang_group_id = next(iter( raw_datasets.values())).features["lang_group_id"].str2int( lang_group) raw_datasets = raw_datasets.filter( lambda lang_group: lang_group == lang_group_id, num_proc=num_workers, input_columns=["lang_group_id"], ) # 2. We remove some special characters from the datasets # that make training complicated and do not help in transcribing the speech # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic # that could be easily picked up by the model chars_to_ignore_regex = (f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None) def remove_special_characters(batch): if chars_to_ignore_regex is not None: batch["target_text"] = re.sub( chars_to_ignore_regex, "", batch[target_column_name]).lower() + " " else: batch["target_text"] = batch[target_column_name].lower() + " " return batch if is_text_target: with training_args.main_process_first( desc="dataset map special characters removal"): raw_datasets = raw_datasets.map( remove_special_characters, remove_columns=[target_column_name], desc="remove special characters from datasets", ) # save special tokens for tokenizer word_delimiter_token = data_args.word_delimiter_token unk_token = data_args.unk_token pad_token = data_args.pad_token # 3. Next, let's load the config as we might need it to create # the tokenizer config = AutoConfig.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token) if is_text_target: # 4. (Optional, for ASR and translation) If no tokenizer file is defined, # we create the vocabulary of the model by extracting all unique characters from # the training and evaluation datasets # We need to make sure that only first rank saves vocabulary # make sure all processes wait until vocab is created tokenizer_name_or_path = model_args.tokenizer_name_or_path tokenizer_kwargs = {} if tokenizer_name_or_path is None: # save vocab in training output dir tokenizer_name_or_path = training_args.output_dir vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json") with training_args.main_process_first(): if training_args.overwrite_output_dir and os.path.isfile( vocab_file): os.remove(vocab_file) with training_args.main_process_first( desc="dataset map vocabulary creation"): if not os.path.isfile(vocab_file): os.makedirs(tokenizer_name_or_path, exist_ok=True) vocab_dict = create_vocabulary_from_data( raw_datasets, word_delimiter_token=word_delimiter_token, unk_token=unk_token, pad_token=pad_token, ) # save vocab dict to be loaded into tokenizer with open(vocab_file, "w") as file: json.dump(vocab_dict, file) # if tokenizer has just been created # it is defined by `tokenizer_class` if present in config else by `model_type` if not config.is_encoder_decoder: tokenizer_kwargs = { "config": config if config.tokenizer_class is not None else None, "tokenizer_type": config.model_type if config.tokenizer_class is None else None, "unk_token": unk_token, "pad_token": pad_token, "word_delimiter_token": word_delimiter_token, } else: tokenizer_kwargs = {} # 5. Now we can instantiate the feature extractor, tokenizer and model # Note for distributed training, the .from_pretrained methods guarantee that only # one local process can concurrently download model & vocab. # load feature_extractor and tokenizer if is_text_target: tokenizer = AutoTokenizer.from_pretrained( tokenizer_name_or_path, use_auth_token=data_args.use_auth_token, **tokenizer_kwargs, ) feature_extractor = AutoFeatureExtractor.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token) # adapt config # (speech translation requires pre-configured seq2seq models) if task_name != "covost2": config.update({ "feat_proj_dropout": model_args.feat_proj_dropout, "attention_dropout": model_args.attention_dropout, "hidden_dropout": model_args.hidden_dropout, "final_dropout": model_args.final_dropout, "mask_time_prob": model_args.mask_time_prob, "mask_time_length": model_args.mask_time_length, "mask_feature_prob": model_args.mask_feature_prob, "mask_feature_length": model_args.mask_feature_length, "gradient_checkpointing": training_args.gradient_checkpointing, "layerdrop": model_args.layerdrop, "ctc_zero_infinity": model_args.ctc_zero_infinity, "ctc_loss_reduction": model_args.ctc_loss_reduction, "activation_dropout": model_args.activation_dropout, }) if training_args.do_train: if is_text_target: config.pad_token_id = tokenizer.pad_token_id config.vocab_size = len(tokenizer) else: label_to_id = {v: i for i, v in enumerate(label_list)} config.label2id = label_to_id config.id2label = { id: label for label, id in label_to_id.items() } config.num_labels = num_labels # create model if target_column_name == "transcription": model = AutoModelForCTC.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, config=config, use_auth_token=data_args.use_auth_token, ) elif config.is_encoder_decoder: model = AutoModelForSpeechSeq2Seq.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, config=config, use_auth_token=data_args.use_auth_token, ) if model.config.decoder_start_token_id is None: raise ValueError( "Make sure that `config.decoder_start_token_id` is correctly defined" ) else: model = AutoModelForAudioClassification.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, config=config, use_auth_token=data_args.use_auth_token, ) # freeze encoder if model_args.freeze_feature_encoder: model.freeze_feature_encoder() # 6. Now we preprocess the datasets including loading the audio, resampling and normalization # Thankfully, `datasets` takes care of automatically loading and resampling the audio, # so that we just need to set the correct target sampling rate and normalize the input # via the `feature_extractor` # make sure that dataset decodes audio with correct sampling rate dataset_sampling_rate = next(iter(raw_datasets.values())).features[ data_args.audio_column_name].sampling_rate if dataset_sampling_rate != feature_extractor.sampling_rate: raw_datasets = raw_datasets.cast_column( data_args.audio_column_name, datasets.features.Audio( sampling_rate=feature_extractor.sampling_rate)) # derive max & min input length for sample rate & max duration max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate audio_column_name = data_args.audio_column_name # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification phoneme_language = data_args.phoneme_language # Preprocessing the datasets. # We need to read the audio files as arrays and tokenize the targets. def prepare_dataset(batch): # load audio sample = batch[audio_column_name] inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"]) batch["input_values"] = inputs.input_values[0] batch["length"] = len(batch["input_values"]) # encode targets additional_kwargs = {} if phoneme_language is not None: additional_kwargs["phonemizer_lang"] = phoneme_language if is_text_target: batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids else: batch["labels"] = batch[target_column_name] batch["lang"] = batch["lang_id"] return batch with training_args.main_process_first(desc="dataset map preprocessing"): vectorized_datasets = raw_datasets.map( prepare_dataset, remove_columns=next(iter(raw_datasets.values())).column_names, num_proc=num_workers, desc="preprocess datasets", ) if training_args.do_train: def is_audio_in_length_range(length): return length > min_input_length and length < max_input_length # filter data that is shorter than min_input_length vectorized_datasets["train"] = vectorized_datasets["train"].filter( is_audio_in_length_range, num_proc=num_workers, input_columns=["length"], ) # 7. Next, we can prepare for the training step. # Let's use the appropriate XTREME-S evaluation metric, # instantiate a data collator and the trainer # Define evaluation metrics during training, *i.e.* word error rate, character error rate eval_metric = load_metric("xtreme_s", task_name) # for large datasets it is advised to run the preprocessing on a # single machine first with ``args.preprocessing_only`` since there will mostly likely # be a timeout when running the script in distributed mode. # In a second step ``args.preprocessing_only`` can then be set to `False` to load the # cached dataset if data_args.preprocessing_only: logger.info( f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}" ) return def asr_logits_argmax(logits, labels): return logits.argmax(dim=-1) def compute_asr_metric(pred): pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id pred_str = tokenizer.batch_decode(pred.predictions) # we do not want to group tokens when computing the metrics label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False) metric = eval_metric.compute(predictions=pred_str, references=label_str) return metric def compute_classification_metric(pred): pred_ids = np.argmax(pred.predictions, axis=1) metric = eval_metric.compute(predictions=pred_ids, references=pred.label_ids) return metric # Now save everything to be able to create a single processor later if is_main_process(training_args.local_rank): # save feature extractor, tokenizer and config feature_extractor.save_pretrained(training_args.output_dir) if is_text_target: tokenizer.save_pretrained(training_args.output_dir) config.save_pretrained(training_args.output_dir) # wait until configs are saved in the main process before loading the processor if training_args.local_rank != -1: torch.distributed.barrier() if is_text_target: processor = AutoProcessor.from_pretrained(training_args.output_dir) else: processor = AutoFeatureExtractor.from_pretrained( training_args.output_dir) # Instantiate custom data collator data_collator = SpeechDataCollatorWithPadding(processor=processor, pad_labels=is_text_target) # Initialize Trainer if target_column_name == "translation": trainer = Seq2SeqTrainer( model=model, data_collator=data_collator, args=training_args, preprocess_logits_for_metrics=asr_logits_argmax if training_args.predict_with_generate else None, compute_metrics=compute_asr_metric if training_args.predict_with_generate else None, train_dataset=vectorized_datasets["train"] if training_args.do_train else None, eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, tokenizer=feature_extractor, ) else: trainer = Trainer( model=model, data_collator=data_collator, args=training_args, preprocess_logits_for_metrics=asr_logits_argmax if is_text_target else None, compute_metrics=compute_asr_metric if is_text_target else compute_classification_metric, train_dataset=vectorized_datasets["train"] if training_args.do_train else None, eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, tokenizer=feature_extractor, ) # 8. Finally, we can start training # Training if training_args.do_train: # use last checkpoint if exist if last_checkpoint is not None: checkpoint = last_checkpoint elif os.path.isdir(model_args.model_name_or_path): checkpoint = model_args.model_name_or_path else: checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(vectorized_datasets["train"])) metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"])) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation on the test set results = {} if training_args.do_predict: logger.info( f"*** Evaluating on the `{data_args.predict_split_name}` set ***") if data_args.per_lang_metrics: # separate the `test` dataset into language-specific subsets and compute metrics for each of them metrics = {} average_metrics = defaultdict(list) for lang_id in range(len(lang_list)): lang_name = lang_list[lang_id] with training_args.main_process_first( desc="per-language dataset filter"): lang_dataset = vectorized_datasets["predict"].filter( lambda lang: lang == lang_id, num_proc=num_workers, input_columns=["lang"], ) lang_metrics = trainer.evaluate(lang_dataset) redundant_metrics = [ "eval_runtime", "eval_samples_per_second", "eval_steps_per_second", "eval_epoch" ] for metric_name, value in lang_metrics.items(): average_metrics[metric_name].append(value) if metric_name not in redundant_metrics: metrics[f"{metric_name}_{lang_name}"] = value for metric_name, value in average_metrics.items(): metrics[metric_name] = np.mean(value) else: metrics = trainer.evaluate(vectorized_datasets["predict"]) max_predict_samples = (data_args.max_predict_samples if data_args.max_predict_samples is not None else len(vectorized_datasets["predict"])) metrics["predict_samples"] = min(max_predict_samples, len(vectorized_datasets["predict"])) # make sure that the `predict` metrics end up in the log history for the model card trainer.log(OrderedDict(sorted(metrics.items()))) trainer.log_metrics("predict", metrics) trainer.save_metrics("predict", metrics) # Write model card and (optionally) push to hub kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": task_name, "tags": [task_name, data_args.dataset_name], "dataset_args": f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}, Predict split: {data_args.predict_split_name}", "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}", "language": data_args.language, } if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: trainer.create_model_card(**kwargs) return results
def main(): raw_datasets = DatasetDict() if training_args.do_train: raw_datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config, split=data_args.train_split_name, use_auth_token=data_args.use_auth_token, ) if data_args.audio_column not in raw_datasets["train"].column_names: raise ValueError( f"--audio_column '{data_args.audio_column}' not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--audio_column` to the correct audio column - one of " f"{', '.join(raw_datasets['train'].column_names)}." ) if data_args.text_column not in raw_datasets["train"].column_names: raise ValueError( f"--text_column {data_args.text_column} not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--text_column` to the correct text column - one of " f"{', '.join(raw_datasets['train'].column_names)}." ) if data_args.max_train_samples is not None: raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples)) if training_args.do_eval: raw_datasets["eval"] = load_dataset( data_args.dataset_name, data_args.dataset_config, split=data_args.eval_split_name, use_auth_token=data_args.use_auth_token, ) if data_args.max_eval_samples is not None: raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples)) # 2. We remove some special characters from the datasets # that make training complicated and do not help in transcribing the speech # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic # that could be easily picked up by the model chars_to_ignore_regex = ( f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None ) text_column = data_args.text_column def remove_special_characters(batch): if chars_to_ignore_regex is not None: batch["target_text"] = ( re.sub(chars_to_ignore_regex, "", batch[text_column]).lower() + " " ) else: batch["target_text"] = batch[text_column].lower() + " " return batch with training_args.main_process_first(desc="dataset map special characters removal"): raw_datasets = raw_datasets.map( remove_special_characters, remove_columns=[text_column], desc="remove special characters from datasets", ) # save special tokens for tokenizer word_delimiter_token = data_args.word_delimiter_token unk = data_args.unk pad = data_args.pad # 3. Next, let's load the config as we might need it to create # the tokenizer # load config config = AutoConfig.from_pretrained( model_args.model_name, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token, ) # 4. Next, if no tokenizer file is defined, # we create the vocabulary of the model by extracting all unique characters from # the training and evaluation datasets # We need to make sure that only first rank saves vocabulary # make sure all processes wait until vocab is created tokenizer_name_or_path = model_args.tokenizer_name_or_path tokenizer_kw = {} if tokenizer_name_or_path is None: # save vocab in training output dir tokenizer_name_or_path = training_args.out_dir vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json") with training_args.main_process_first(): if training_args.overwrite_out_dir and os.path.isfile(vocab_file): os.remove(vocab_file) with training_args.main_process_first(desc="dataset map vocabulary creation"): if not os.path.isfile(vocab_file): os.makedirs(tokenizer_name_or_path, exist_ok=True) vocab_dict = create_vocabulary_from_data( raw_datasets, word_delimiter_token=word_delimiter_token, unk=unk, pad=pad, ) # save vocab dict to be loaded into tokenizer with open(vocab_file, "w") as file: json.dump(vocab_dict, file) # if tokenizer has just been created # it is defined by `tokenizer_class` if present in config else by `model_type` tokenizer_kw = { "config": config if config.tokenizer_class is not None else None, "tokenizer_type": config.model_type if config.tokenizer_class is None else None, "unk": unk, "pad": pad, "word_delimiter_token": word_delimiter_token, } tokenizer = AutoTokenizer.from_pretrained( tokenizer_name_or_path, use_auth_token=data_args.use_auth_token, **tokenizer_kw, ) feature_extractor = AutoFeatureExtractor.from_pretrained( model_args.model_name, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token, ) # adapt config config.update( { "feat_proj_dropout": model_args.feat_proj_dropout, "drop_attn": model_args.drop_attn, "drop": model_args.drop, "final_dropout": model_args.final_dropout, "mask_time_prob": model_args.mask_time_prob, "mask_time_length": model_args.mask_time_length, "mask_feature_prob": model_args.mask_feature_prob, "mask_feature_length": model_args.mask_feature_length, "grad_checkpoint": training_args.grad_checkpoint, "layerdrop": model_args.layerdrop, "ctc_loss_reduction": model_args.ctc_loss_reduction, "PAD": tokenizer.PAD, "s_vocab": len(tokenizer), "drop_act": model_args.drop_act, } ) # create model model = AutoModelForCTC.from_pretrained( model_args.model_name, cache_dir=model_args.cache_dir, config=config, use_auth_token=data_args.use_auth_token, ) # freeze encoder if model_args.freeze_feature_encoder: model.freeze_feature_encoder() # 6. Now we preprocess the datasets including loading the audio, resampling and normalization # Thankfully, `datasets` takes care of automatically loading and resampling the audio, # so that we just need to set the correct target sampling rate and normalize the input # via the `feature_extractor` # make sure that dataset decodes audio with correct sampling rate dataset_sampling_rate = ( next(iter(raw_datasets.values())).features[data_args.audio_column].sampling_rate ) if dataset_sampling_rate != feature_extractor.sampling_rate: raw_datasets = raw_datasets.cast_column( data_args.audio_column, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate), ) # derive max & min input length for sample rate & max duration max_input_length = data_args.max_duration * feature_extractor.sampling_rate min_input_length = data_args.min_duration * feature_extractor.sampling_rate audio_column = data_args.audio_column num_workers = data_args.num_workers # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification phoneme_language = data_args.phoneme_language # Preprocessing the datasets. # We need to read the audio files as arrays and tokenize the targets. def prepare_dataset(batch): # load audio sample = batch[audio_column] inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"]) batch["input_values"] = inputs.input_values[0] batch["input_length"] = len(batch["input_values"]) # encode targets additional_kw = {} if phoneme_language is not None: additional_kw["phonemizer_lang"] = phoneme_language batch["labels"] = tokenizer(batch["target_text"], **additional_kw).input_ids return batch with training_args.main_process_first(desc="dataset map preprocessing"): vectorized_datasets = raw_datasets.map( prepare_dataset, remove_columns=next(iter(raw_datasets.values())).column_names, num_proc=num_workers, desc="preprocess datasets", ) def is_audio_in_length_range(length): return length > min_input_length and length < max_input_length # filter data that is shorter than min_input_length vectorized_datasets = vectorized_datasets.filter( is_audio_in_length_range, num_proc=num_workers, input_columns=["input_length"], ) # 7. Next, we can prepare the training. # Let's use word error rate (WER) as our evaluation metric, # instantiate a data collator and the trainer # Define evaluation metrics during training, *i.e.* word error rate, character error rate eval_metrics = {metric: load_metric(metric) for metric in data_args.eval_metrics} # for large datasets it is advised to run the preprocessing on a # single machine first with ``args.preprocessing_only`` since there will mostly likely # be a timeout when running the script in distributed mode. # In a second step ``args.preprocessing_only`` can then be set to `False` to load the # cached dataset if data_args.preprocessing_only: logger.info( f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}" ) return def compute_metrics(pred): pred_logits = pred.predictions pred_ids = np.argmax(pred_logits, axis=-1) pred.label_ids[pred.label_ids == -100] = tokenizer.PAD pred_str = tokenizer.batch_decode(pred_ids) # we do not want to group tokens when computing the metrics label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False) metrics = { k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items() } return metrics # Now save everything to be able to create a single processor later if is_main_process(training_args.local_rank): # save feature extractor, tokenizer and config feature_extractor.save_pretrained(training_args.out_dir) tokenizer.save_pretrained(training_args.out_dir) config.save_pretrained(training_args.out_dir) try: processor = AutoProcessor.from_pretrained(training_args.out_dir) except (OSError, KeyError): warnings.warn( "Loading a processor from a feature extractor config that does not" " include a `processor_class` attribute is deprecated and will be removed in v5. Please add the following " " attribute to your `preprocessor_config.json` file to suppress this warning: " " `'processor_class': 'Wav2Vec2Processor'`", FutureWarning, ) processor = Wav2Vec2Processor.from_pretrained(training_args.out_dir) # Instantiate custom data collator data_collator = DataCollatorCTCWithPadding(processor=processor) # Initialize Trainer trainer = Trainer( model=model, data_collator=data_collator, args=training_args, compute_metrics=compute_metrics, train_dataset=vectorized_datasets["train"] if training_args.do_train else None, eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, tokenizer=feature_extractor, ) # 8. Finally, we can start training # Training if training_args.do_train: # use last checkpoint if exist if last_checkpoint is not None: checkpoint = last_checkpoint elif os.path.isdir(model_args.model_name): checkpoint = model_args.model_name else: checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() metrics = train_result.metrics max_train_samples = ( data_args.max_train_samples if data_args.max_train_samples is not None else len(vectorized_datasets["train"]) ) metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"])) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate() max_eval_samples = ( data_args.max_eval_samples if data_args.max_eval_samples is not None else len(vectorized_datasets["eval"]) ) metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"])) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # Write model card and (optionally) push to hub config_name = data_args.dataset_config if data_args.dataset_config is not None else "na" kw = { "finetuned_from": model_args.model_name, "tasks": "speech-recognition", "tags": ["automatic-speech-recognition", data_args.dataset_name], "dataset_args": f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}", "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}", } if "common_voice" in data_args.dataset_name: kw["language"] = config_name if training_args.push_to_hub: trainer.push_to_hub(**kw) else: trainer.create_model_card(**kw) return results
logits, labels = eval_preds predictions = np.argmax(logits, axis=-1) metrics = { 'accuracy': accuracy_score(labels, predictions), 'precision': precision_score(labels, predictions), 'recall': recall_score(labels, predictions), 'f1-score': f1_score(labels, predictions), } return metrics def tokenize_function(example): return tokenizer(example["text"], truncation=True) tokenized_dataset = raw_dataset.map(tokenize_function, batched=True) data_collator = DataCollatorWithPadding(tokenizer=tokenizer) training_args = TrainingArguments( output_dir=DETECT_RESULT, num_train_epochs=10, per_device_train_batch_size=8, per_device_eval_batch_size=16, warmup_steps=500, learning_rate=5e-5, weight_decay=0.01, evaluation_strategy='epoch', save_strategy="epoch", load_best_model_at_end=True, metric_for_best_model='f1-score', report_to="none",
def main(): args = parse_args() # this requires having preprocessed framenet # using `frame.cli:preprocess-framenet paths = data_paths(args.data) dataset = load_dataset('json', data_files=paths) metric = load_metric("rouge") train_test = dataset["train"].train_test_split(test_size=0.1) test_valid = train_test["test"].train_test_split(test_size=0.5) datasets = DatasetDict({ "train": train_test["train"], "test": test_valid["test"], "valid": test_valid["train"] }) tokenizer = AutoTokenizer.from_pretrained(args.model) # the family of t5 models expect input sentences to be prefixed with `"summarize: "` if args.model in ["t5-small", "t5-base", "t5-larg", "t5-3b", "t5-11b"]: prefix = "summarize: " else: prefix = "" # HuggingFace loves to use closures, I would prefer this # be refactored into the library, but going this route for simplicity def preprocess_function(examples): """Tokenize the data for Seq2Seq Maps over all the examples in the dataset to tokenize both the input framenet sentences and the target frame definitions. Args: examples: samples in the dataset """ inputs = [prefix + sent for sent in examples["sentence"]] model_inputs = tokenizer(inputs, max_length=args.max_input_length, truncation=True) with tokenizer.as_target_tokenizer(): labels = tokenizer(examples["frame_definition"], max_length=args.max_target_length, truncation=True) model_inputs["labels"] = labels["input_ids"] return model_inputs # again, with the closures - requires instance of the tokenizer def compute_metrics(eval_pred): predictions, labels = eval_pred decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True) # Replace -100 in the labels as we can't decode them. labels = np.where(labels != -100, labels, tokenizer.pad_token_id) decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) # Rouge expects a newline after each sentence decoded_preds = [ "\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds ] decoded_labels = [ "\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels ] result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True) # Extract a few results result = { key: value.mid.fmeasure * 100 for key, value in result.items() } # Add mean generated length prediction_lens = [ np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions ] result["gen_len"] = np.mean(prediction_lens) return {k: round(v, 4) for k, v in result.items()} tokenized_datasets = datasets.map(preprocess_function, batched=True) model = AutoModelForSeq2SeqLM.from_pretrained(args.model) data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) training_args = Seq2SeqTrainingArguments( "./results/summarization", evaluation_strategy="epoch", learning_rate=2e-5, per_device_train_batch_size=args.batch_size, per_device_eval_batch_size=args.batch_size, weight_decay=0.01, save_total_limit=3, num_train_epochs=args.epochs, predict_with_generate=True, fp16=True, ) trainer = Seq2SeqTrainer(model, training_args, train_dataset=tokenized_datasets["train"], eval_dataset=tokenized_datasets["valid"], data_collator=data_collator, tokenizer=tokenizer, compute_metrics=compute_metrics) trainer.train()
def main(): # 1. Parse input arguments # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() # 2. Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) transformers.utils.logging.set_verbosity(log_level) transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() logger.info("Training/evaluation parameters %s", training_args) # 3. Detecting last checkpoint and eventualy continue from last checkpoint last_checkpoint = None if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome." ) elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Set seed before initializing model. set_seed(training_args.seed) # 4. Load dataset raw_datasets = DatasetDict() if training_args.do_train: raw_datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=data_args.train_split_name ) if training_args.do_eval: raw_datasets["eval"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=data_args.eval_split_name ) if data_args.audio_column_name not in next(iter(raw_datasets.values())).column_names: raise ValueError( f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--audio_column_name` to the correct audio column - one of " f"{', '.join(next(iter(raw_datasets.values())).column_names)}." ) if data_args.text_column_name not in next(iter(raw_datasets.values())).column_names: raise ValueError( f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--text_column_name` to the correct text column - one of " f"{', '.join(next(iter(raw_datasets.values())).column_names)}." ) # 5. Load pretrained model, tokenizer, and feature extractor # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) feature_extractor = AutoFeatureExtractor.from_pretrained( model_args.feature_extractor_name if model_args.feature_extractor_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = AutoModelForSpeechSeq2Seq.from_pretrained( model_args.model_name_or_path, config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) if model.config.decoder_start_token_id is None: raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined") if model_args.freeze_feature_encoder: model.freeze_feature_encoder() # 6. Resample speech dataset if necassary dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate if dataset_sampling_rate != feature_extractor.sampling_rate: raw_datasets = raw_datasets.cast_column( data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate) ) # 7. Preprocessing the datasets. # We need to read the audio files as arrays and tokenize the targets. max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate audio_column_name = data_args.audio_column_name num_workers = data_args.preprocessing_num_workers text_column_name = data_args.text_column_name model_input_name = feature_extractor.model_input_names[0] do_lower_case = data_args.do_lower_case if data_args.max_train_samples is not None: raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples)) if data_args.max_eval_samples is not None: raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples)) def prepare_dataset(batch): # process audio sample = batch[audio_column_name] inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"]) # process audio length batch[model_input_name] = inputs.input_values[0] batch["input_length"] = len(batch["input_values"]) # process targets input_str = batch[text_column_name].lower() if do_lower_case else batch[text_column_name] batch["labels"] = tokenizer(input_str).input_ids return batch with training_args.main_process_first(desc="dataset map pre-processing"): vectorized_datasets = raw_datasets.map( prepare_dataset, remove_columns=next(iter(raw_datasets.values())).column_names, num_proc=data_args.preprocessing_num_workers, desc="preprocess train dataset", ) # filter data that is shorter than min_input_length or longer than # max_input_length def is_audio_in_length_range(length): return length > min_input_length and length < max_input_length vectorized_datasets = vectorized_datasets.filter( is_audio_in_length_range, num_proc=num_workers, input_columns=["input_length"], ) # for large datasets it is advised to run the preprocessing on a # single machine first with `args.preprocessing_only` since there will mostly likely # be a timeout when running the script in distributed mode. # In a second step `args.preprocessing_only` can then be set to `False` to load the # cached dataset if data_args.preprocessing_only: cache = {k: v.cache_files for k, v in vectorized_datasets.items()} logger.info(f"Data preprocessing finished. Files cached at {cache}.") return # 8. Load Metric metric = load_metric("wer") def compute_metrics(pred): pred_ids = pred.predictions pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True) # we do not want to group tokens when computing the metrics label_str = tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True) wer = metric.compute(predictions=pred_str, references=label_str) return {"wer": wer} # 9. Create a single speech processor if is_main_process(training_args.local_rank): # save feature extractor, tokenizer and config feature_extractor.save_pretrained(training_args.output_dir) tokenizer.save_pretrained(training_args.output_dir) config.save_pretrained(training_args.output_dir) processor = AutoProcessor.from_pretrained(training_args.output_dir) # 10. Define data collator data_collator = DataCollatorSpeechSeq2SeqWithPadding( processor=processor, decoder_start_token_id=model.config.decoder_start_token_id ) # 11. Initialize Trainer trainer = Seq2SeqTrainer( model=model, args=training_args, train_dataset=vectorized_datasets["train"] if training_args.do_train else None, eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, tokenizer=feature_extractor, data_collator=data_collator, compute_metrics=compute_metrics if training_args.predict_with_generate else None, ) # 12. Training if training_args.do_train: checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the feature extractor too for easy upload metrics = train_result.metrics max_train_samples = ( data_args.max_train_samples if data_args.max_train_samples is not None else len(vectorized_datasets["train"]) ) metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"])) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # 13. Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate( metric_key_prefix="eval", max_length=model.config.max_length, num_beams=model.config.num_beams ) max_eval_samples = ( data_args.max_eval_samples if data_args.max_eval_samples is not None else len(vectorized_datasets["eval"]) ) metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"])) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # 14. Write Training Stats kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "speech recognition"} if data_args.dataset_name is not None: kwargs["dataset_tags"] = data_args.dataset_name if data_args.dataset_config_name is not None: kwargs["dataset_args"] = data_args.dataset_config_name kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" else: kwargs["dataset"] = data_args.dataset_name if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: trainer.create_model_card(**kwargs) return results
def build_datasets( data_args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, cache_dir=None, skip_train=False, skip_eval=False) -> Tuple[Dataset, Dataset]: if skip_eval and skip_train: logger.warning("Both `skip_train` and `skip_eval` are set to True") json_path = data_args.data_json data_dir = data_args.load_data_from add_line_breaks = data_args.add_line_breaks break_token = data_args.line_break_token train_data, eval_data = None, None dataset = DatasetDict() if add_line_breaks: tokenizer.add_special_tokens(dict(additional_special_tokens=[break_token])) if json_path is not None: logger.info("Preprocessing new dataset from {}".format(json_path)) eval_split = data_args.eval_split save_dir = data_args.save_data_to dataset = load_dataset('json', data_files=[json_path], cache_dir=cache_dir) if eval_split < 1: dataset = dataset["train"].train_test_split(test_size=eval_split, shuffle=False) if save_dir is None: # Spend less time on preprocessing if skip_train: del dataset["train"] if skip_eval and "test" in dataset: del dataset["test"] if not data_args.skip_text_clean: normalize = partial(normalize_text, add_line_breaks=add_line_breaks, brk=break_token) dataset = dataset.map(normalize, input_columns='text') proc_kwargs = dict( batched=True, batch_size=data_args.tokenizer_batch_size, remove_columns=["text", "title"]) if "train" in dataset: proc_train = create_preprocess_fn( tokenizer, data_args.max_source_length, data_args.max_target_length) dataset["train"] = dataset["train"].map(proc_train, **proc_kwargs) if "test" in dataset: proc_eval = create_preprocess_fn( tokenizer, data_args.max_source_length, data_args.val_max_target_length) dataset["test"] = dataset["test"].map(proc_eval, **proc_kwargs) dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"]) save_dir = data_args.save_data_to if save_dir is not None: if not os.path.exists(save_dir): os.mkdir(save_dir) logger.info("Saving preprocessed dataset to {}".format(save_dir)) dataset.save_to_disk(save_dir) elif data_dir is not None: logger.info("Loading preprocessed dataset from {}".format(data_dir)) if skip_train: eval_data = load_from_disk(os.path.join(data_dir, "test")) elif skip_eval: train_data = load_from_disk(os.path.join(data_dir, "train")) else: dataset = load_from_disk(data_dir) else: raise AttributeError("You must provide either `--data_json` or `--load_data_from` argument.") if "train" in dataset: train_data = dataset["train"] if "test" in dataset: eval_data = dataset["test"] return train_data, eval_data
def main(): args = get_args() set_seed(args.seed) dataset = load_dataset("codeparrot/codecomplex", split="train") train_test = dataset.train_test_split(test_size=0.2) test_validation = train_test["test"].train_test_split(test_size=0.5) train_test_validation = DatasetDict({ "train": train_test["train"], "test": test_validation["train"], "valid": test_validation["test"], }) print("Loading tokenizer and model") tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt) tokenizer.pad_token = tokenizer.eos_token model = AutoModelForSequenceClassification.from_pretrained(args.model_ckpt, num_labels=7) model.config.pad_token_id = model.config.eos_token_id if args.freeze: for param in model.roberta.parameters(): param.requires_grad = False labels = ClassLabel(num_classes=7, names=list( set(train_test_validation["train"]["complexity"]))) def tokenize(example): inputs = tokenizer(example["src"], truncation=True, max_length=1024) label = labels.str2int(example["complexity"]) return { "input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "label": label, } tokenized_datasets = train_test_validation.map( tokenize, batched=True, remove_columns=train_test_validation["train"].column_names, ) data_collator = DataCollatorWithPadding(tokenizer=tokenizer) training_args = TrainingArguments( output_dir=args.output_dir, learning_rate=args.learning_rate, lr_scheduler_type=args.lr_scheduler_type, evaluation_strategy="epoch", save_strategy="epoch", logging_strategy="epoch", per_device_train_batch_size=args.batch_size, per_device_eval_batch_size=args.batch_size, num_train_epochs=args.num_epochs, gradient_accumulation_steps=args.gradient_accumulation_steps, weight_decay=0.01, metric_for_best_model="accuracy", run_name="complexity-java", report_to="wandb", ) trainer = Trainer( model=model, args=training_args, train_dataset=tokenized_datasets["train"], eval_dataset=tokenized_datasets["valid"], tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, ) print("Training...") trainer.add_callback(CustomCallback(trainer)) trainer.train()
def load_data( self, data: Tuple[str, Union[str, List[str]], Union[str, List[str]]], dataset: Optional[Any] = None, columns: Union[List[str], Tuple[str]] = ("input_ids", "attention_mask", "labels"), ) -> Union[Sequence[Mapping[str, Any]]]: file, input, target = data data_files = {} stage = self.running_stage.value data_files[stage] = str(file) # FLASH_TESTING is set in the CI to run faster. if flash._IS_TESTING and not torch.cuda.is_available(): try: dataset_dict = DatasetDict({ stage: load_dataset(self.filetype, data_files=data_files, split=[f'{stage}[:20]'])[0] }) except Exception: dataset_dict = load_dataset(self.filetype, data_files=data_files) else: dataset_dict = load_dataset(self.filetype, data_files=data_files) if not self.predicting: if isinstance(target, List): # multi-target dataset_dict = dataset_dict.map( partial(self._multilabel_target, target)) dataset.num_classes = len(target) self.set_state(LabelsState(target)) else: if self.training: labels = list( sorted(list(set(dataset_dict[stage][target])))) dataset.num_classes = len(labels) self.set_state(LabelsState(labels)) labels = self.get_state(LabelsState) # convert labels to ids if labels is not None: labels = labels.labels label_to_class_mapping = { v: k for k, v in enumerate(labels) } dataset_dict = dataset_dict.map( partial(self._transform_label, label_to_class_mapping, target)) # Hugging Face models expect target to be named ``labels``. if target != "labels": dataset_dict.rename_column_(target, "labels") dataset_dict = dataset_dict.map(partial(self._tokenize_fn, input=input), batched=True) dataset_dict.set_format("torch", columns=columns) return dataset_dict[stage]
def main(): raw_datasets = DatasetDict() if training_args.do_train: raw_datasets["train"] = load_dataset(data_args.dataset_name, data_args.dataset_config, split=data_args.train_split_name) if training_args.do_eval: raw_datasets["eval"] = load_dataset(data_args.dataset_name, data_args.dataset_config, split=data_args.eval_split_name) if data_args.audio_column not in next(iter( raw_datasets.values())).column_names: raise ValueError( f"--audio_column '{data_args.audio_column}' not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--audio_column` to the correct audio column - one of " f"{', '.join(next(iter(raw_datasets.values())).column_names)}.") if data_args.text_column not in next(iter( raw_datasets.values())).column_names: raise ValueError( f"--text_column {data_args.text_column} not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--text_column` to the correct text column - one of " f"{', '.join(next(iter(raw_datasets.values())).column_names)}.") # 5. Load pretrained model, tokenizer, and feature extractor # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name, cache_dir=model_args.cache_dir, revision=model_args.model_version, use_auth_token=True if model_args.use_auth_token else None, ) feature_extractor = AutoFeatureExtractor.from_pretrained( model_args.feature_extractor if model_args.feature_extractor else model_args.model_name, cache_dir=model_args.cache_dir, revision=model_args.model_version, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_version, use_auth_token=True if model_args.use_auth_token else None, ) model = AutoModelForSpeechSeq2Seq.from_pretrained( model_args.model_name, config=config, cache_dir=model_args.cache_dir, revision=model_args.model_version, use_auth_token=True if model_args.use_auth_token else None, ) if model.config.dec_START is None: raise ValueError( "Make sure that `config.dec_START` is correctly defined") if model_args.freeze_feature_encoder: model.freeze_feature_encoder() # 6. Resample speech dataset if necassary dataset_sampling_rate = (next(iter( raw_datasets.values())).features[data_args.audio_column].sampling_rate) if dataset_sampling_rate != feature_extractor.sampling_rate: raw_datasets = raw_datasets.cast_column( data_args.audio_column, datasets.features.Audio( sampling_rate=feature_extractor.sampling_rate), ) # 7. Preprocessing the datasets. # We need to read the audio files as arrays and tokenize the targets. max_input_length = data_args.max_duration * feature_extractor.sampling_rate min_input_length = data_args.min_duration * feature_extractor.sampling_rate audio_column = data_args.audio_column num_workers = data_args.num_workers text_column = data_args.text_column model_input_name = feature_extractor.model_input_names[0] lower_case = data_args.lower_case if data_args.max_train_samples is not None: raw_datasets["train"] = raw_datasets["train"].select( range(data_args.max_train_samples)) if data_args.max_eval_samples is not None: raw_datasets["eval"] = raw_datasets["eval"].select( range(data_args.max_eval_samples)) def prepare_dataset(batch): # process audio sample = batch[audio_column] inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"]) # process audio length batch[model_input_name] = inputs.input_values[0] batch["input_length"] = len(batch["input_values"]) # process targets input_str = batch[text_column].lower( ) if lower_case else batch[text_column] batch["labels"] = tokenizer(input_str).input_ids return batch with training_args.main_process_first(desc="dataset map pre-processing"): vectorized_datasets = raw_datasets.map( prepare_dataset, remove_columns=next(iter(raw_datasets.values())).column_names, num_proc=data_args.num_workers, desc="preprocess train dataset", ) # filter data that is shorter than min_input_length or longer than # max_input_length def is_audio_in_length_range(length): return length > min_input_length and length < max_input_length vectorized_datasets = vectorized_datasets.filter( is_audio_in_length_range, num_proc=num_workers, input_columns=["input_length"], ) # for large datasets it is advised to run the preprocessing on a # single machine first with `args.preprocessing_only` since there will mostly likely # be a timeout when running the script in distributed mode. # In a second step `args.preprocessing_only` can then be set to `False` to load the # cached dataset if data_args.preprocessing_only: cache = {k: v.cache_files for k, v in vectorized_datasets.items()} logger.info(f"Data preprocessing finished. Files cached at {cache}.") return # 8. Load Metric metric = load_metric("wer") def compute_metrics(pred): pred_ids = pred.predictions pred.label_ids[pred.label_ids == -100] = tokenizer.PAD pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True) # we do not want to group tokens when computing the metrics label_str = tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True) wer = metric.compute(predictions=pred_str, references=label_str) return {"wer": wer} # 9. Create a single speech processor if is_main_process(training_args.local_rank): # save feature extractor, tokenizer and config feature_extractor.save_pretrained(training_args.out_dir) tokenizer.save_pretrained(training_args.out_dir) config.save_pretrained(training_args.out_dir) processor = AutoProcessor.from_pretrained(training_args.out_dir) # 10. Define data collator data_collator = DataCollatorSpeechSeq2SeqWithPadding( processor=processor, dec_START=model.config.dec_START) # 11. Initialize Trainer trainer = Seq2SeqTrainer( model=model, args=training_args, train_dataset=vectorized_datasets["train"] if training_args.do_train else None, eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, tokenizer=feature_extractor, data_collator=data_collator, compute_metrics=compute_metrics if training_args.test_with_gen else None, ) # 12. Training if training_args.do_train: checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the feature extractor too for easy upload metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(vectorized_datasets["train"])) metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"])) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # 13. Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate( metric_key_prefix="eval", max_len=model.config.max_len, n_beams=model.config.n_beams, ) max_eval_samples = (data_args.max_eval_samples if data_args.max_eval_samples is not None else len( vectorized_datasets["eval"])) metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"])) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # 14. Write Training Stats kw = { "finetuned_from": model_args.model_name, "tasks": "speech recognition" } if data_args.dataset_name is not None: kw["dataset_tags"] = data_args.dataset_name if data_args.dataset_config is not None: kw["dataset_args"] = data_args.dataset_config kw["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config}" else: kw["dataset"] = data_args.dataset_name if training_args.push_to_hub: trainer.push_to_hub(**kw) else: trainer.create_model_card(**kw) return results
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() configure_logger(model_args, training_args) # Downloading and loading a dataset from the hub. datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) if "validation" not in datasets.keys(): # make sure only "validation" and "train" keys remain" datasets = DatasetDict() datasets["validation"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split= f"{data_args.train_split_name}[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, ) datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split= f"{data_args.train_split_name}[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, ) else: # make sure only "validation" and "train" keys remain" datasets = DatasetDict() datasets["validation"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split="validation", cache_dir=model_args.cache_dir, ) datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"{data_args.train_split_name}", cache_dir=model_args.cache_dir, ) # only normalized-inputs-training is supported feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, do_normalize=True) def prepare_dataset(batch): # check that all files have the correct sampling rate batch["speech"], _ = librosa.load(batch[data_args.speech_file_column], sr=feature_extractor.sampling_rate) return batch # load audio files into numpy arrays vectorized_datasets = datasets.map( prepare_dataset, num_proc=data_args.preprocessing_num_workers, remove_columns=datasets["train"].column_names) # filter audio files that are too long vectorized_datasets = vectorized_datasets.filter(lambda data: len(data[ "speech"]) < int(data_args.max_duration_in_seconds * feature_extractor. sampling_rate)) def normalize(batch): return feature_extractor(batch["speech"], sampling_rate=feature_extractor.sampling_rate) # normalize and transform to `BatchFeatures` vectorized_datasets = vectorized_datasets.map( normalize, batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, remove_columns=vectorized_datasets["train"].column_names, ) # pretraining is only supported for "newer" stable layer norm architecture # apply_spec_augment has to be True, mask_feature_prob has to be 0.0 config = Wav2Vec2Config.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, gradient_checkpointing=model_args.gradient_checkpointing, ) if not config.do_stable_layer_norm or config.feat_extract_norm != "layer": raise ValueError( "PreTraining is only supported for ``config.do_stable_layer_norm=True`` and ``config.feat_extract_norm='layer'" ) model = Wav2Vec2ForPreTraining(config) data_collator = DataCollatorForWav2Vec2Pretraining( model=model, feature_extractor=feature_extractor) trainer = Wav2Vec2PreTrainer( model=model, data_collator=data_collator, args=training_args, train_dataset=vectorized_datasets["train"], eval_dataset=vectorized_datasets["validation"], tokenizer=feature_extractor, max_gumbel_temp=model_args.max_gumbel_temperature, min_gumbel_temp=model_args.min_gumbel_temperature, gumbel_temp_decay=model_args.gumbel_temperature_decay, ) trainer.train()
def load_data(self, data: Any, columns: List[str] = None) -> "datasets.Dataset": if self.filetype == "json": file, field = data else: file = data data_files = {} stage = self._running_stage.value data_files[stage] = str(file) # FLASH_TESTING is set in the CI to run faster. if flash._IS_TESTING: try: if self.filetype == "json" and field is not None: dataset_dict = DatasetDict({ stage: load_dataset(self.filetype, data_files=data_files, split=[f"{stage}[:20]"], field=field)[0] }) else: dataset_dict = DatasetDict({ stage: load_dataset(self.filetype, data_files=data_files, split=[f"{stage}[:20]"])[0] }) column_names = dataset_dict[stage].column_names except Exception: if self.filetype == "json" and field is not None: dataset_dict = load_dataset(self.filetype, data_files=data_files, field=field) else: dataset_dict = load_dataset(self.filetype, data_files=data_files) column_names = dataset_dict[stage].column_names else: if self.filetype == "json" and field is not None: dataset_dict = load_dataset(self.filetype, data_files=data_files, field=field) else: dataset_dict = load_dataset(self.filetype, data_files=data_files) column_names = dataset_dict[stage].column_names if self.answer_column_name == "answer": if "answer" not in column_names: if "answer_text" in column_names and "answer_start" in column_names: dataset_dict = dataset_dict.map( self._reshape_answer_column, batched=False) column_names = dataset_dict[stage].column_names else: raise KeyError( """Dataset must contain either \"answer\" key as dict type or "answer_text" and "answer_start" as string and integer types.""") if not isinstance(dataset_dict[stage][self.answer_column_name][0], Dict): raise TypeError( f'{self.answer_column_name} column should be of type dict with keys "text" and "answer_start"' ) dataset_dict = dataset_dict.map(self._tokenize_fn, batched=True, remove_columns=column_names) return dataset_dict[stage]
class Classifier: def __init__(self, pretrained, prepared_dir, classifier_dir): """ pretrained is None means disable classifier """ self.pretrained = pretrained self.classifier_dir = classifier_dir self.prepared_dir = prepared_dir self.datasets = DatasetDict({ 'train': read_dataset_from_csv(prepared_dir + '/train.csv'), 'test': read_dataset_from_csv(prepared_dir + '/test.csv'), 'validation': read_dataset_from_csv(prepared_dir + '/validation.csv') }) self.metric = load_metric("seqeval") self.label_list = self.datasets["train"].features["tag"].feature.names check_folder(self.classifier_dir) if pretrained: self.model = AutoModelForTokenClassification.from_pretrained( self.pretrained, num_labels=len(self.label_list)) self.tokenizer = AutoTokenizer.from_pretrained(self.pretrained) self.data_collator = DataCollatorForTokenClassification( self.tokenizer) def train(self, num_train_epochs=10, learning_rate=1e-5, weight_decay=1e-2, per_device_train_batch_size=16, per_device_eval_batch_size=16): def compute_metrics(p): predictions, labels = p true_labels, true_predictions = self.process_pred_labels( predictions, labels) results = self.metric.compute(predictions=true_predictions, references=true_labels) return { "precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"], } if self.pretrained: tokenized_datasets = self.datasets.map( self.tokenize_and_align_labels, batched=True) train_args = TrainingArguments( "{}/exp".format(self.classifier_dir), evaluation_strategy="epoch", learning_rate=learning_rate, per_device_train_batch_size=per_device_train_batch_size, per_device_eval_batch_size=per_device_eval_batch_size, num_train_epochs=num_train_epochs, weight_decay=weight_decay, load_best_model_at_end=True) trainer = Trainer(self.model, train_args, train_dataset=tokenized_datasets["train"], eval_dataset=tokenized_datasets["validation"], data_collator=self.data_collator, tokenizer=self.tokenizer, compute_metrics=compute_metrics) # fine tuning model trainer.train() trainer.save_model(self.classifier_dir) print("Trainer is saved to ", self.classifier_dir) else: print("Classifier is disabled. No need to train!") def eval(self, key='test'): tqdm.pandas() pred_out_path = '{}/{}_classified_pred.csv'.format( self.classifier_dir, key) label_out_path = '{}/{}_classified_label.csv'.format( self.classifier_dir, key) print("Predicting {}...".format(key)) if self.pretrained: tokenized_datasets = self.datasets.map( self.tokenize_and_align_labels, batched=True) trainer = Trainer(model=self.model, tokenizer=self.tokenizer, data_collator=self.data_collator) true_labels, true_predictions = self.predict_dataset( trainer, tokenized_datasets[key]) pred_result = save_classifier_result(self.datasets[key], true_predictions, pred_out_path) label_result = save_classifier_result(self.datasets[key], true_labels, label_out_path) # print result results = self.metric.compute(predictions=true_predictions, references=true_labels) print(" precision:\t{}\n recall:\t{}\n f1:\t{}\n accuracy:\t{}\n". format(results["overall_precision"], results["overall_recall"], results["overall_f1"], results["overall_accuracy"])) confusion_matrix(true_predictions, true_labels) return pred_result, label_result else: df = pd.read_csv('{}/{}.csv'.format(self.prepared_dir, key), converters={ 'token': str, 'written': str, 'spoken': str }) result = df[['sentence_id', 'token' ]].groupby(['sentence_id']).agg({'token': ' '.join}) result['tag'] = 'B' result.to_csv(pred_out_path, index=False) result.to_csv(label_out_path, index=False) print("Result saved to ", pred_out_path) print("Result saved to ", label_out_path) return result, result def predict(self, input_path, output_path): key = 'tmp' input_df = pd.DataFrame() if self.pretrained: input_df['src_token'] = read_txt(input_path) input_df['src_token'] = input_df['src_token'].str.lower() input_df['token'] = input_df['src_token'].str.split() input_df['tag'] = input_df['token'].apply(lambda x: ['O'] * len(x)) input_df['sentence_id'] = input_df.index trainer = Trainer(model=self.model, tokenizer=self.tokenizer, data_collator=self.data_collator) feature_tag = Sequence( ClassLabel(num_classes=3, names=self.label_list)) input_df['tag'] = input_df['tag'].apply( feature_tag.feature.str2int) eval_dataset = Dataset.from_pandas(input_df) eval_dataset.features["tag"] = feature_tag # predict tokenized_datasets = DatasetDict({ key: eval_dataset }).map(self.tokenize_and_align_labels, batched=True) _, true_predictions = self.predict_dataset(trainer, tokenized_datasets[key]) result = save_classifier_result(eval_dataset, true_predictions, output_path) return result else: input_df['token'] = read_txt(input_path) input_df['sentence_id'] = input_df.index input_df['tag'] = 'B' input_df.to_csv(output_path, index=False) print("Result saved to ", output_path) return input_df def tokenize_and_align_labels(self, examples): labels = [] label_all_tokens = False tokenized_inputs = self.tokenizer(examples["token"], truncation=True, is_split_into_words=True) for i, label in enumerate(examples["tag"]): word_ids = tokenized_inputs.word_ids(batch_index=i) previous_word_idx = None label_ids = [] for word_idx in word_ids: if word_idx is None: label_ids.append(-100) elif word_idx != previous_word_idx: label_ids.append(label[word_idx]) else: label_ids.append( label[word_idx] if label_all_tokens else -100) previous_word_idx = word_idx labels.append(label_ids) tokenized_inputs["labels"] = labels return tokenized_inputs def predict_dataset(self, trainer, data): predictions, labels, _ = trainer.predict(data) return self.process_pred_labels(predictions, labels) def process_pred_labels(self, predictions, labels): # Remove ignored index (special tokens) predictions = np.argmax(predictions, axis=2) true_predictions = [[ self.label_list[p] for (p, l) in zip(prediction, label) if l != -100 ] for prediction, label in zip(predictions, labels)] true_labels = [[ self.label_list[l] for (p, l) in zip(prediction, label) if l != -100 ] for prediction, label in zip(predictions, labels)] return true_labels, true_predictions