def load_tokenizer(self):
    """Loads the sentence piece tokenizer specified in the yaml file"""
    save_model_path = os.path.join(
        self.hparams["save_folder"],
        str(self.hparams["output_neurons"]) + "_unigram.model",
    )

    # Downloading from the web
    download_file(
        source=self.hparams["tok_mdl_file"],
        dest=save_model_path,
    )

    # Initialize the tokenizer and load the downloaded (pretrained) model
    self.mod.tokenizer = SentencePiece(
        model_dir=self.hparams["save_folder"],
        vocab_size=self.hparams["output_neurons"],
    )
    self.mod.tokenizer.sp.load(save_model_path)
def main(config):
    ### get Train Data ###
    # list of {'audio_sph_file': str, 'transcript_all_file': str, 'transcript_uid': str, 'filter_criteria': str}
    # meaning that <audio_sph_file>'s transcript is the one in the <transcript_all_file> with id <transcript_uid>
    hparams = load_hparams(config.train_data_config)
    train_corpus = get_utterance_manifest_from_datasets(hparams["datasets"])

    ### create json file for SpeechBrain-->SentencePiece ###
    annotation_read = "transcript"  # key-name for each `entry` in `train_corpus` having the transcript as its value

    ### write config file ###
    write_hyperpyyaml_file(
        os.path.join(
            config.output_folder,
            "sp_vocab_{}_{}.yaml".format(config.vocab_size, config.model_type),
        ),
        {
            "model_dir": config.output_folder,
            "vocab_size": config.vocab_size,
            "model_type": config.model_type,
            "sp_model_file": os.path.join(
                config.output_folder,
                "{}_{}.model".format(str(config.vocab_size), config.model_type),
            ),
            "unk_index": config.unk_index,
            "bos_index": config.bos_index,
            "eos_index": config.eos_index,
            "pad_index": config.pad_index,
        },
    )

    ### train custom SentencePiece Tokenizer ###
    with tempfile.NamedTemporaryFile(mode="w+", suffix=".json") as f:
        f.write(
            json.dumps(
                {
                    entry["transcript_uid"]: {annotation_read: entry["transcript"]}
                    for entry in train_corpus
                }
            )
        )
        f.seek(0)

        SentencePiece(
            model_dir=config.output_folder,
            vocab_size=config.vocab_size,
            annotation_train=f.name,
            annotation_read=annotation_read,
            annotation_format="json",
            unk_id=config.unk_index,
            bos_id=config.bos_index,
            eos_id=config.eos_index,
            pad_id=config.pad_index,
            model_type=config.model_type,
            character_coverage=config.character_coverage,
            annotation_list_to_check=config.annotation_list_to_check,
        )
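# Illustrative sketch, not part of the original recipe: once `main` above has
# trained the tokenizer, the produced "<vocab_size>_<model_type>.model" file can
# presumably be reloaded through the same SentencePiece wrapper used in the
# first snippet (constructor + sp.load) and applied to new text. The
# `output_folder`, `vocab_size`, and `model_type` values are placeholder
# assumptions for the example.
import os

from speechbrain.tokenizers.SentencePiece import SentencePiece

output_folder = "results/tokenizer"        # assumed: same as config.output_folder
vocab_size, model_type = 1000, "unigram"   # assumed training settings

tokenizer = SentencePiece(
    model_dir=output_folder,
    vocab_size=vocab_size,
    model_type=model_type,
)
tokenizer.sp.load(
    os.path.join(output_folder, "{}_{}.model".format(vocab_size, model_type))
)
print(tokenizer.sp.encode_as_ids("hello world"))  # list of subword ids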
def dataio_prepare(hparams):
    """This function prepares the datasets to be used in the brain class.
    It also defines the data processing pipeline through user-defined functions."""

    # 1. Define datasets
    data_folder = hparams["data_folder"]

    train_data = sb.dataio.dataset.DynamicItemDataset.from_csv(
        csv_path=hparams["train_csv"],
        replacements={"data_root": data_folder},
    )

    if hparams["sorting"] == "ascending":
        # we sort training data to speed up training and get better results.
        train_data = train_data.filtered_sorted(
            sort_key="duration",
            key_max_value={"duration": hparams["avoid_if_longer_than"]},
        )
        # when sorting, do not shuffle in the dataloader! otherwise it is pointless
        hparams["dataloader_options"]["shuffle"] = False

    elif hparams["sorting"] == "descending":
        train_data = train_data.filtered_sorted(
            sort_key="duration",
            reverse=True,
            key_max_value={"duration": hparams["avoid_if_longer_than"]},
        )
        # when sorting, do not shuffle in the dataloader! otherwise it is pointless
        hparams["dataloader_options"]["shuffle"] = False

    elif hparams["sorting"] == "random":
        pass

    else:
        raise NotImplementedError(
            "sorting must be random, ascending or descending"
        )

    valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv(
        csv_path=hparams["valid_csv"],
        replacements={"data_root": data_folder},
    )
    # We also sort the validation data so it is faster to validate
    valid_data = valid_data.filtered_sorted(sort_key="duration")

    test_data = sb.dataio.dataset.DynamicItemDataset.from_csv(
        csv_path=hparams["test_csv"],
        replacements={"data_root": data_folder},
    )
    # We also sort the test data so it is faster to evaluate
    test_data = test_data.filtered_sorted(sort_key="duration")

    datasets = [train_data, valid_data, test_data]

    # Defining the tokenizer and loading it
    tokenizer = SentencePiece(
        model_dir=hparams["save_folder"],
        vocab_size=hparams["output_neurons"],
        annotation_train=hparams["train_csv"],
        annotation_read="wrd",
        model_type=hparams["token_type"],
        character_coverage=hparams["character_coverage"],
        bos_id=hparams["bos_index"],
        eos_id=hparams["eos_index"],
    )

    # 2. Define audio pipeline:
    @sb.utils.data_pipeline.takes("wav")
    @sb.utils.data_pipeline.provides("sig")
    def audio_pipeline(wav):
        info = torchaudio.info(wav)
        sig = sb.dataio.dataio.read_audio(wav)
        resampled = torchaudio.transforms.Resample(
            info.sample_rate,
            hparams["sample_rate"],
        )(sig)
        return resampled

    sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline)

    # 3. Define text pipeline:
    @sb.utils.data_pipeline.takes("wrd")
    @sb.utils.data_pipeline.provides(
        "tokens_list", "tokens_bos", "tokens_eos", "tokens"
    )
    def text_pipeline(wrd):
        tokens_list = tokenizer.sp.encode_as_ids(wrd)
        yield tokens_list
        tokens_bos = torch.LongTensor([hparams["bos_index"]] + tokens_list)
        yield tokens_bos
        tokens_eos = torch.LongTensor(tokens_list + [hparams["eos_index"]])
        yield tokens_eos
        tokens = torch.LongTensor(tokens_list)
        yield tokens

    sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline)

    # 4. Set output:
    sb.dataio.dataset.set_output_keys(
        datasets,
        ["id", "sig", "tokens_bos", "tokens_eos", "tokens"],
    )

    return train_data, valid_data, test_data, tokenizer
"data_folder": hparams["data_folder"], "save_folder": hparams["save_folder"], "train_tsv_file": hparams["train_tsv_file"], "dev_tsv_file": hparams["dev_tsv_file"], "test_tsv_file": hparams["test_tsv_file"], "accented_letters": hparams["accented_letters"], "language": hparams["language"], "skip_prep": hparams["skip_prep"], }, ) # Defining tokenizer and loading it tokenizer = SentencePiece( model_dir=hparams["save_folder"], vocab_size=hparams["output_neurons"], annotation_train=hparams["train_csv"], annotation_read="wrd", model_type=hparams["token_type"], character_coverage=hparams["character_coverage"], ) # Create the datasets objects as well as tokenization and encoding :-D train_data, valid_data, test_data = dataio_prepare(hparams, tokenizer) # Trainer initialization asr_brain = ASR( modules=hparams["modules"], hparams=hparams, run_opts=run_opts, checkpointer=hparams["checkpointer"], )
)

# Prepare data
prepare_SLURP(
    data_folder=hparams["data_folder"],
    slu_type="decoupled",
    train_splits=hparams["train_splits"],
)

# Creating tokenizer must be done after preparation
# Specify the bos_id/eos_id if different from blank_id
tokenizer = SentencePiece(
    model_dir=hparams["save_folder"],
    vocab_size=hparams["output_neurons"],
    csv_train=hparams["csv_train"],
    csv_read="semantics",
    model_type=hparams["token_type"],
    character_coverage=1.0,
    num_sequences=10000,
)
hparams["tokenizer"] = tokenizer

# Load index2label dict for decoding
train_set = hparams["train_loader"]()
valid_set = hparams["valid_loader"]()
test_set = hparams["test_loader"]()
hparams["asr_ind2lab"] = hparams["train_loader"].label_dict["transcript"][
    "index2lab"
]  # ugh
hparams["ind2lab"] = hparams["test_loader"].label_dict["semantics"][
    "index2lab"
]
def dataio_prep(hparams):
    """Creates the datasets and their data processing pipelines"""

    # 1. Define tokenizer
    if hparams["target_type"] == "wrd":
        tokenizer = SentencePiece(
            model_dir=hparams["save_folder"],
            vocab_size=hparams["output_neurons"],
            csv_train=hparams["train_annotation"],
            csv_read="wrd",
            model_type=hparams["token_type"],
            character_coverage=hparams["character_coverage"],
        )
    else:
        tokenizer = sb.dataio.encoder.CTCTextEncoder()

    # 2. Define audio pipelines:
    @sb.utils.data_pipeline.takes("noisy_wav")
    @sb.utils.data_pipeline.provides("noisy_sig")
    def noisy_pipeline(wav):
        return sb.dataio.dataio.read_audio(wav)

    @sb.utils.data_pipeline.takes("clean_wav")
    @sb.utils.data_pipeline.provides("clean_sig")
    def clean_pipeline(wav):
        return sb.dataio.dataio.read_audio(wav)

    # 3. Define target pipeline:
    token_keys = ["tokens_bos", "tokens_eos", "tokens"]

    @sb.utils.data_pipeline.takes(hparams["target_type"])
    @sb.utils.data_pipeline.provides("tokens_list", *token_keys)
    def target_pipeline(target):
        if hparams["target_type"] == "wrd":
            tokens_list = tokenizer.sp.encode_as_ids(target)
            yield tokens_list
        else:
            tokens_list = target.strip().split()
            yield tokens_list
            tokens_list = tokenizer.encode_sequence(tokens_list)
        tokens_bos = torch.LongTensor([hparams["bos_index"]] + tokens_list)
        yield tokens_bos
        tokens_eos = torch.LongTensor(tokens_list + [hparams["eos_index"]])
        yield tokens_eos
        tokens = torch.LongTensor(tokens_list)
        yield tokens

    # 4. Create datasets
    data = {}
    for dataset in ["train", "valid", "test"]:
        data[dataset] = sb.dataio.dataset.DynamicItemDataset.from_csv(
            csv_path=hparams[f"{dataset}_annotation"],
            replacements={"data_root": hparams["data_folder"]},
            dynamic_items=[noisy_pipeline, clean_pipeline, target_pipeline],
            output_keys=["id", "noisy_sig", "clean_sig"] + token_keys,
        )
        if dataset != "train":
            data[dataset] = data[dataset].filtered_sorted(sort_key="duration")

    # Sort the train dataset and ensure it doesn't get un-sorted
    if hparams["sorting"] == "ascending" or hparams["sorting"] == "descending":
        data["train"] = data["train"].filtered_sorted(
            sort_key="duration",
            reverse=hparams["sorting"] == "descending",
        )
        hparams["train_loader_options"]["shuffle"] = False
    elif hparams["sorting"] != "random":
        raise NotImplementedError(
            "Sorting must be random, ascending, or descending"
        )

    # 5. Load or update tokenizer
    if hparams["target_type"] == "wrd":
        save_model_path = os.path.join(hparams["save_folder"], "tok_uni.model")
        save_vocab_path = os.path.join(hparams["save_folder"], "tok_uni.vocab")

        if "tok_mdl_file" in hparams:
            download_file(
                source=hparams["tok_mdl_file"],
                dest=save_model_path,
                replace_existing=True,
            )
            tokenizer.sp.load(save_model_path)
        if "tok_voc_file" in hparams:
            download_file(
                source=hparams["tok_voc_file"],
                dest=save_vocab_path,
                replace_existing=True,
            )
            tokenizer.sp.load(save_model_path)

        if (tokenizer.sp.eos_id() + 1) == (tokenizer.sp.bos_id() + 1) == 0 and not (
            hparams["eos_index"]
            == hparams["bos_index"]
            == hparams["blank_index"]
            == hparams["unk_index"]
            == 0
        ):
            raise ValueError(
                "Desired indexes for special tokens do not agree "
                "with loaded tokenizer special tokens !"
            )
    else:
        tokenizer.update_from_didataset(data["train"], output_key="tokens_list")
        tokenizer.insert_bos_eos(
            bos_label="<eos-bos>",
            eos_label="<eos-bos>",
            bos_index=hparams["bos_index"],
        )

    return data, tokenizer
def test_tokenizer():
    from speechbrain.tokenizers.SentencePiece import SentencePiece

    gt = [
        ["HELLO", "MORNING", "MORNING", "HELLO"],
        ["HELLO", "MORNING", "HELLO"],
    ]

    # Word-level input test
    dict_int2lab = {1: "HELLO", 2: "MORNING"}
    spm = SentencePiece(
        "tokenizer_data/",
        2000,
        csv_train="tests/unittests/tokenizer_data/dev-clean.csv",
        csv_read="wrd",
        model_type="bpe",
    )
    encoded_seq_ids, encoded_seq_pieces = spm(
        torch.Tensor([[1, 2, 2, 1], [1, 2, 1, 0]]),
        torch.Tensor([1.0, 0.75]),
        dict_int2lab,
        task="encode",
    )
    lens = (encoded_seq_pieces * encoded_seq_ids.shape[1]).int()

    # decode from torch tensors (batch, batch_lens)
    words_seq = spm(encoded_seq_ids, encoded_seq_pieces, task="decode")
    assert words_seq == gt, "output not the same"

    # decode from a list of bpe sequences (without padding)
    hyps_list = [
        encoded_seq_ids[0].int().tolist(),
        encoded_seq_ids[1][: lens[1]].int().tolist(),
    ]
    words_seq = spm(hyps_list, task="decode_from_list")
    assert words_seq == gt, "output not the same"

    # Char-level input test
    dict_int2lab = {
        1: "H", 2: "E", 3: "L", 4: "O", 5: "M",
        6: "R", 7: "N", 8: "I", 9: "G", 10: "_",
    }
    spm = SentencePiece(
        "tokenizer_data/",
        2000,
        csv_train="tests/unittests/tokenizer_data/dev-clean.csv",
        csv_read="char",
        char_format_input=True,
        model_type="bpe",
    )
    encoded_seq_ids, encoded_seq_pieces = spm(
        torch.Tensor(
            [
                # "HELLO_MORNING_MORNING_HELLO" (27 chars)
                [1, 2, 3, 3, 4, 10, 5, 4, 6, 7, 8, 7, 9, 10,
                 5, 4, 6, 7, 8, 7, 9, 10, 1, 2, 3, 3, 4],
                # "HELLO_MORNING_HELLO" (19 chars, zero-padded to 27)
                [1, 2, 3, 3, 4, 10, 5, 4, 6, 7, 8, 7, 9, 10,
                 1, 2, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0],
            ]
        ),
        torch.Tensor([1.0, 0.7037037037037037]),
        dict_int2lab,
        task="encode",
    )
    lens = (encoded_seq_pieces * encoded_seq_ids.shape[1]).int()

    # decode from torch tensors (batch, batch_lens)
    words_seq = spm(encoded_seq_ids, encoded_seq_pieces, task="decode")
    assert words_seq == gt, "output not the same"

    # decode from a list of bpe sequences (without padding)
    hyps_list = [
        encoded_seq_ids[0].int().tolist(),
        encoded_seq_ids[1][: lens[1]].int().tolist(),
    ]
    words_seq = spm(hyps_list, task="decode_from_list")
    assert words_seq == gt, "output not the same"
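# Note on the length convention used in the test above (a small worked sketch of
# the arithmetic, using nothing beyond what the test already shows): the second
# argument to spm() holds *relative* lengths in [0, 1], so a padded batch with
# max length 4 and true lengths 4 and 3 is passed as torch.Tensor([1.0, 0.75]).
# Multiplying the relative lengths by the padded dimension recovers the absolute
# lengths, which is exactly what `lens = (encoded_seq_pieces *
# encoded_seq_ids.shape[1]).int()` does for the encoded outputs.
import torch

relative_lens = torch.Tensor([1.0, 0.75])
max_len = 4
absolute_lens = (relative_lens * max_len).int()
print(absolute_lens)  # tensor([4, 3], dtype=torch.int32)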
def dataio_prep(hparams):
    """Creates the datasets and their data processing pipelines"""

    # 1. Define tokenizer and load it
    tokenizer = SentencePiece(
        model_dir=hparams["save_folder"],
        vocab_size=hparams["output_neurons"],
        csv_train=hparams["train_annotation"],
        csv_read="wrd",
        model_type=hparams["token_type"],
        character_coverage=hparams["character_coverage"],
    )

    # Load the pretrained sentence piece tokenizer specified in the yaml file
    save_model_path = os.path.join(hparams["save_folder"], "tok_unigram.model")
    save_vocab_path = os.path.join(hparams["save_folder"], "tok_unigram.vocab")

    if "tok_mdl_file" in hparams:
        download_file(
            source=hparams["tok_mdl_file"],
            dest=save_model_path,
            replace_existing=True,
        )
        tokenizer.sp.load(save_model_path)
    if "tok_voc_file" in hparams:
        download_file(
            source=hparams["tok_voc_file"],
            dest=save_vocab_path,
            replace_existing=True,
        )
        tokenizer.sp.load(save_model_path)

    if (tokenizer.sp.eos_id() + 1) == (tokenizer.sp.bos_id() + 1) == 0 and not (
        hparams["eos_index"]
        == hparams["bos_index"]
        == hparams["blank_index"]
        == hparams["unk_index"]
        == 0
    ):
        raise ValueError(
            "Desired indexes for special tokens do not agree "
            "with loaded tokenizer special tokens !"
        )

    # 2. Define audio pipeline:
    @sb.utils.data_pipeline.takes(hparams["input_type"])
    @sb.utils.data_pipeline.provides("sig")
    def audio_pipeline(wav):
        sig = sb.dataio.dataio.read_audio(wav)
        return sig

    # 3. Define text pipeline:
    @sb.utils.data_pipeline.takes("wrd")
    @sb.utils.data_pipeline.provides("tokens_bos", "tokens_eos", "tokens")
    def text_pipeline(wrd):
        tokens_list = tokenizer.sp.encode_as_ids(wrd)
        tokens_bos = torch.LongTensor([hparams["bos_index"]] + tokens_list)
        yield tokens_bos
        tokens_eos = torch.LongTensor(tokens_list + [hparams["eos_index"]])
        yield tokens_eos
        tokens = torch.LongTensor(tokens_list)
        yield tokens

    # 4. Create datasets
    data = {}
    for dataset in ["train", "valid", "test"]:
        data[dataset] = sb.dataio.dataset.DynamicItemDataset.from_csv(
            csv_path=hparams[f"{dataset}_annotation"],
            replacements={"data_root": hparams["data_folder"]},
            dynamic_items=[audio_pipeline, text_pipeline],
            output_keys=["id", "sig", "tokens_bos", "tokens_eos", "tokens"],
        )
        if dataset != "train":
            data[dataset] = data[dataset].filtered_sorted(sort_key="duration")

    # Sort the train dataset and ensure it doesn't get un-sorted
    if hparams["sorting"] == "ascending" or hparams["sorting"] == "descending":
        data["train"] = data["train"].filtered_sorted(
            sort_key="duration",
            reverse=hparams["sorting"] == "descending",
        )
        hparams["dataloader_options"]["shuffle"] = False
    elif hparams["sorting"] != "random":
        raise NotImplementedError(
            "Sorting must be random, ascending, or descending"
        )

    return data, tokenizer
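# Illustrative sketch, not part of the original recipe: one way the objects
# returned by `dataio_prep` above might be inspected. Indexing a
# DynamicItemDataset runs the dynamic pipelines for that item and returns a dict
# with the configured output keys. `hparams` is assumed to be an already-loaded
# hyperparameter dict with all the keys the function expects.
data, tokenizer = dataio_prep(hparams)

sample = data["train"][0]  # dict with "id", "sig", "tokens_bos", "tokens_eos", "tokens"
print(sample["id"], sample["tokens"].shape)

# Round-trip the targets through the tokenizer to sanity-check the pipeline
print(tokenizer.sp.decode_ids(sample["tokens"].tolist()))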