Example #1
    def load_tokenizer(self):
        """Loads the sentence piece tokenizer specified in the yaml file"""
        save_model_path = os.path.join(
            self.hparams["save_folder"],
            str(self.hparams["output_neurons"]) + "_unigram.model",
        )

        # Downloading from the web
        download_file(
            source=self.hparams["tok_mdl_file"],
            dest=save_model_path,
        )

        # Initialize the tokenizer and load the downloaded model
        self.mod.tokenizer = SentencePiece(
            model_dir=self.hparams["save_folder"],
            vocab_size=self.hparams["output_neurons"],
        )
        self.mod.tokenizer.sp.load(save_model_path)
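
A hedged usage sketch (not part of the recipe above): once the unigram model has been downloaded, the underlying sentencepiece processor can be used directly. The path below is a hypothetical example of the "<output_neurons>_unigram.model" layout used in load_tokenizer().

import os

import sentencepiece

# hypothetical path following the "<output_neurons>_unigram.model" naming above
save_model_path = os.path.join("results/save", "1000_unigram.model")

sp = sentencepiece.SentencePieceProcessor()
sp.load(save_model_path)                     # same call as self.mod.tokenizer.sp.load(...)
ids = sp.encode_as_ids("HELLO WORLD")        # list of subword ids
pieces = sp.encode_as_pieces("HELLO WORLD")  # list of subword strings
text = sp.decode_ids(ids)                    # decode the ids back to a string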
Example #2
def main(config):
    ### get Train Data ###
    # list of {'audio_sph_file': str, 'transcript_all_file': str, 'transcript_uid': str, 'filter_criteria': str}
    # meaning that <audio_sph_file>'s transcript is the one in the <transcript_all_file> with id <transcript_uid>
    hparams = load_hparams(config.train_data_config)
    train_corpus = get_utterance_manifest_from_datasets(hparams["datasets"])
    
    ### create json file for SpeechBrain-->SentencePiece ###
    annotation_read = "transcript"  # key under which each entry in train_corpus stores its transcript

    ### write config file
    write_hyperpyyaml_file(os.path.join(config.output_folder, "sp_vocab_{}_{}.yaml".format(config.vocab_size, config.model_type)),
                           {"model_dir": config.output_folder,
                            "vocab_size": config.vocab_size,
                            "model_type": config.model_type,
                            "sp_model_file": os.path.join(config.output_folder, "{}_{}.model".format(str(config.vocab_size), config.model_type)),
                            "unk_index": config.unk_index,
                            "bos_index": config.bos_index,
                            "eos_index": config.eos_index,
                            "pad_index": config.pad_index})

    ### train custom SentencePiece Tokenizer ###
    with tempfile.NamedTemporaryFile(mode="w+", suffix=".json") as f:
        f.write(json.dumps(
            {entry["transcript_uid"]: {annotation_read: entry["transcript"]}
             for entry in train_corpus}
        ))
        f.seek(0)

        SentencePiece(model_dir                = config.output_folder,
                      vocab_size               = config.vocab_size,
                      annotation_train         = f.name,
                      annotation_read          = annotation_read,
                      annotation_format        = "json",
                      unk_id                   = config.unk_index,
                      bos_id                   = config.bos_index,
                      eos_id                   = config.eos_index,
                      pad_id                   = config.pad_index,
                      model_type               = config.model_type,
                      character_coverage       = config.character_coverage,
                      annotation_list_to_check = config.annotation_list_to_check)
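
A hedged follow-up sketch (not in the original script): the YAML written by write_hyperpyyaml_file above can be read back with hyperpyyaml and the trained model loaded directly; config is assumed to be the same object passed to main().

import os

import sentencepiece
from hyperpyyaml import load_hyperpyyaml

yaml_path = os.path.join(
    config.output_folder,
    "sp_vocab_{}_{}.yaml".format(config.vocab_size, config.model_type),
)
with open(yaml_path) as fin:
    sp_hparams = load_hyperpyyaml(fin)

sp = sentencepiece.SentencePieceProcessor()
sp.load(sp_hparams["sp_model_file"])  # model produced by the SentencePiece(...) call above
print(sp.encode_as_pieces("this is a test"))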
Example #3
def dataio_prepare(hparams):
    """This function prepares the datasets to be used in the brain class.
    It also defines the data processing pipeline through user-defined functions."""

    # 1. Define datasets
    data_folder = hparams["data_folder"]

    train_data = sb.dataio.dataset.DynamicItemDataset.from_csv(
        csv_path=hparams["train_csv"],
        replacements={"data_root": data_folder},
    )

    if hparams["sorting"] == "ascending":
        # we sort training data to speed up training and get better results.
        train_data = train_data.filtered_sorted(
            sort_key="duration",
            key_max_value={"duration": hparams["avoid_if_longer_than"]},
        )
        # when sorting, do not shuffle in the dataloader; otherwise sorting is pointless
        hparams["dataloader_options"]["shuffle"] = False

    elif hparams["sorting"] == "descending":
        train_data = train_data.filtered_sorted(
            sort_key="duration",
            reverse=True,
            key_max_value={"duration": hparams["avoid_if_longer_than"]},
        )
        # when sorting, do not shuffle in the dataloader; otherwise sorting is pointless
        hparams["dataloader_options"]["shuffle"] = False

    elif hparams["sorting"] == "random":
        pass

    else:
        raise NotImplementedError(
            "sorting must be random, ascending or descending")

    valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv(
        csv_path=hparams["valid_csv"],
        replacements={"data_root": data_folder},
    )
    # We also sort the validation data so it is faster to validate
    valid_data = valid_data.filtered_sorted(sort_key="duration")

    test_data = sb.dataio.dataset.DynamicItemDataset.from_csv(
        csv_path=hparams["test_csv"],
        replacements={"data_root": data_folder},
    )

    # We also sort the test data so it is faster to test
    test_data = test_data.filtered_sorted(sort_key="duration")

    datasets = [train_data, valid_data, test_data]

    # defining tokenizer and loading it
    tokenizer = SentencePiece(
        model_dir=hparams["save_folder"],
        vocab_size=hparams["output_neurons"],
        annotation_train=hparams["train_csv"],
        annotation_read="wrd",
        model_type=hparams["token_type"],
        character_coverage=hparams["character_coverage"],
        bos_id=hparams["bos_index"],
        eos_id=hparams["eos_index"],
    )

    # 2. Define audio pipeline:
    @sb.utils.data_pipeline.takes("wav")
    @sb.utils.data_pipeline.provides("sig")
    def audio_pipeline(wav):
        info = torchaudio.info(wav)
        sig = sb.dataio.dataio.read_audio(wav)
        resampled = torchaudio.transforms.Resample(
            info.sample_rate,
            hparams["sample_rate"],
        )(sig)
        return resampled

    sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline)

    # 3. Define text pipeline:
    @sb.utils.data_pipeline.takes("wrd")
    @sb.utils.data_pipeline.provides("tokens_list", "tokens_bos", "tokens_eos",
                                     "tokens")
    def text_pipeline(wrd):
        tokens_list = tokenizer.sp.encode_as_ids(wrd)
        yield tokens_list
        tokens_bos = torch.LongTensor([hparams["bos_index"]] + tokens_list)
        yield tokens_bos
        tokens_eos = torch.LongTensor(tokens_list + [hparams["eos_index"]])
        yield tokens_eos
        tokens = torch.LongTensor(tokens_list)
        yield tokens

    sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline)

    # 4. Set output:
    sb.dataio.dataset.set_output_keys(
        datasets,
        ["id", "sig", "tokens_bos", "tokens_eos", "tokens"],
    )
    return train_data, valid_data, test_data, tokenizer
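
A hedged sketch (not part of the recipe) of how the returned objects are typically consumed; hparams is assumed to contain the same dataloader_options key used above.

train_data, valid_data, test_data, tokenizer = dataio_prepare(hparams)

# build a SpeechBrain dataloader and peek at one padded batch
train_loader = sb.dataio.dataloader.make_dataloader(
    train_data, **hparams["dataloader_options"]
)
for batch in train_loader:
    wavs, wav_lens = batch.sig          # padded waveforms + relative lengths
    tokens, token_lens = batch.tokens   # padded token ids + relative lengths
    break  # one batch is enough for this sketch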
            "data_folder": hparams["data_folder"],
            "save_folder": hparams["save_folder"],
            "train_tsv_file": hparams["train_tsv_file"],
            "dev_tsv_file": hparams["dev_tsv_file"],
            "test_tsv_file": hparams["test_tsv_file"],
            "accented_letters": hparams["accented_letters"],
            "language": hparams["language"],
            "skip_prep": hparams["skip_prep"],
        },
    )

    # Defining tokenizer and loading it
    tokenizer = SentencePiece(
        model_dir=hparams["save_folder"],
        vocab_size=hparams["output_neurons"],
        annotation_train=hparams["train_csv"],
        annotation_read="wrd",
        model_type=hparams["token_type"],
        character_coverage=hparams["character_coverage"],
    )

    # Create the dataset objects as well as tokenization and encoding :-D
    train_data, valid_data, test_data = dataio_prepare(hparams, tokenizer)

    # Trainer initialization
    asr_brain = ASR(
        modules=hparams["modules"],
        hparams=hparams,
        run_opts=run_opts,
        checkpointer=hparams["checkpointer"],
    )
Example #5
    )

    # Prepare data
    prepare_SLURP(
        data_folder=hparams["data_folder"],
        slu_type="decoupled",
        train_splits=hparams["train_splits"],
    )

    # Creating the tokenizer must be done after data preparation
    # Specify the bos_id/eos_id if different from blank_id
    tokenizer = SentencePiece(
        model_dir=hparams["save_folder"],
        vocab_size=hparams["output_neurons"],
        csv_train=hparams["csv_train"],
        csv_read="semantics",
        model_type=hparams["token_type"],
        character_coverage=1.0,
        num_sequences=10000,
    )
    hparams["tokenizer"] = tokenizer

    # Load index2label dict for decoding
    train_set = hparams["train_loader"]()
    valid_set = hparams["valid_loader"]()
    test_set = hparams["test_loader"]()
    hparams["asr_ind2lab"] = hparams["train_loader"].label_dict["transcript"][
        "index2lab"
    ]  # ugh
    hparams["ind2lab"] = hparams["test_loader"].label_dict["semantics"][
        "index2lab"
Example #6
def dataio_prep(hparams):
    """Creates the datasets and their data processing pipelines"""

    # 1. define tokenizer
    if hparams["target_type"] == "wrd":
        tokenizer = SentencePiece(
            model_dir=hparams["save_folder"],
            vocab_size=hparams["output_neurons"],
            csv_train=hparams["train_annotation"],
            csv_read="wrd",
            model_type=hparams["token_type"],
            character_coverage=hparams["character_coverage"],
        )
    else:
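        # for non-word targets (e.g. characters), fall back to a simple label encoder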
        tokenizer = sb.dataio.encoder.CTCTextEncoder()

    # 2. Define audio pipelines:
    @sb.utils.data_pipeline.takes("noisy_wav")
    @sb.utils.data_pipeline.provides("noisy_sig")
    def noisy_pipeline(wav):
        return sb.dataio.dataio.read_audio(wav)

    @sb.utils.data_pipeline.takes("clean_wav")
    @sb.utils.data_pipeline.provides("clean_sig")
    def clean_pipeline(wav):
        return sb.dataio.dataio.read_audio(wav)

    # 3. Define target pipeline:
    token_keys = ["tokens_bos", "tokens_eos", "tokens"]

    @sb.utils.data_pipeline.takes(hparams["target_type"])
    @sb.utils.data_pipeline.provides("tokens_list", *[t for t in token_keys])
    def target_pipeline(target):
        if hparams["target_type"] == "wrd":
            tokens_list = tokenizer.sp.encode_as_ids(target)
            yield tokens_list
        else:
            tokens_list = target.strip().split()
            yield tokens_list
            tokens_list = tokenizer.encode_sequence(tokens_list)
        tokens_bos = torch.LongTensor([hparams["bos_index"]] + tokens_list)
        yield tokens_bos
        tokens_eos = torch.LongTensor(tokens_list + [hparams["eos_index"]])
        yield tokens_eos
        tokens = torch.LongTensor(tokens_list)
        yield tokens

    # 4. Create datasets
    data = {}
    for dataset in ["train", "valid", "test"]:
        data[dataset] = sb.dataio.dataset.DynamicItemDataset.from_csv(
            csv_path=hparams[f"{dataset}_annotation"],
            replacements={"data_root", hparams["data_folder"]},
            dynamic_items=[noisy_pipeline, clean_pipeline, target_pipeline],
            output_keys=["id", "noisy_sig", "clean_sig"] + token_keys,
        )
        if dataset != "train":
            data[dataset] = data[dataset].filtered_sorted(sort_key="duration")

    # Sort train dataset and ensure it doesn't get un-sorted
    if hparams["sorting"] == "ascending" or hparams["sorting"] == "descending":
        data["train"] = data["train"].filtered_sorted(
            sort_key="duration",
            reverse=hparams["sorting"] == "descending",
        )
        hparams["train_loader_options"]["shuffle"] = False
    elif hparams["sorting"] != "random":
        raise NotImplementedError(
            "Sorting must be random, ascending, or descending")

    # 5. Load or update tokenizer
    if hparams["target_type"] == "wrd":
        save_model_path = os.path.join(hparams["save_folder"], "tok_uni.model")
        save_vocab_path = os.path.join(hparams["save_folder"], "tok_uni.vocab")

        if "tok_mdl_file" in hparams:
            download_file(
                source=hparams["tok_mdl_file"],
                dest=save_model_path,
                replace_existing=True,
            )
            tokenizer.sp.load(save_model_path)

        if "tok_voc_file" in hparams:
            download_file(
                source=hparams["tok_voc_file"],
                dest=save_vocab_path,
                replace_existing=True,
            )
            tokenizer.sp.load(save_model_path)

        if (tokenizer.sp.eos_id() + 1) == (tokenizer.sp.bos_id() + 1) == 0 and not (
            hparams["eos_index"]
            == hparams["bos_index"]
            == hparams["blank_index"]
            == hparams["unk_index"]
            == 0
        ):
            raise ValueError(
                "Desired indexes for special tokens do not agree "
                "with loaded tokenizer special tokens !"
            )

    else:
        tokenizer.update_from_didataset(data["train"],
                                        output_key="tokens_list")
        tokenizer.insert_bos_eos(
            bos_label="<eos-bos>",
            eos_label="<eos-bos>",
            bos_index=hparams["bos_index"],
        )

    return data, tokenizer
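
A hedged helper sketch (not in the recipe): converting predicted id sequences back to label sequences for either tokenizer type returned by dataio_prep; hyps is assumed to be a list of integer id lists produced by the decoder.

def tokens_to_labels(hyps, tokenizer, target_type):
    if target_type == "wrd":
        # SentencePiece wrapper: decode each id list into a word sequence
        return tokenizer(hyps, task="decode_from_list")
    # CTCTextEncoder: map each id sequence back to its character labels
    return [tokenizer.decode_ndim(seq) for seq in hyps]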
Example #7
def test_tokenizer():
    from speechbrain.tokenizers.SentencePiece import SentencePiece

    gt = [
        ["HELLO", "MORNING", "MORNING", "HELLO"],
        ["HELLO", "MORNING", "HELLO"],
    ]

    # Word-level input test
    dict_int2lab = {1: "HELLO", 2: "MORNING"}

    spm = SentencePiece(
        "tokenizer_data/",
        2000,
        csv_train="tests/unittests/tokenizer_data/dev-clean.csv",
        csv_read="wrd",
        model_type="bpe",
    )
    encoded_seq_ids, encoded_seq_pieces = spm(
        torch.Tensor([[1, 2, 2, 1], [1, 2, 1, 0]]),
        torch.Tensor([1.0, 0.75]),
        dict_int2lab,
        task="encode",
    )
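    # convert the relative lengths returned above into absolute token counts,
    # used below to strip the padding from the second sequence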
    lens = (encoded_seq_pieces * encoded_seq_ids.shape[1]).int()
    # decode from torch tensors (batch, batch_lens)
    words_seq = spm(encoded_seq_ids, encoded_seq_pieces, task="decode")
    assert words_seq == gt, "output not the same"
    # decode from a list of bpe sequence (without padding)
    hyps_list = [
        encoded_seq_ids[0].int().tolist(),
        encoded_seq_ids[1][: lens[1]].int().tolist(),
    ]
    words_seq = spm(hyps_list, task="decode_from_list")
    assert words_seq == gt, "output not the same"

    # Char-level input test
    dict_int2lab = {
        1: "H",
        2: "E",
        3: "L",
        4: "O",
        5: "M",
        6: "R",
        7: "N",
        8: "I",
        9: "G",
        10: "_",
    }

    spm = SentencePiece(
        "tokenizer_data/",
        2000,
        csv_train="tests/unittests/tokenizer_data/dev-clean.csv",
        csv_read="char",
        char_format_input=True,
        model_type="bpe",
    )
    encoded_seq_ids, encoded_seq_pieces = spm(
        torch.Tensor(
            [
                # "HELLO_MORNING_MORNING_HELLO" as character ids
                [1, 2, 3, 3, 4, 10, 5, 4, 6, 7, 8, 7, 9, 10,
                 5, 4, 6, 7, 8, 7, 9, 10, 1, 2, 3, 3, 4],
                # "HELLO_MORNING_HELLO" as character ids, zero-padded to length 27
                [1, 2, 3, 3, 4, 10, 5, 4, 6, 7, 8, 7, 9, 10,
                 1, 2, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0],
            ]
        ),
        torch.Tensor([1.0, 0.7037037037037037]),
        dict_int2lab,
        task="encode",
    )
    lens = (encoded_seq_pieces * encoded_seq_ids.shape[1]).int()
    # decode from torch tensors (batch, batch_lens)
    words_seq = spm(encoded_seq_ids, encoded_seq_pieces, task="decode")
    assert words_seq == gt, "output not the same"
    # decode from a list of bpe sequence (without padding)
    hyps_list = [
        encoded_seq_ids[0].int().tolist(),
        encoded_seq_ids[1][: lens[1]].int().tolist(),
    ]
    words_seq = spm(hyps_list, task="decode_from_list")
    assert words_seq == gt, "output not the same"
Example #8
def dataio_prep(hparams):
    """Creates the datasets and their data processing pipelines"""

    # 1. define tokenizer and load it
    tokenizer = SentencePiece(
        model_dir=hparams["save_folder"],
        vocab_size=hparams["output_neurons"],
        csv_train=hparams["train_annotation"],
        csv_read="wrd",
        model_type=hparams["token_type"],
        character_coverage=hparams["character_coverage"],
    )

    """Loads the sentence piece tokenizer specified in the yaml file"""
    save_model_path = os.path.join(hparams["save_folder"], "tok_unigram.model")
    save_vocab_path = os.path.join(hparams["save_folder"], "tok_unigram.vocab")

    if "tok_mdl_file" in hparams:
        download_file(
            source=hparams["tok_mdl_file"],
            dest=save_model_path,
            replace_existing=True,
        )
        tokenizer.sp.load(save_model_path)

    if "tok_voc_file" in hparams:
        download_file(
            source=hparams["tok_voc_file"],
            dest=save_vocab_path,
            replace_existing=True,
        )
        tokenizer.sp.load(save_model_path)

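    # sp.bos_id() / sp.eos_id() return -1 when the loaded model defines no
    # BOS/EOS pieces; in that case the recipe's special-token indexes are
    # expected to all be 0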
    if (tokenizer.sp.eos_id() + 1) == (tokenizer.sp.bos_id() + 1) == 0 and not (
        hparams["eos_index"]
        == hparams["bos_index"]
        == hparams["blank_index"]
        == hparams["unk_index"]
        == 0
    ):
        raise ValueError(
            "Desired indexes for special tokens do not agree "
            "with loaded tokenizer special tokens !"
        )

    # 2. Define audio pipeline:
    @sb.utils.data_pipeline.takes(hparams["input_type"])
    @sb.utils.data_pipeline.provides("sig")
    def audio_pipeline(wav):
        sig = sb.dataio.dataio.read_audio(wav)
        return sig

    # 3. Define text pipeline:
    @sb.utils.data_pipeline.takes("wrd")
    @sb.utils.data_pipeline.provides("tokens_bos", "tokens_eos", "tokens")
    def text_pipeline(wrd):
        tokens_list = tokenizer.sp.encode_as_ids(wrd)
        tokens_bos = torch.LongTensor([hparams["bos_index"]] + tokens_list)
        yield tokens_bos
        tokens_eos = torch.LongTensor(tokens_list + [hparams["eos_index"]])
        yield tokens_eos
        tokens = torch.LongTensor(tokens_list)
        yield tokens

    # 4. Create datasets
    data = {}
    for dataset in ["train", "valid", "test"]:
        data[dataset] = sb.dataio.dataset.DynamicItemDataset.from_csv(
            csv_path=hparams[f"{dataset}_annotation"],
            replacements={"data_root", hparams["data_folder"]},
            dynamic_items=[audio_pipeline, text_pipeline],
            output_keys=["id", "sig", "tokens_bos", "tokens_eos", "tokens"],
        )
        if dataset != "train":
            data[dataset] = data[dataset].filtered_sorted(sort_key="duration")

    # Sort train dataset and ensure it doesn't get un-sorted
    if hparams["sorting"] == "ascending" or hparams["sorting"] == "descending":
        data["train"] = data["train"].filtered_sorted(
            sort_key="duration", reverse=hparams["sorting"] == "descending",
        )
        hparams["dataloader_options"]["shuffle"] = False
    elif hparams["sorting"] != "random":
        raise NotImplementedError(
            "Sorting must be random, ascending, or descending"
        )

    return data, tokenizer
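
A hedged sketch (not part of the recipe) of how dataio_prep's return values are typically wired into training; the ASR Brain subclass, run_opts, and the epoch_counter key are assumed to be defined as in the other examples above.

data, tokenizer = dataio_prep(hparams)

asr_brain = ASR(
    modules=hparams["modules"],
    hparams=hparams,
    run_opts=run_opts,
    checkpointer=hparams["checkpointer"],
)
asr_brain.tokenizer = tokenizer  # keep the tokenizer around for decoding

asr_brain.fit(
    hparams["epoch_counter"],
    data["train"],
    data["valid"],
    train_loader_kwargs=hparams["dataloader_options"],
    valid_loader_kwargs=hparams["dataloader_options"],
)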