Example #1
def test_saveable_dataloader_multiprocess(tmpdir):
    # Same as the basic saveable dataloader test (Example #2), but with
    # multiprocess dataloading.
    import torch
    from speechbrain.dataio.dataloader import SaveableDataLoader

    save_file = tmpdir + "/dataloader.ckpt"
    dataset = torch.randn(10, 1)
    for num_parallel in [1, 2, 10, 12]:
        dataloader = SaveableDataLoader(dataset,
                                        num_workers=num_parallel,
                                        collate_fn=None)  # Note num_workers
        data_iterator = iter(dataloader)
        first_item = next(data_iterator)
        assert first_item == dataset[0]
        # Save here, note that this overwrites.
        dataloader._speechbrain_save(save_file)
        second_item = next(data_iterator)
        assert second_item == dataset[1]
        # Cleanup needed for MacOS (open file limit)
        del data_iterator
        del dataloader
        # Now make a new dataloader and recover:
        new_dataloader = SaveableDataLoader(dataset,
                                            num_workers=num_parallel,
                                            collate_fn=None)
        new_dataloader._speechbrain_load(save_file,
                                         end_of_epoch=False,
                                         device=None)
        new_data_iterator = iter(new_dataloader)
        second_second_item = next(new_data_iterator)
        assert second_second_item == second_item
        del new_data_iterator
        del new_dataloader
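
The private _speechbrain_save and _speechbrain_load hooks exercised above are normally driven by SpeechBrain's Checkpointer rather than called directly. Below is a minimal sketch (not from the test suite) of that public-API route; the checkpoint directory name is hypothetical.

# A minimal sketch (not from the source): the same save/restore cycle through
# the public Checkpointer API, which invokes the _speechbrain_save and
# _speechbrain_load hooks used in the test above. "checkpoints_dir" is a
# hypothetical path.
import torch
from speechbrain.dataio.dataloader import SaveableDataLoader
from speechbrain.utils.checkpoints import Checkpointer

dataset = torch.randn(10, 1)

# First run: iterate part of an epoch, then checkpoint mid-epoch.
dataloader = SaveableDataLoader(dataset, collate_fn=None)
checkpointer = Checkpointer(
    "checkpoints_dir", recoverables={"dataloader": dataloader}
)
iterator = iter(dataloader)
next(iterator)
# end_of_epoch=False marks this as a mid-epoch save, so the position is kept.
checkpointer.save_checkpoint(end_of_epoch=False)

# Later run: build a fresh dataloader and recover its position from disk.
new_dataloader = SaveableDataLoader(dataset, collate_fn=None)
new_checkpointer = Checkpointer(
    "checkpoints_dir", recoverables={"dataloader": new_dataloader}
)
new_checkpointer.recover_if_possible()

Because the save is marked as mid-epoch, recovery resumes iteration from the next item, mirroring the assertions in the tests.
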
Example #2
def test_saveable_dataloader(tmpdir):
    import torch
    from speechbrain.dataio.dataloader import SaveableDataLoader

    save_file = tmpdir + "/dataloader.ckpt"
    dataset = torch.randn(10, 1)
    dataloader = SaveableDataLoader(dataset, collate_fn=None)
    data_iterator = iter(dataloader)
    first_item = next(data_iterator)
    assert first_item == dataset[0]
    # Save here:
    dataloader._speechbrain_save(save_file)
    second_item = next(data_iterator)
    assert second_item == dataset[1]
    # Now make a new dataloader and recover:
    new_dataloader = SaveableDataLoader(dataset, collate_fn=None)
    new_dataloader._speechbrain_load(save_file,
                                     end_of_epoch=False,
                                     device=None)
    new_data_iterator = iter(new_dataloader)
    second_second_item = next(new_data_iterator)
    assert second_second_item == second_item
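
In training recipes, the dataset handed to SaveableDataLoader is usually a DynamicItemDataset collated with PaddedBatch, as in Examples #3 and #4 below. A minimal, self-contained sketch of that combination follows; the utterance ids and wav paths are made up.

# A minimal sketch (not from the source); ids and wav paths are hypothetical.
import speechbrain as sb
from speechbrain.dataio.batch import PaddedBatch
from speechbrain.dataio.dataloader import SaveableDataLoader

data = {
    "utt1": {"wav": "/path/to/utt1.wav"},
    "utt2": {"wav": "/path/to/utt2.wav"},
}
dataset = sb.dataio.dataset.DynamicItemDataset(data)

@sb.utils.data_pipeline.takes("wav")
@sb.utils.data_pipeline.provides("sig")
def audio_pipeline(wav):
    return sb.dataio.dataio.read_audio(wav)

sb.dataio.dataset.add_dynamic_item([dataset], audio_pipeline)
sb.dataio.dataset.set_output_keys([dataset], ["id", "sig"])

# PaddedBatch pads variable-length signals; the loader keeps the
# checkpointing behaviour shown in the tests above.
loader = SaveableDataLoader(dataset, batch_size=2, collate_fn=PaddedBatch)
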
Example #3
import os

import torch

import speechbrain as sb
from speechbrain.dataio.batch import PaddedBatch
from speechbrain.dataio.dataloader import SaveableDataLoader
from speechbrain.dataio.sampler import DynamicBatchSampler


def dataio_prep(hparams):
    """This function prepares the datasets to be used in the brain class.
    It also defines the data processing pipeline through user-defined functions."""
    data_folder = hparams["data_folder"]
    # 1. Declarations:
    train_data = sb.dataio.dataset.DynamicItemDataset.from_json(
        json_path=hparams["train_annotation"],
        replacements={"data_root": data_folder},
    )
    if hparams["sorting"] == "ascending":
        # we sort training data to speed up training and get better results.
        train_data = train_data.filtered_sorted(sort_key="duration")
        # when sorting, do not shuffle in the dataloader; otherwise sorting is pointless
        hparams["train_dataloader_opts"]["shuffle"] = False

    elif hparams["sorting"] == "descending":
        train_data = train_data.filtered_sorted(sort_key="duration",
                                                reverse=True)
        # when sorting, do not shuffle in the dataloader; otherwise sorting is pointless
        hparams["train_dataloader_opts"]["shuffle"] = False

    elif hparams["sorting"] == "random":
        pass

    else:
        raise NotImplementedError(
            "sorting must be random, ascending or descending")

    valid_data = sb.dataio.dataset.DynamicItemDataset.from_json(
        json_path=hparams["valid_annotation"],
        replacements={"data_root": data_folder},
    )
    valid_data = valid_data.filtered_sorted(sort_key="duration")

    test_data = sb.dataio.dataset.DynamicItemDataset.from_json(
        json_path=hparams["test_annotation"],
        replacements={"data_root": data_folder},
    )
    test_data = test_data.filtered_sorted(sort_key="duration")

    datasets = [train_data, valid_data, test_data]
    label_encoder = sb.dataio.encoder.CTCTextEncoder()

    # 2. Define audio pipeline:
    @sb.utils.data_pipeline.takes("wav")
    @sb.utils.data_pipeline.provides("sig")
    def audio_pipeline(wav):
        sig = sb.dataio.dataio.read_audio(wav)
        return sig

    sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline)

    # 3. Define text pipeline:
    @sb.utils.data_pipeline.takes("phn")
    @sb.utils.data_pipeline.provides(
        "phn_list",
        "phn_encoded_list",
        "phn_encoded",
        "phn_encoded_eos",
        "phn_encoded_bos",
    )
    def text_pipeline(phn):
        phn_list = phn.strip().split()
        yield phn_list
        phn_encoded_list = label_encoder.encode_sequence(phn_list)
        yield phn_encoded_list
        phn_encoded = torch.LongTensor(phn_encoded_list)
        yield phn_encoded
        phn_encoded_eos = torch.LongTensor(
            label_encoder.append_eos_index(phn_encoded_list))
        yield phn_encoded_eos
        phn_encoded_bos = torch.LongTensor(
            label_encoder.prepend_bos_index(phn_encoded_list))
        yield phn_encoded_bos

    sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline)

    # 4. Fit the label encoder:
    # Load or compute the label encoder
    lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt")
    special_labels = {
        "bos_label": hparams["bos_index"],
        "eos_label": hparams["eos_index"],
        "blank_label": hparams["blank_index"],
    }
    label_encoder.load_or_create(
        path=lab_enc_file,
        from_didatasets=[train_data],
        output_key="phn_list",
        special_labels=special_labels,
        sequence_input=True,
    )

    # 5. Set output keys:
    sb.dataio.dataset.set_output_keys(
        datasets,
        ["id", "sig", "phn_encoded", "phn_encoded_eos", "phn_encoded_bos"],
    )

    # Support for dynamic batching
    if hparams["dynamic_batching"]:
        dynamic_hparams = hparams["dynamic_batch_sampler"]
        hop_size = dynamic_hparams["feats_hop_size"]

        batch_sampler = DynamicBatchSampler(
            train_data,
            dynamic_hparams["max_batch_len"],
            num_buckets=dynamic_hparams["num_buckets"],
            length_func=lambda x: x["duration"] * (1 / hop_size),
            shuffle=dynamic_hparams["shuffle_ex"],
            batch_ordering=dynamic_hparams["batch_ordering"],
        )

        train_data = SaveableDataLoader(train_data,
                                        batch_sampler=batch_sampler,
                                        collate_fn=PaddedBatch)

    return train_data, valid_data, test_data, label_encoder
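
For reference, the dynamic-batching branch above reads a handful of hyperparameter keys. The sketch below lists them with made-up values; in a real recipe they come from the hyperparameter YAML file.

# Hypothetical values illustrating the keys read by the dynamic-batching branch.
hparams_sketch = {
    "dynamic_batching": True,
    "dynamic_batch_sampler": {
        "feats_hop_size": 0.01,      # seconds of signal per feature frame
        "max_batch_len": 600,        # length budget per batch, in frames
        "num_buckets": 60,
        "shuffle_ex": True,
        "batch_ordering": "random",
    },
}
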
Example #4
import os

import speechbrain as sb
from speechbrain.dataio.batch import PaddedBatch
from speechbrain.dataio.dataloader import SaveableDataLoader
from speechbrain.dataio.sampler import DynamicBatchSampler


def dataio_prep(hparams):
    """Creates the datasets and their data processing pipelines"""
    # 1. Get label encoder
    label_encoder = sb.dataio.encoder.CTCTextEncoder()

    # 2. Define audio pipelines:
    @sb.utils.data_pipeline.takes("noisy_wav")
    @sb.utils.data_pipeline.provides("noisy_sig")
    def noisy_pipeline(wav):
        return sb.dataio.dataio.read_audio(wav)

    @sb.utils.data_pipeline.takes("clean_wav")
    @sb.utils.data_pipeline.provides("clean_sig")
    def clean_pipeline(wav):
        return sb.dataio.dataio.read_audio(wav)

    # 3. Define target pipeline:
    # @sb.utils.data_pipeline.takes("phn")
    @sb.utils.data_pipeline.takes("phones")
    @sb.utils.data_pipeline.provides("phn_list", "phn_encoded")
    def target_pipeline(target):
        phn_list = target.strip().split()
        yield phn_list
        phn_encoded = label_encoder.encode_sequence_torch(phn_list)
        yield phn_encoded

    # 4. Create datasets
    data = {}
    for dataset in ["train", "valid", "test"]:
        data[dataset] = sb.dataio.dataset.DynamicItemDataset.from_json(
            json_path=hparams[f"{dataset}_annotation"],
            replacements={"data_root": hparams["data_folder"]},
            dynamic_items=[noisy_pipeline, clean_pipeline, target_pipeline],
            output_keys=["id", "noisy_sig", "clean_sig", "phn_encoded"],
        )
        if dataset != "train":
            data[dataset] = data[dataset].filtered_sorted(sort_key="length")

    # Sort train dataset and ensure it doesn't get un-sorted
    if hparams["sorting"] == "ascending" or hparams["sorting"] == "descending":
        data["train"] = data["train"].filtered_sorted(
            sort_key="length",
            reverse=hparams["sorting"] == "descending",
        )
        hparams["train_loader_options"]["shuffle"] = False
    elif hparams["sorting"] != "random":
        raise NotImplementedError(
            "Sorting must be random, ascending, or descending")

    lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt")
    label_encoder.load_or_create(
        path=lab_enc_file,
        from_didatasets=[data["train"]],
        output_key="phn_list",
        special_labels={"blank_label": hparams["blank_index"]},
        sequence_input=True,
    )

    if hparams["dynamic_batching"]:
        dynamic_hparams = hparams["dynamic_batch_sampler"]
        hop_size = dynamic_hparams["feats_hop_size"]
        for dataset in ["train", "valid", "test"]:

            batch_sampler = DynamicBatchSampler(
                data[dataset],
                dynamic_hparams["max_batch_len"],
                dynamic_hparams["left_bucket_len"],
                bucket_length_multiplier=dynamic_hparams["multiplier"],
                length_func=lambda x: x["length"] * (1 / hop_size),
                shuffle=dynamic_hparams["shuffle_ex"],
                # batch_ordering=dynamic_hparams["batch_ordering"],
            )

            data[dataset] = SaveableDataLoader(
                data[dataset],
                batch_sampler=batch_sampler,
                collate_fn=PaddedBatch,
            )

    return data, label_encoder
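
A minimal sketch (not part of the recipe shown) of how a training script would typically call this dataio_prep: parse the command line, load the hyperparameters with hyperpyyaml, and build the datasets before constructing the Brain class.

# Typical call site for dataio_prep, invoked as e.g.
# "python train.py hparams.yaml" (file name hypothetical).
import sys

import speechbrain as sb
from hyperpyyaml import load_hyperpyyaml

hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
with open(hparams_file) as fin:
    hparams = load_hyperpyyaml(fin, overrides)

data, label_encoder = dataio_prep(hparams)
train_set = data["train"]  # a SaveableDataLoader when dynamic_batching is True
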