Example #1
def parse_libri_folder(libri_folders):
    # Parsing LibriSpeech
    # step 1: collect all utterance (.flac) and transcription (trans.txt) files
    utterances = []
    txt_files = []
    for libri_dir in libri_folders:
        utterances.extend(get_all_files(libri_dir, match_and=[".flac"]))
        txt_files.extend(get_all_files(libri_dir, match_and=["trans.txt"]))
    # step 2: build a hashtable mapping each utterance id to its words
    words_dict = {}
    for trans in txt_files:
        with open(trans, "r") as f:
            for line in f:
                splitted = line.split(" ")
                utt_id = splitted[0]
                words = " ".join(splitted[1:])
                words_dict[utt_id] = words.strip("\n")

    # step 3: build a hashtable mapping each speaker id to its utterances
    speakers = {}
    for u in utterances:
        spk_id = Path(u).parent.parent.stem
        if spk_id not in speakers:
            speakers[spk_id] = [u]
        else:
            speakers[spk_id].append(u)

    return speakers, words_dict
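
All of these examples revolve around `get_all_files` from `speechbrain.utils.data_utils`. As a point of reference for how the `match_and`, `match_or`, and `exclude_or` filters are used below, here is a minimal, simplified sketch of such a matcher; it is an illustration only, and the real implementation may differ (it may also support further filters such as `exclude_and`):

import os


def get_all_files_sketch(dirname, match_and=None, match_or=None, exclude_or=None):
    """Simplified stand-in for get_all_files (illustration only).

    Recursively walks `dirname` and keeps a path when every string in
    `match_and` occurs in it, at least one string in `match_or` occurs in it
    (if given), and no string in `exclude_or` occurs in it.
    """
    results = []
    for root, _dirs, files in os.walk(dirname):
        for name in files:
            path = os.path.join(root, name)
            if match_and and not all(s in path for s in match_and):
                continue
            if match_or and not any(s in path for s in match_or):
                continue
            if exclude_or and any(s in path for s in exclude_or):
                continue
            results.append(path)
    return results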
Example #2
def prepare_mini_librispeech(
    data_folder, save_json_train, save_json_valid, save_json_test
):
    """
    Prepares the json files for the Mini Librispeech dataset.

    Downloads the dataset if it is not found in the `data_folder`.

    Arguments
    ---------
    data_folder : str
        Path to the folder where the Mini Librispeech dataset is stored.
    save_json_train : str
        Path where the train data specification file will be saved.
    save_json_valid : str
        Path where the validation data specification file will be saved.
    save_json_test : str
        Path where the test data specification file will be saved.

    Example
    -------
    >>> data_folder = '/path/to/mini_librispeech'
    >>> prepare_mini_librispeech(data_folder, 'train.json', 'valid.json', 'test.json')
    """

    # Check if this phase is already done (if so, skip it)
    if skip(save_json_train, save_json_valid, save_json_test):
        logger.info("Preparation completed in previous run, skipping.")
        return

    # If the dataset doesn't exist yet, download it
    train_folder = os.path.join(data_folder, "LibriSpeech", "train-clean-5")
    valid_folder = os.path.join(data_folder, "LibriSpeech", "dev-clean-2")
    test_folder = os.path.join(data_folder, "LibriSpeech", "test-clean")
    if not check_folders(train_folder, valid_folder, test_folder):
        download_mini_librispeech(data_folder)

    # List files and create manifest from list
    logger.info(
        f"Creating {save_json_train}, {save_json_valid}, and {save_json_test}"
    )
    extension = [".flac"]

    # List of flac audio files
    wav_list_train = get_all_files(train_folder, match_and=extension)
    wav_list_valid = get_all_files(valid_folder, match_and=extension)
    wav_list_test = get_all_files(test_folder, match_and=extension)

    # List of transcription file
    extension = [".trans.txt"]
    trans_list = get_all_files(data_folder, match_and=extension)
    trans_dict = get_transcription(trans_list)

    # Create the json files
    create_json(wav_list_train, trans_dict, save_json_train)
    create_json(wav_list_valid, trans_dict, save_json_valid)
    create_json(wav_list_test, trans_dict, save_json_test)
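
The `skip` and `check_folders` helpers are defined elsewhere in each recipe and are not shown in these snippets. Assuming they only test whether the given manifest files and dataset folders already exist (which is how this example uses them), they could be sketched as follows:

import os


def skip(*filenames):
    # Sketch: preparation can be skipped when every manifest file already exists.
    return all(os.path.isfile(f) for f in filenames)


def check_folders(*folders):
    # Sketch: True only when every expected path exists.
    return all(os.path.exists(f) for f in folders)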
def resample_folder(input_folder, output_folder, fs, regex):
    """Resamples every audio file matching `regex` in `input_folder` to `fs` Hz,
    mirroring the directory structure under `output_folder`."""
    files = get_all_files(input_folder, match_and=[regex])
    torchaudio.initialize_sox()
    for f in tqdm.tqdm(files):

        # we use sox because torchaudio.Resample uses too much RAM.
        resample = torchaudio.sox_effects.SoxEffectsChain()
        resample.append_effect_to_chain("rate", [fs])
        resample.set_input_file(f)

        audio, fs = resample.sox_build_flow_effects()

        # Peak-normalize; otherwise the saved .wav file can end up silent.
        audio = audio / torch.max(torch.abs(audio), dim=-1, keepdim=True)[0]
        os.makedirs(
            Path(
                os.path.join(output_folder,
                             Path(f).relative_to(Path(input_folder)))).parent,
            exist_ok=True,
        )
        torchaudio.save(
            os.path.join(output_folder,
                         Path(f).relative_to(Path(input_folder))),
            audio,
            fs,
        )
    torchaudio.shutdown_sox()
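
Note that `torchaudio.initialize_sox` and `SoxEffectsChain` were removed in newer torchaudio releases. If your torchaudio version provides `torchaudio.sox_effects.apply_effects_file` instead, an equivalent function could be sketched as follows (`get_all_files` is assumed to be imported as in the other examples):

import os
from pathlib import Path

import torch
import torchaudio
import tqdm


def resample_folder_sox_effects(input_folder, output_folder, fs, regex):
    # Sketch: same behaviour as resample_folder above, using apply_effects_file.
    files = get_all_files(input_folder, match_and=[regex])
    for f in tqdm.tqdm(files):
        audio, out_fs = torchaudio.sox_effects.apply_effects_file(
            f, effects=[["rate", str(fs)]]
        )
        # Peak-normalize, as in the original, to avoid silent output files.
        audio = audio / torch.max(torch.abs(audio), dim=-1, keepdim=True)[0]
        out_path = Path(output_folder) / Path(f).relative_to(input_folder)
        os.makedirs(out_path.parent, exist_ok=True)
        torchaudio.save(str(out_path), audio, out_fs)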
def prepare_mini_librispeech(
    data_folder,
    save_json_train,
    save_json_valid,
    save_json_test,
    split_ratio=[80, 10, 10],
):
    """
    Prepares the json files for the Mini Librispeech dataset.

    Downloads the dataset if it is not found in the `data_folder`.

    Arguments
    ---------
    data_folder : str
        Path to the folder where the Mini Librispeech dataset is stored.
    save_json_train : str
        Path where the train data specification file will be saved.
    save_json_valid : str
        Path where the validation data specification file will be saved.
    save_json_test : str
        Path where the test data specification file will be saved.
    split_ratio : list
        List composed of three integers that sets the split ratios for train,
        valid, and test sets, respectively. For instance split_ratio=[80, 10, 10]
        will assign 80% of the sentences to training, 10% to validation, and 10%
        to test.

    Example
    -------
    >>> data_folder = '/path/to/mini_librispeech'
    >>> prepare_mini_librispeech(data_folder, 'train.json', 'valid.json', 'test.json')
    """

    # Check if this phase is already done (if so, skip it)
    if skip(save_json_train, save_json_valid, save_json_test):
        logger.info("Preparation completed in previous run, skipping.")
        return

    # If the dataset doesn't exist yet, download it
    train_folder = os.path.join(data_folder, "LibriSpeech", "train-clean-5")
    if not check_folders(train_folder):
        download_mini_librispeech(data_folder)

    # List files and create manifest from list
    logger.info(
        f"Creating {save_json_train}, {save_json_valid}, and {save_json_test}"
    )
    extension = [".flac"]
    wav_list = get_all_files(train_folder, match_and=extension)

    # Random split the signal list into train, valid, and test sets.
    data_split = split_sets(wav_list, split_ratio)

    # Creating json files
    create_json(data_split["train"], save_json_train)
    create_json(data_split["valid"], save_json_valid)
    create_json(data_split["test"], save_json_test)
Example #5
def get_transcription_files_by_dataset(dataset: str,
                                       transcription_folder: str) -> List[str]:
    """return paths of transcriptions from the given data set and the path of all of transcriptions"""
    train_set = get_data_list(f"splits/{dataset}")
    transcription_train_set = list(
        map(lambda path: path.split(".")[0].strip(), train_set))
    transcription_train_set = list(
        map(lambda path: f"{path}.tdf", transcription_train_set))

    transcription_files = get_all_files(transcription_folder,
                                        match_or=transcription_train_set)

    return transcription_files
def prepare_commonlanguage(folder, csv_file, max_noise_len=None):
    """Prepare the CommonLanguage dataset for VAD training.

    Arguments
    ---------
    folder : str
        The location of the folder containing the dataset.
    csv_file : str
        Filename for storing the prepared csv file.
    max_noise_len : float
        The maximum noise length in seconds. Noises longer
        than this will be cut into pieces.
    """
    logger.info("CommonLanguage Preparation...")
    wav_lst = get_all_files(os.path.join(folder), match_and=[".wav"])
    if not os.path.isfile(csv_file):
        logger.info(csv_file + " creation...")
        _prepare_csv(folder, wav_lst, csv_file, max_noise_len)
def create_sets(data_folder, extension):
    """
    Creates lists for train, dev and test sets with data from the data_folder

    Arguments
    ---------
    data_folder : str
        Path of the CommonLanguage dataset.
    extension : list
        List of strings with file extensions that correspond to the audio files
        in the CommonLanguage dataset.

    Returns
    -------
    dictionary containing train, dev, and test splits.
    """

    # Datasets initialization
    datasets = {"train", "dev", "test"}
    data_split = {dataset: [] for dataset in datasets}

    # Get the list of languages from the dataset folder
    languages = [
        name for name in os.listdir(data_folder)
        if os.path.isdir(os.path.join(data_folder, name))
        and datasets.issubset(os.listdir(os.path.join(data_folder, name)))
    ]

    msg = f"{len(languages)} languages detected!"
    logger.info(msg)

    # Fill the train, dev and test datasets with audio filenames
    for language in languages:
        for dataset in datasets:
            curr_folder = os.path.join(data_folder, language, dataset)
            wav_list = get_all_files(curr_folder, match_and=extension)
            data_split[dataset].extend(wav_list)

    msg = "Data successfully split!"
    logger.info(msg)

    return data_split
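
A hypothetical call (the dataset path below is only a placeholder) could look like this:

# Hypothetical usage of create_sets: collect the CommonLanguage audio per split.
data_split = create_sets("/path/to/CommonLanguage", [".wav"])
print({split: len(files) for split, files in data_split.items()})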
Example #8
def prepare_musan(folder, music_csv, noise_csv, speech_csv, max_noise_len=None):
    """Prepare the musan dataset (music, noise, speech).

    Arguments
    ---------
    folder : str
        The location of the folder containing the dataset.
    music_csv : str
        Filename for storing the prepared music csv.
    noise_csv : str
        Filename for storing the prepared noise csv.
    speech_csv : str
        Filename for storing the prepared speech csv.
    max_noise_len : float
        The maximum noise length in seconds. Noises longer
        than this will be cut into pieces.
    """

    sub_folders = ["music", "noise", "speech"]
    csv_files = [music_csv, noise_csv, speech_csv]
    logger.info("Musan Data Preparation...")
    for sub_folder, csv_file in zip(sub_folders, csv_files):
        wav_lst = get_all_files(
            os.path.join(folder, sub_folder), match_and=[".wav"]
        )
        if not os.path.isfile(csv_file):
            logger.info(csv_file + " creation...")
            _prepare_csv(folder, wav_lst, csv_file, max_noise_len)
Example #9
def download_vctk(destination, tmp_dir=None, device="cpu"):
    """Download dataset and perform resample to 16000 Hz.

    Arguments
    ---------
    destination : str
        Place to put final zipped dataset.
    tmp_dir : str
        Location to store temporary files. Will use `tempfile` if not provided.
    device : str
        Passed directly to pytorch's ``.to()`` method. Used for resampling.
    """
    dataset_name = "noisy-vctk-16k"
    if tmp_dir is None:
        tmp_dir = tempfile.gettempdir()
    final_dir = os.path.join(tmp_dir, dataset_name)

    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir)

    if not os.path.isdir(final_dir):
        os.mkdir(final_dir)

    prefix = "https://datashare.is.ed.ac.uk/bitstream/handle/10283/2791/"
    noisy_vctk_urls = [
        prefix + "clean_testset_wav.zip",
        prefix + "noisy_testset_wav.zip",
        prefix + "testset_txt.zip",
        prefix + "clean_trainset_28spk_wav.zip",
        prefix + "noisy_trainset_28spk_wav.zip",
        prefix + "trainset_28spk_txt.zip",
    ]

    zip_files = []
    for url in noisy_vctk_urls:
        filename = os.path.join(tmp_dir, url.split("/")[-1])
        zip_files.append(filename)
        if not os.path.isfile(filename):
            logger.info("Downloading " + url)
            with urllib.request.urlopen(url) as response:
                with open(filename, "wb") as tmp_file:
                    logger.info("... to " + tmp_file.name)
                    shutil.copyfileobj(response, tmp_file)

    # Unzip
    for zip_file in zip_files:
        logger.info("Unzipping " + zip_file)
        shutil.unpack_archive(zip_file, tmp_dir, "zip")
        os.remove(zip_file)

    # Move transcripts to final dir
    shutil.move(os.path.join(tmp_dir, "testset_txt"), final_dir)
    shutil.move(os.path.join(tmp_dir, "trainset_28spk_txt"), final_dir)

    # Downsample
    dirs = [
        "noisy_testset_wav",
        "clean_testset_wav",
        "noisy_trainset_28spk_wav",
        "clean_trainset_28spk_wav",
    ]

    downsampler = Resample(orig_freq=48000, new_freq=16000)

    for directory in dirs:
        logger.info("Resampling " + directory)
        dirname = os.path.join(tmp_dir, directory)

        # Make directory to store downsampled files
        dirname_16k = os.path.join(final_dir, directory + "_16k")
        if not os.path.isdir(dirname_16k):
            os.mkdir(dirname_16k)

        # Load files and downsample
        for filename in get_all_files(dirname, match_and=[".wav"]):
            signal, rate = torchaudio.load(filename)
            downsampled_signal = downsampler(signal.view(1, -1).to(device))

            # Save downsampled file
            torchaudio.save(
                os.path.join(dirname_16k, filename[-12:]),
                downsampled_signal[0].cpu(),
                sample_rate=16000,
                channels_first=False,
            )

            # Remove old file
            os.remove(filename)

        # Remove old directory
        os.rmdir(dirname)

    logger.info("Zipping " + final_dir)
    final_zip = shutil.make_archive(
        base_name=final_dir,
        format="zip",
        root_dir=os.path.dirname(final_dir),
        base_dir=os.path.basename(final_dir),
    )

    logger.info(f"Moving {final_zip} to {destination}")
    shutil.move(final_zip, os.path.join(destination, dataset_name + ".zip"))
Example #10
def prepare_voicebank(data_folder,
                      save_folder,
                      valid_speaker_count=2,
                      skip_prep=False):
    """
    Prepares the json files for the Voicebank dataset.

    Expects the data folder to be the same format as the output of
    ``download_vctk()`` below.

    Arguments
    ---------
    data_folder : str
        Path to the folder where the original Voicebank dataset is stored.
    save_folder : str
        The directory where to store the json files.
    valid_speaker_count : int
        The number of validation speakers to use (out of 28 in train set).
    skip_prep: bool
        If True, skip data preparation.

    Example
    -------
    >>> data_folder = '/path/to/datasets/Voicebank'
    >>> save_folder = 'exp/Voicebank_exp'
    >>> prepare_voicebank(data_folder, save_folder)
    """

    if skip_prep:
        return

    # Setting output files
    save_json_train = os.path.join(save_folder, TRAIN_JSON)
    save_json_valid = os.path.join(save_folder, VALID_JSON)
    save_json_test = os.path.join(save_folder, TEST_JSON)

    # Check if this phase is already done (if so, skip it)
    if skip(save_json_train, save_json_test, save_json_valid):
        logger.info("Preparation completed in previous run, skipping.")
        return

    train_clean_folder = os.path.join(data_folder,
                                      "clean_trainset_28spk_wav_16k")
    train_noisy_folder = os.path.join(data_folder,
                                      "noisy_trainset_28spk_wav_16k")
    train_txts = os.path.join(data_folder, "trainset_28spk_txt")
    test_clean_folder = os.path.join(data_folder, "clean_testset_wav_16k")
    test_noisy_folder = os.path.join(data_folder, "noisy_testset_wav_16k")
    test_txts = os.path.join(data_folder, "testset_txt")

    # Setting the save folder
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    # Additional checks to make sure the data folder contains Voicebank
    check_voicebank_folders(
        train_clean_folder,
        train_noisy_folder,
        train_txts,
        test_clean_folder,
        test_noisy_folder,
        test_txts,
    )

    logger.debug("Creating lexicon...")
    lexicon = create_lexicon(os.path.join(data_folder, "lexicon.txt"))
    logger.info("Creating json files for noisy VoiceBank...")

    logger.debug("Collecting files...")
    extension = [".wav"]
    valid_speakers = TRAIN_SPEAKERS[:valid_speaker_count]
    wav_lst_train = get_all_files(
        train_noisy_folder,
        match_and=extension,
        exclude_or=valid_speakers,
    )
    wav_lst_valid = get_all_files(
        train_noisy_folder,
        match_and=extension,
        match_or=valid_speakers,
    )
    wav_lst_test = get_all_files(test_noisy_folder, match_and=extension)

    logger.debug("Creating json files for noisy VoiceBank...")
    create_json(wav_lst_train, save_json_train, train_clean_folder, train_txts,
                lexicon)
    create_json(wav_lst_valid, save_json_valid, train_clean_folder, train_txts,
                lexicon)
    create_json(wav_lst_test, save_json_test, test_clean_folder, test_txts,
                lexicon)
Example #11
def prepare_timit(
    data_folder,
    save_json_train,
    save_json_valid,
    save_json_test,
    phn_set=39,
    uppercase=False,
    skip_prep=False,
):
    """
    Prepares the json files for the TIMIT dataset.

    Arguments
    ---------
    data_folder : str
        Path to the folder where the original TIMIT dataset is stored.
    save_json_train : str
        The path where to store the training json file.
    save_json_valid : str
        The path where to store the valid json file.
    save_json_test : str
        The path where to store the test json file.
    phn_set : {60, 48, 39}, optional,
        Default: 39
        The phoneme set to use in the phn label.
        It could be composed of 60, 48, or 39 phonemes.
    uppercase : bool, optional
        Default: False
        This option must be True when the TIMIT dataset
        is in the upper-case version.
    skip_prep: bool
        Default: False
        If True, the data preparation is skipped.

    Example
    -------
    >>> from recipes.TIMIT.timit_prepare import prepare_timit
    >>> data_folder = 'datasets/TIMIT'
    >>> prepare_timit(data_folder, 'train.json', 'valid.json', 'test.json')
    """

    # Skip if needed
    if skip_prep:
        return

    # Getting speaker dictionary
    dev_spk, test_spk = _get_speaker()
    avoid_sentences = ["sa1", "sa2"]
    extension = [".wav"]

    # Checking TIMIT_uppercase
    if uppercase:
        avoid_sentences = [item.upper() for item in avoid_sentences]
        extension = [item.upper() for item in extension]
        dev_spk = [item.upper() for item in dev_spk]
        test_spk = [item.upper() for item in test_spk]

    # Check if this phase is already done (if so, skip it)
    if skip(save_json_train, save_json_valid, save_json_test):
        logger.info("Skipping preparation, completed in previous run.")
        return

    # Additional checks to make sure the data folder contains TIMIT
    _check_timit_folders(uppercase, data_folder)

    msg = "Creating json files for the TIMIT Dataset.."
    logger.info(msg)

    # Creating json files
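    # Note: the valid split is drawn from the TEST portion of TIMIT, restricted
    # to the dev speakers via match_or (the standard TIMIT dev/test speaker split).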
    splits = ["train", "test", "test"]
    annotations = [save_json_train, save_json_valid, save_json_test]
    match_or = [None, dev_spk, test_spk]

    for split, save_file, match in zip(splits, annotations, match_or):
        if uppercase:
            match_lst = extension + [split.upper()]
        else:
            match_lst = extension + [split]

        # List of the wav files
        wav_lst = get_all_files(
            data_folder,
            match_and=match_lst,
            match_or=match,
            exclude_or=avoid_sentences,
        )
        if split == "dev":
            print(wav_lst)

        # Json creation
        create_json(wav_lst, save_file, uppercase, phn_set)
Example #12
def prepare_dns(
    data_folder,
    save_folder,
    seg_size=10.0,
    valid_folder=None,
    valid_ratio=0.002,
    valid_snr_low=0,
    valid_snr_high=40,
    skip_prep=False,
):
    """
    Prepares the csv files for the DNS challenge dataset.

    Arguments
    ---------
    data_folder : str
        Path to the folder where the original DNS dataset is stored.
    save_folder : str
        The directory where to store the csv files.
    seg_size : float
        Split each file into multiple fixed-length segments of this size (in seconds).
    valid_ratio : float
        Use this fraction of the training data as a validation set.
    valid_folder : str
        Location for storing mixed validation samples.
    valid_snr_low : float
        Lowest SNR to use when mixing the validation set.
    valid_snr_high : float
        Highest SNR to use when mixing the validation set.
    skip_prep : bool
        If True, skip data preparation.

    Example
    -------
    >>> # This example requires the actual DNS dataset:
    >>> data_folder = 'datasets/DNS-Challenge'
    >>> save_folder = 'DNS_prepared'
    >>> prepare_dns(data_folder, save_folder)
    """
    if skip_prep:
        return

    if valid_ratio > 0 and valid_folder is None:
        raise ValueError("Must provide folder for storing validation data")

    # Additional checks to make sure the data folder contains DNS
    _check_DNS_folders(data_folder)

    train_folder = os.path.join(data_folder, "datasets")
    test_folder = os.path.join(data_folder, "datasets", "test_set",
                               "synthetic", "no_reverb")

    # Setting file extension.
    extension = [".wav"]

    # Setting the save folder
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    # Check if this phase is already done (if so, skip it)
    if skip(save_folder):
        logger.info("Preparation completed in previous run.")
        return

    logger.info("Creating csv files for the DNS Dataset...")

    # Setting output files
    save_csv_noise = os.path.join(save_folder, NOISE_CSV)
    save_csv_clean = os.path.join(save_folder, CLEAN_CSV)
    save_csv_valid = os.path.join(save_folder, VALID_CSV)
    save_csv_test = os.path.join(save_folder, TEST_CSV)

    # Get the list of files
    wav_lst_noise = get_all_files(os.path.join(train_folder, "noise"),
                                  match_and=extension)
    wav_lst_clean = get_all_files(os.path.join(train_folder, "clean"),
                                  match_and=extension)

    # Clean is excluded here, but will be picked up by `create_csv`
    wav_lst_test = get_all_files(
        test_folder,
        match_and=extension,
        exclude_or=["/clean/"],
    )

    # Split training into validation and training
    if valid_ratio > 0:

        # Sort to ensure same validation set for each run.
        wav_lst_noise.sort()
        wav_lst_clean.sort()

        # Split
        valid_count = int(valid_ratio * len(wav_lst_clean))
        valid_lst_noise = wav_lst_noise[:valid_count]
        valid_lst_clean = wav_lst_clean[:valid_count]
        wav_lst_noise = wav_lst_noise[valid_count:]
        wav_lst_clean = wav_lst_clean[valid_count:]

        # Create noise csv to use when adding noise to validation samples.
        save_valid_noise = os.path.join(save_folder, "valid_noise.csv")
        create_csv(save_valid_noise, valid_lst_noise)
        create_csv(
            save_csv_valid,
            valid_lst_clean,
            seg_size=seg_size,
            noise_csv=save_valid_noise,
            noisy_folder=valid_folder,
            noise_snr_low=valid_snr_low,
            noise_snr_high=valid_snr_high,
        )

    # Test set has target in parallel "clean" directory
    create_csv(save_csv_test, wav_lst_test, has_target=True)

    # Create tr_clean.csv and tr_noise.csv for dynamic mixing the training data
    create_csv(save_csv_noise, wav_lst_noise)
    create_csv(save_csv_clean, wav_lst_clean, seg_size=seg_size)
Example #13
def prepare_data(
    data_folder,
    save_json_train,
    save_json_valid,
    save_json_test,
    split_ratio=[80, 10, 10],
    different_speakers=False,
    seed=12,
):
    """
    Prepares the json files for the IEMOCAP dataset.

    We here use only the audio part of the dataset. The assumption is
    that the data folder is structured as:

    <session_id>/<emotion>/<file_name>.wav

    e.g.
    session1/ang/psno1_ang_s084_orgn.wav

    Please, process the original IEMOCAP folder to match the expected
    folder structure.


    Arguments
    ---------
    data_folder : str
        Path to the folder where the transformed IEMOCAP dataset is stored.
    save_json_train : str
        Path where the train data specification file will be saved.
    save_json_valid : str
        Path where the validation data specification file will be saved.
    save_json_test : str
        Path where the test data specification file will be saved.
    split_ratio : list
        List composed of three integers that sets the split ratios for train,
        valid, and test sets, respectively.
        For instance split_ratio=[80, 10, 10] will assign 80% of the sentences
        to training, 10% for validation, and 10% for test.
    different_speakers : bool
        If True, the splits are made speaker-disjoint (no speaker appears in
        more than one set).
    seed : int
        Seed for reproducibility.

    Example
    -------
    >>> data_folder = '/path/to/iemocap'
    >>> prepare_data(data_folder, 'train.json', 'valid.json', 'test.json')
    """

    # setting seeds for reproducible code.
    random.seed(seed)

    # Check if this phase is already done (if so, skip it)
    if skip(save_json_train, save_json_valid, save_json_test):
        logger.info("Preparation completed in previous run, skipping.")
        return

    if not check_folders(
        os.path.join(data_folder, "session1", "ang", "psno1_ang_s084_orgn.wav")
    ):
        logger.info(
            "The data folder is not in the expected format. Expected "
            "<session_id>/<emo_id>/<file_name>.wav "
            "(e.g., session1/ang/psno1_ang_s084_orgn.wav)"
        )

    # List files and create manifest from list
    logger.info(
        f"Creating {save_json_train}, {save_json_valid}, and {save_json_test}")
    extension = [".wav"]

    # Randomly split the signal list into train, valid, and test sets.
    wav_list = get_all_files(data_folder, match_and=extension)
    if different_speakers:
        data_split = split_different_speakers(wav_list)
    else:
        data_split = split_sets(wav_list, split_ratio)

    # Creating json files
    create_json(data_split["train"], save_json_train)
    create_json(data_split["valid"], save_json_valid)
    create_json(data_split["test"], save_json_test)
Example #14
def prepare_data(
    data_original,
    data_transformed,
    save_json_train,
    save_json_valid,
    save_json_test,
    split_ratio=[80, 10, 10],
    different_speakers=False,
    seed=12,
):
    """
    Prepares the json files for the IEMOCAP dataset.

    We here use only the audio part of the dataset. The assumption is
    that the data folder is structured as:

    <session_id>/<emotion>/<file_name>.wav

    e.g.
    session1/ang/psno1_ang_s084_orgn.wav

    Please, process the original IEMOCAP folder to match the expected
    folder structure.


    Arguments
    ---------
    data_original : str
        Path to the folder where the original IEMOCAP dataset is stored.
    data_transformed : str
        Path to the folder where the transformed IEMOCAP dataset will be stored.
    save_json_train : str
        Path where the train data specification file will be saved.
    save_json_valid : str
        Path where the validation data specification file will be saved.
    save_json_test : str
        Path where the test data specification file will be saved.
    split_ratio : list
        List composed of three integers that sets the split ratios for train,
        valid, and test sets, respectively.
        For instance split_ratio=[80, 10, 10] will assign 80% of the sentences
        to training, 10% for validation, and 10% for test.
    different_speakers : bool
        If True, the splits are made speaker-disjoint (no speaker appears in
        more than one set).
    seed : int
        Seed for reproducibility.

    Example
    -------
    >>> data_original = '/path/to/iemocap/IEMOCAP_full_release/Session'
    >>> data_transformed = '/path/to/iemocap/IEMOCAP_ahsn_leave-two-speaker-out'
    >>> prepare_data(data_original, data_transformed, 'train.json', 'valid.json',
        'test.json')
    """

    # setting seeds for reproducible code.
    random.seed(seed)

    # Check if this phase is already done (if so, skip it)
    if skip(save_json_train, save_json_valid, save_json_test):
        logger.info("Preparation completed in previous run, skipping.")
        return

    # Check if the transformed data folder exists; generate it otherwise.
    if not check_folders(data_transformed):
        logger.info(
            "The data transformed folder doesn't exist. Do the transformation step."
        )
        transform_data(data_original, data_transformed)
    else:
        logger.info("Data Transformation completed in previous run, skipping.")

    if (len(list(glob.iglob(data_transformed + "/*/*/*", recursive=True)))
            != NUMBER_UTT):
        logger.error(
            "Error: The data folder is not in the expected format. Expected "
            "<session_id>/<emo_id>/<file_name>.wav "
            "(e.g., session1/ang/psno1_ang_s084_orgn.wav)"
        )
        sys.exit("Data transformed directory " + data_transformed +
                 " contains: " + str(
                     len(
                         list(
                             glob.iglob(data_transformed + "/*/*/*",
                                        recursive=True)))) +
                 " files. Expected " + str(NUMBER_UTT) + ".")

    # List files and create manifest from list
    logger.info(
        f"Creating {save_json_train}, {save_json_valid}, and {save_json_test}")
    extension = [".wav"]

    # Randomly split the signal list into train, valid, and test sets.
    wav_list = get_all_files(data_transformed, match_and=extension)
    if different_speakers:
        data_split = split_different_speakers(wav_list)
    else:
        data_split = split_sets(wav_list, split_ratio)

    # Creating json files
    create_json(data_split["train"], save_json_train)
    create_json(data_split["valid"], save_json_valid)
    create_json(data_split["test"], save_json_test)
Example #15
def prepare_timit(
    data_folder,
    splits,
    save_folder,
    kaldi_ali_tr=None,
    kaldi_ali_dev=None,
    kaldi_ali_test=None,
    kaldi_lab_opts=None,
    phn_set=39,
    uppercase=False,
):
    """
    Prepares the csv files for the TIMIT dataset.

    Arguments
    ---------
    data_folder : str
        Path to the folder where the original TIMIT dataset is stored.
    splits : list
        List of splits to prepare from ['train', 'dev', 'test']
    save_folder : str
        The directory where to store the csv files.
    kaldi_ali_tr : str, optional
        Default: 'None'
        When set, this is the directory where the kaldi
        training alignments are stored.  They will be automatically converted
        into pkl for an easier use within speechbrain.
    kaldi_ali_dev : str, optional
        Default: 'None'
        When set, this is the path to the directory where the
        kaldi dev alignments are stored.
    kaldi_ali_test : str, optional
        Default: 'None'
        When set, this is the path to the directory where the
        kaldi test alignments are stored.
    phn_set : {60, 48, 39}, optional,
        Default: 39
        The phoneme set to use in the phn label.
        It could be composed of 60, 48, or 39 phonemes.
    uppercase : bool, optional
        Default: False
        This option must be True when the TIMIT dataset
        is in the upper-case version.

    Example
    -------
    >>> from recipes.TIMIT.timit_prepare import prepare_timit
    >>> data_folder = 'datasets/TIMIT'
    >>> splits = ['train', 'dev', 'test']
    >>> save_folder = 'TIMIT_prepared'
    >>> prepare_timit(data_folder, splits, save_folder)
    """
    conf = {
        "data_folder": data_folder,
        "splits": splits,
        "kaldi_ali_tr": kaldi_ali_tr,
        "kaldi_ali_dev": kaldi_ali_dev,
        "kaldi_ali_test": kaldi_ali_test,
        "save_folder": save_folder,
        "phn_set": phn_set,
        "uppercase": uppercase,
    }

    # Getting speaker dictionary
    dev_spk, test_spk = _get_speaker()

    # Avoid calibration sentences
    avoid_sentences = ["sa1", "sa2"]

    # Setting file extension.
    extension = [".wav"]

    # Checking TIMIT_uppercase
    if uppercase:
        avoid_sentences = [item.upper() for item in avoid_sentences]
        extension = [item.upper() for item in extension]
        dev_spk = [item.upper() for item in dev_spk]
        test_spk = [item.upper() for item in test_spk]

    # Setting the save folder
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    # Setting output files
    save_opt = os.path.join(save_folder, OPT_FILE)
    save_csv_train = os.path.join(save_folder, TRAIN_CSV)
    save_csv_dev = os.path.join(save_folder, DEV_CSV)
    save_csv_test = os.path.join(save_folder, TEST_CSV)

    # Check if this phase is already done (if so, skip it)
    if skip(splits, save_folder, conf):
        logger.info("Skipping preparation, completed in previous run.")
        return

    # Additional checks to make sure the data folder contains TIMIT
    _check_timit_folders(uppercase, data_folder)

    msg = "Creating csv file for the TIMIT Dataset.."
    logger.info(msg)

    # Creating csv file for training data
    if "train" in splits:

        # Checking TIMIT_uppercase
        if uppercase:
            match_lst = extension + ["TRAIN"]
        else:
            match_lst = extension + ["train"]

        wav_lst_train = get_all_files(
            data_folder,
            match_and=match_lst,
            exclude_or=avoid_sentences,
        )

        create_csv(
            wav_lst_train,
            save_csv_train,
            uppercase,
            data_folder,
            phn_set,
            kaldi_lab=kaldi_ali_tr,
            kaldi_lab_opts=kaldi_lab_opts,
        )

    # Creating csv file for dev data
    if "dev" in splits:

        # Checking TIMIT_uppercase
        if uppercase:
            match_lst = extension + ["TEST"]
        else:
            match_lst = extension + ["test"]

        wav_lst_dev = get_all_files(
            data_folder,
            match_and=match_lst,
            match_or=dev_spk,
            exclude_or=avoid_sentences,
        )

        create_csv(
            wav_lst_dev,
            save_csv_dev,
            uppercase,
            data_folder,
            phn_set,
            kaldi_lab=kaldi_ali_dev,
            kaldi_lab_opts=kaldi_lab_opts,
        )

    # Creating csv file for test data
    if "test" in splits:

        # Checking TIMIT_uppercase
        if uppercase:
            match_lst = extension + ["TEST"]
        else:
            match_lst = extension + ["test"]

        wav_lst_test = get_all_files(
            data_folder,
            match_and=match_lst,
            match_or=test_spk,
            exclude_or=avoid_sentences,
        )

        create_csv(
            wav_lst_test,
            save_csv_test,
            uppercase,
            data_folder,
            phn_set,
            kaldi_lab=kaldi_ali_test,
            kaldi_lab_opts=kaldi_lab_opts,
        )

    # saving options
    save_pkl(conf, save_opt)
def prepare_ksponspeech(
    data_folder,
    save_folder,
    tr_splits=[],
    dev_splits=[],
    te_splits=[],
    select_n_sentences=None,
    merge_lst=[],
    merge_name=None,
    skip_prep=False,
):
    """
    This function prepares the csv files for the KsponSpeech dataset.
    Download link: https://aihub.or.kr/aidata/105/download

    Arguments
    ---------
    data_folder : str
        Path to the folder where the original KsponSpeech dataset is stored.
    tr_splits : list
        List of train splits to prepare from ['train', 'dev', 'eval_clean',
        'eval_other'].
    dev_splits : list
        List of dev splits to prepare from ['dev'].
    te_splits : list
        List of test splits to prepare from ['eval_clean','eval_other'].
    save_folder : str
        The directory where to store the csv files.
    select_n_sentences : int
        Default : None
        If not None, only pick this many sentences.
    merge_lst : list
        List of KsponSpeech splits (e.g., eval_clean, eval_other) to
        merge in a single csv file.
    merge_name: str
        Name of the merged csv file.
    skip_prep: bool
        If True, data preparation is skipped.


    Example
    -------
    >>> data_folder = 'datasets/KsponSpeech'
    >>> tr_splits = ['train']
    >>> dev_splits = ['dev']
    >>> te_splits = ['eval_clean']
    >>> save_folder = 'KsponSpeech_prepared'
    >>> prepare_ksponspeech(data_folder, save_folder, tr_splits, dev_splits, \
                            te_splits)
    """

    if skip_prep:
        return
    splits = tr_splits + dev_splits + te_splits
    conf = {
        "select_n_sentences": select_n_sentences,
    }

    # Other variables
    # Saving folder
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    save_opt = os.path.join(save_folder, OPT_FILE)

    # Check if this phase is already done (if so, skip it)
    if skip(splits, save_folder, conf):
        logger.info("Skipping preparation, completed in previous run.")
        return
    else:
        logger.info("Data_preparation...")

    # Additional checks to make sure the data folder contains ksponspeech
    check_ksponspeech_folders(data_folder, splits)

    # parse trn file
    all_texts = {}
    for split_index in range(len(splits)):

        split = splits[split_index]
        dirlist = split2dirs(split)
        wav_lst = []
        for dir in dirlist:
            wav_lst += get_all_files(os.path.join(data_folder, dir),
                                     match_and=[".wav"])

        trnpath = os.path.join(data_folder, split + ".trn")
        text_dict = text_to_dict(trnpath)
        all_texts.update(text_dict)

        if select_n_sentences is not None:
            n_sentences = select_n_sentences[split_index]
        else:
            n_sentences = len(wav_lst)

        create_csv(
            save_folder,
            wav_lst,
            text_dict,
            split,
            n_sentences,
        )

    # Merging csv file if needed
    if merge_lst and merge_name is not None:
        merge_files = [split_kspon + ".csv" for split_kspon in merge_lst]
        merge_csvs(
            data_folder=save_folder,
            csv_lst=merge_files,
            merged_csv=merge_name,
        )

    # saving options
    save_pkl(conf, save_opt)
def prepare_librispeech(
    data_folder,
    save_folder,
    tr_splits=[],
    dev_splits=[],
    te_splits=[],
    select_n_sentences=None,
    merge_lst=[],
    merge_name=None,
    create_lexicon=False,
    skip_prep=False,
):
    """
    This function prepares the csv files for the LibriSpeech dataset.
    Download link: http://www.openslr.org/12

    Arguments
    ---------
    data_folder : str
        Path to the folder where the original LibriSpeech dataset is stored.
    tr_splits : list
        List of train splits to prepare from ['train-clean-100',
        'train-clean-360','train-other-500'].
    dev_splits : list
        List of dev splits to prepare from ['dev-clean','dev-other'].
    te_splits : list
        List of test splits to prepare from ['test-clean','test-other'].
    save_folder : str
        The directory where to store the csv files.
    select_n_sentences : int
        Default : None
        If not None, only pick this many sentences.
    merge_lst : list
        List of librispeech splits (e.g., train-clean-100, train-clean-360, ...) to
        merge in a single csv file.
    merge_name: str
        Name of the merged csv file.
    create_lexicon: bool
        If True, it outputs csv files containing the mapping between
        graphemes and phonemes. Use it for training a G2P system.
    skip_prep: bool
        If True, data preparation is skipped.


    Example
    -------
    >>> data_folder = 'datasets/LibriSpeech'
    >>> tr_splits = ['train-clean-100']
    >>> dev_splits = ['dev-clean']
    >>> te_splits = ['test-clean']
    >>> save_folder = 'librispeech_prepared'
    >>> prepare_librispeech(data_folder, save_folder, tr_splits, dev_splits, \
                            te_splits)
    """

    if skip_prep:
        return
    splits = tr_splits + dev_splits + te_splits
    conf = {
        "select_n_sentences": select_n_sentences,
    }

    # Other variables
    # Saving folder
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    save_opt = os.path.join(save_folder, OPT_FILE)

    # Check if this phase is already done (if so, skip it)
    if skip(splits, save_folder, conf):
        logger.info("Skipping preparation, completed in previous run.")
        return
    else:
        logger.info("Data_preparation...")

    # Additional checks to make sure the data folder contains Librispeech
    check_librispeech_folders(data_folder, splits)

    # create csv files for each split
    all_texts = {}
    for split_index in range(len(splits)):

        split = splits[split_index]

        wav_lst = get_all_files(os.path.join(data_folder, split),
                                match_and=[".flac"])

        text_lst = get_all_files(os.path.join(data_folder, split),
                                 match_and=["trans.txt"])

        text_dict = text_to_dict(text_lst)
        all_texts.update(text_dict)

        if select_n_sentences is not None:
            n_sentences = select_n_sentences[split_index]
        else:
            n_sentences = len(wav_lst)

        create_csv(
            save_folder,
            wav_lst,
            text_dict,
            split,
            n_sentences,
        )

    # Merging csv file if needed
    if merge_lst and merge_name is not None:
        merge_files = [split_libri + ".csv" for split_libri in merge_lst]
        merge_csvs(
            data_folder=save_folder,
            csv_lst=merge_files,
            merged_csv=merge_name,
        )

    # Create lexicon.csv and oov.csv
    if create_lexicon:
        create_lexicon_and_oov_csv(all_texts, data_folder, save_folder)

    # saving options
    save_pkl(conf, save_opt)
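
`text_to_dict` mirrors step 2 of Example #1: each LibriSpeech `trans.txt` line starts with the utterance id, followed by the transcription. A sketch for the LibriSpeech variant (the KsponSpeech recipe above parses a single `.trn` file instead) could be:

def text_to_dict(text_lst):
    # Sketch: build {utterance_id: transcription} from LibriSpeech-style
    # *trans.txt files, where each line is "<utt_id> <word1> <word2> ...".
    text_dict = {}
    for trans_file in text_lst:
        with open(trans_file, "r") as f:
            for line in f:
                utt_id, *words = line.strip().split(" ")
                text_dict[utt_id] = " ".join(words)
    return text_dict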
Example #18
    for u in utterances:
        spk_id = Path(u).parent.parent.stem
        if spk_id not in speakers:
            speakers[spk_id] = [u]
        else:
            speakers[spk_id].append(u)

    return speakers, words_dict


# split
split_f = params.split_factors
# we get all noises and rirs
noises = []
for f in params.noises_folders:
    noises.extend(get_all_files(f, match_and=[".wav"]))
rirs = []
for f in params.rirs_folders:
    rirs.extend(get_all_files(f, match_and=[".wav"]))
# we split them in training, dev and eval
noises = split_list(noises, split_f)
rirs = split_list(rirs, split_f)
# do the same for background noises
if params.backgrounds_root:
    backgrounds = get_all_files(params.backgrounds_root, match_and=[".wav"])
    backgrounds = split_list(backgrounds, split_f)
else:
    backgrounds = [None] * 3

os.makedirs(os.path.join(params.out_folder, "metadata"), exist_ok=True)
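
`split_list` in this fragment presumably partitions a file list into train, dev, and eval chunks according to `params.split_factors`; a sketch under that assumption (the exact ratios and return format are guesses) is:

def split_list(files, split_factors):
    # Sketch: cut `files` into three consecutive chunks (train, dev, eval)
    # sized according to split_factors, e.g. [0.8, 0.1, 0.1].
    n_train = int(len(files) * split_factors[0])
    n_dev = int(len(files) * split_factors[1])
    return [
        files[:n_train],
        files[n_train:n_train + n_dev],
        files[n_train + n_dev:],
    ]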