def prepare_librispeech(
    data_folder,
    save_folder,
    tr_splits=[],
    dev_splits=[],
    te_splits=[],
    select_n_sentences=None,
    merge_lst=[],
    merge_name=None,
    create_lexicon=False,
    skip_prep=False,
):
    """
    This function prepares the csv files for the LibriSpeech dataset.
    Download link: http://www.openslr.org/12

    Arguments
    ---------
    data_folder : str
        Path to the folder where the original LibriSpeech dataset is stored.
    tr_splits : list
        List of train splits to prepare from ['train-clean-100',
        'train-clean-360', 'train-other-500'].
    dev_splits : list
        List of dev splits to prepare from ['dev-clean', 'dev-other'].
    te_splits : list
        List of test splits to prepare from ['test-clean', 'test-other'].
    save_folder : str
        The directory where to store the csv files.
    select_n_sentences : list
        Default : None
        If not None, pick only the given number of sentences for each split
        (one int per split, in tr + dev + te order).
    merge_lst : list
        List of librispeech splits (e.g., train-clean-100, train-clean-360)
        to merge in a single csv file.
    merge_name: str
        Name of the merged csv file.
    create_lexicon: bool
        If True, it outputs csv files containing the mapping between
        graphemes and phonemes. Use it for training a G2P system.
    skip_prep: bool
        If True, data preparation is skipped.


    Example
    -------
    >>> data_folder = 'datasets/LibriSpeech'
    >>> tr_splits = ['train-clean-100']
    >>> dev_splits = ['dev-clean']
    >>> te_splits = ['test-clean']
    >>> save_folder = 'librispeech_prepared'
    >>> prepare_librispeech(data_folder, save_folder, tr_splits, dev_splits, \
                            te_splits)
    """

    if skip_prep:
        return
    splits = tr_splits + dev_splits + te_splits
    conf = {
        "select_n_sentences": select_n_sentences,
    }

    # Other variables
    # Saving folder
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    save_opt = os.path.join(save_folder, OPT_FILE)

    # Check if this phase is already done (if so, skip it)
    if skip(splits, save_folder, conf):
        logger.info("Skipping preparation, completed in previous run.")
        return
    else:
        logger.info("Data_preparation...")

    # Additional checks to make sure the data folder contains Librispeech
    check_librispeech_folders(data_folder, splits)

    # create csv files for each split
    all_texts = {}
    for split_index, split in enumerate(splits):

        wav_lst = get_all_files(os.path.join(data_folder, split),
                                match_and=[".flac"])

        text_lst = get_all_files(os.path.join(data_folder, split),
                                 match_and=["trans.txt"])

        text_dict = text_to_dict(text_lst)
        all_texts.update(text_dict)

        if select_n_sentences is not None:
            n_sentences = select_n_sentences[split_index]
        else:
            n_sentences = len(wav_lst)

        create_csv(
            save_folder,
            wav_lst,
            text_dict,
            split,
            n_sentences,
        )

    # Merging csv file if needed
    if merge_lst and merge_name is not None:
        merge_files = [split_libri + ".csv" for split_libri in merge_lst]
        merge_csvs(
            data_folder=save_folder,
            csv_lst=merge_files,
            merged_csv=merge_name,
        )

    # Create lexicon.csv and oov.csv
    if create_lexicon:
        create_lexicon_and_oov_csv(all_texts, data_folder, save_folder)

    # saving options
    save_pkl(conf, save_opt)
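
A minimal usage sketch (paths and the select_n_sentences values are hypothetical; it assumes the helpers used above, such as create_csv and merge_csvs, live in the same module). It prepares two train splits, caps the number of sentences per split, and merges the two train csvs into one file:

# Hypothetical usage of prepare_librispeech; all paths are placeholders.
prepare_librispeech(
    data_folder="datasets/LibriSpeech",
    save_folder="librispeech_prepared",
    tr_splits=["train-clean-100", "train-clean-360"],
    dev_splits=["dev-clean"],
    te_splits=["test-clean"],
    # One entry per split, in tr + dev + te order.
    select_n_sentences=[20000, 20000, 1000, 1000],
    # Merge the two train csvs into a single file
    # (the name is assumed to include the extension).
    merge_lst=["train-clean-100", "train-clean-360"],
    merge_name="train.csv",
)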
Example #2
def prepare_voxceleb(
    data_folder,
    save_folder,
    verification_pairs_file,
    splits=["train", "dev", "test"],
    split_ratio=[90, 10],
    seg_dur=3.0,
    amp_th=5e-04,
    source=None,
    split_speaker=False,
    random_segment=False,
    skip_prep=False,
):
    """
    Prepares the csv files for the Voxceleb1 or Voxceleb2 datasets.
    Please follow the instructions in the README.md file for
    preparing Voxceleb2.

    Arguments
    ---------
    data_folder : str
        Path to the folder where the original VoxCeleb dataset is stored.
    save_folder : str
        The directory where to store the csv files.
    verification_pairs_file : str
        txt file containing the verification split.
    splits : list
        List of splits to prepare from ['train', 'dev', 'test'].
    split_ratio : list
        List of ints giving the percentage split between train and
        validation data (e.g., [90, 10]).
    seg_dur : float
        Segment duration of a chunk in seconds (e.g., 3.0 seconds).
    amp_th : float
        Removes segments whose average amplitude is below the
        given threshold.
    source : str
        Path to the folder where the VoxCeleb dataset source is stored.
    split_speaker : bool
        If True, the train/validation split is performed speaker-wise
        (no speaker appears in both sets).
    random_segment : bool
        If True, extract random segments for training.
    skip_prep : bool
        If True, skip preparation.

    Example
    -------
    >>> from recipes.VoxCeleb.voxceleb1_prepare import prepare_voxceleb
    >>> data_folder = 'data/VoxCeleb1/'
    >>> save_folder = 'VoxData/'
    >>> verification_pairs_file = 'data/VoxCeleb1/veri_test.txt'
    >>> splits = ['train', 'dev']
    >>> split_ratio = [90, 10]
    >>> prepare_voxceleb(data_folder, save_folder, verification_pairs_file, \
                         splits, split_ratio)
    """

    if skip_prep:
        return
    # Create configuration for easily skipping data_preparation stage
    conf = {
        "data_folder": data_folder,
        "splits": splits,
        "split_ratio": split_ratio,
        "save_folder": save_folder,
        "seg_dur": seg_dur,
        "split_speaker": split_speaker,
    }

    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    # Setting output files
    save_opt = os.path.join(save_folder, OPT_FILE)
    save_csv_train = os.path.join(save_folder, TRAIN_CSV)
    save_csv_dev = os.path.join(save_folder, DEV_CSV)

    # If a source archive is given, populate the data folder with the
    # VoxCeleb1 test data
    if source is not None:
        if not os.path.exists(os.path.join(data_folder, "wav", "id10270")):
            logger.info(f"Extracting {source}/{TEST_WAV} to {data_folder}")
            shutil.unpack_archive(os.path.join(source, TEST_WAV), data_folder)
        if not os.path.exists(os.path.join(data_folder, "meta")):
            logger.info(f"Copying {source}/meta to {data_folder}")
            shutil.copytree(os.path.join(source, "meta"),
                            os.path.join(data_folder, "meta"))

    # Check if this phase is already done (if so, skip it)
    if skip(splits, save_folder, conf):
        logger.info("Skipping preparation, completed in previous run.")
        return

    # The data folder can be a comma-separated list of folders
    # (e.g., VoxCeleb1 and VoxCeleb2)
    if "," in data_folder:
        data_folder = data_folder.replace(" ", "").split(",")
    else:
        data_folder = [data_folder]

    # Additional checks to make sure the data folder contains VoxCeleb data
    # _check_voxceleb1_folders(data_folder, splits)

    msg = "\tCreating csv file for the VoxCeleb Dataset.."
    logger.info(msg)

    # Split data into train and validation sets according to split_ratio
    # (verification split)
    wav_lst_train, wav_lst_dev = _get_utt_split_lists(data_folder, split_ratio,
                                                      verification_pairs_file,
                                                      split_speaker)

    # Creating csv file for training data
    if "train" in splits:
        prepare_csv(seg_dur, wav_lst_train, save_csv_train, random_segment,
                    amp_th)

    if "dev" in splits:
        prepare_csv(seg_dur, wav_lst_dev, save_csv_dev, random_segment, amp_th)

    # For PLDA verification
    if "test" in splits:
        prepare_csv_enrol_test(data_folder, save_folder,
                               verification_pairs_file)

    # Saving options (useful to skip this phase when already done)
    save_pkl(conf, save_opt)
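
A minimal usage sketch for the full train/dev/test preparation (paths are hypothetical; the verification file name follows the usual VoxCeleb1 convention but is an assumption here):

# Hypothetical usage of prepare_voxceleb; all paths are placeholders.
prepare_voxceleb(
    data_folder="data/VoxCeleb1",
    save_folder="VoxData",
    verification_pairs_file="data/VoxCeleb1/veri_test.txt",
    splits=["train", "dev", "test"],
    split_ratio=[90, 10],
    seg_dur=3.0,
    # Drop near-silent chunks whose average amplitude is below this value.
    amp_th=5e-04,
)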
Example #3
def prepare_ksponspeech(
    data_folder,
    save_folder,
    tr_splits=[],
    dev_splits=[],
    te_splits=[],
    select_n_sentences=None,
    merge_lst=[],
    merge_name=None,
    skip_prep=False,
):
    """
    This function prepares the csv files for the KsponSpeech dataset.
    Download link: https://aihub.or.kr/aidata/105/download

    Arguments
    ---------
    data_folder : str
        Path to the folder where the original KsponSpeech dataset is stored.
    tr_splits : list
        List of train splits to prepare from ['train'].
    dev_splits : list
        List of dev splits to prepare from ['dev'].
    te_splits : list
        List of test splits to prepare from ['eval_clean','eval_other'].
    save_folder : str
        The directory where to store the csv files.
    select_n_sentences : list
        Default : None
        If not None, pick only the given number of sentences for each split
        (one int per split, in tr + dev + te order).
    merge_lst : list
        List of KsponSpeech splits (e.g., eval_clean, eval_other) to
        merge in a single csv file.
    merge_name: str
        Name of the merged csv file.
    skip_prep: bool
        If True, data preparation is skipped.


    Example
    -------
    >>> data_folder = 'datasets/KsponSpeech'
    >>> tr_splits = ['train']
    >>> dev_splits = ['dev']
    >>> te_splits = ['eval_clean']
    >>> save_folder = 'KsponSpeech_prepared'
    >>> prepare_ksponspeech(data_folder, save_folder, tr_splits, dev_splits, \
                            te_splits)
    """

    if skip_prep:
        return
    splits = tr_splits + dev_splits + te_splits
    conf = {
        "select_n_sentences": select_n_sentences,
    }

    # Other variables
    # Saving folder
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    save_opt = os.path.join(save_folder, OPT_FILE)

    # Check if this phase is already done (if so, skip it)
    if skip(splits, save_folder, conf):
        logger.info("Skipping preparation, completed in previous run.")
        return
    else:
        logger.info("Data_preparation...")

    # Additional checks to make sure the data folder contains ksponspeech
    check_ksponspeech_folders(data_folder, splits)

    # parse trn file
    all_texts = {}
    for split_index, split in enumerate(splits):
        dirlist = split2dirs(split)
        wav_lst = []
        # Avoid shadowing the builtin `dir`
        for wav_dir in dirlist:
            wav_lst += get_all_files(os.path.join(data_folder, wav_dir),
                                     match_and=[".wav"])

        trnpath = os.path.join(data_folder, split + ".trn")
        text_dict = text_to_dict(trnpath)
        all_texts.update(text_dict)

        if select_n_sentences is not None:
            n_sentences = select_n_sentences[split_index]
        else:
            n_sentences = len(wav_lst)

        create_csv(
            save_folder,
            wav_lst,
            text_dict,
            split,
            n_sentences,
        )

    # Merging csv file if needed
    if merge_lst and merge_name is not None:
        merge_files = [split_kspon + ".csv" for split_kspon in merge_lst]
        merge_csvs(
            data_folder=save_folder,
            csv_lst=merge_files,
            merged_csv=merge_name,
        )

    # saving options
    save_pkl(conf, save_opt)
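
A minimal usage sketch (paths are hypothetical) that prepares all splits and merges the two eval csvs into a single test file:

# Hypothetical usage of prepare_ksponspeech; all paths are placeholders.
prepare_ksponspeech(
    data_folder="datasets/KsponSpeech",
    save_folder="KsponSpeech_prepared",
    tr_splits=["train"],
    dev_splits=["dev"],
    te_splits=["eval_clean", "eval_other"],
    # Merge both eval csvs into one file
    # (the name is assumed to include the extension).
    merge_lst=["eval_clean", "eval_other"],
    merge_name="eval.csv",
)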
Example #4
def prepare_ami(
    data_folder,
    manual_annot_folder,
    save_folder,
    ref_rttm_dir,
    meta_data_dir,
    split_type="full_corpus_asr",
    skip_TNO=True,
    mic_type="Lapel",
    vad_type="oracle",
    max_subseg_dur=3.0,
    overlap=1.5,
):
    """
    Prepares reference RTTM and JSON files for the AMI dataset.

    Arguments
    ---------
    data_folder : str
        Path to the folder where the original amicorpus is stored.
    manual_annot_folder : str
        Directory where the manual annotations are stored.
    save_folder : str
        The save directory in results.
    ref_rttm_dir : str
        Directory to store reference RTTM files.
    meta_data_dir : str
        Directory to store the meta data (json) files.
    split_type : str
        Standard dataset split. See ami_splits.py for more information.
        Allowed split_type: "scenario_only", "full_corpus" or "full_corpus_asr"
    skip_TNO : bool
        If True, TNO meeting recordings are skipped.
    mic_type : str
        Type of microphone to be used.
    vad_type : str
        Type of VAD. Kept for the future, when VAD support will be added.
    max_subseg_dur : float
        Duration in seconds of the subsegments to be prepared from larger
        segments.
    overlap : float
        Overlap duration in seconds between adjacent subsegments.

    Example
    -------
    >>> from recipes.AMI.ami_prepare import prepare_ami
    >>> data_folder = '/network/datasets/ami/amicorpus/'
    >>> manual_annot_folder = '/home/mila/d/dawalatn/nauman/ami_public_manual/'
    >>> save_folder = 'results/save/'
    >>> ref_rttm_dir = 'results/save/ref_rttms'
    >>> meta_data_dir = 'results/save/metadata'
    >>> split_type = 'full_corpus_asr'
    >>> mic_type = 'Lapel'
    >>> prepare_ami(data_folder, manual_annot_folder, save_folder, \
                    ref_rttm_dir, meta_data_dir, split_type, mic_type=mic_type)
    """

    # Meta files
    meta_files = [
        os.path.join(meta_data_dir, "ami_train." + mic_type + ".subsegs.json"),
        os.path.join(meta_data_dir, "ami_dev." + mic_type + ".subsegs.json"),
        os.path.join(meta_data_dir, "ami_eval." + mic_type + ".subsegs.json"),
    ]

    # Create configuration for easily skipping data_preparation stage
    conf = {
        "data_folder": data_folder,
        "save_folder": save_folder,
        "ref_rttm_dir": ref_rttm_dir,
        "meta_data_dir": meta_data_dir,
        "split_type": split_type,
        "skip_TNO": skip_TNO,
        "mic_type": mic_type,
        "vad": vad_type,
        "max_subseg_dur": max_subseg_dur,
        "overlap": overlap,
        "meta_files": meta_files,
    }

    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    # Setting output option files.
    opt_file = "opt_ami_prepare." + mic_type + ".pkl"

    # Check if this phase is already done (if so, skip it)
    if skip(save_folder, conf, meta_files, opt_file):
        logger.info(
            "Skipping data preparation, as it was completed in previous run.")
        return

    msg = "\tCreating meta-data file for the AMI Dataset.."
    logger.debug(msg)

    # Get the split
    train_set, dev_set, eval_set = get_AMI_split(split_type)

    # Prepare RTTMs from the XML manual annotations and store them as
    # ground truth. Create the ref_RTTM directory first.
    if not os.path.exists(ref_rttm_dir):
        os.makedirs(ref_rttm_dir)

    # Create reference RTTM files
    splits = ["train", "dev", "eval"]
    for i in splits:
        rttm_file = ref_rttm_dir + "/fullref_ami_" + i + ".rttm"
        if i == "train":
            prepare_segs_for_RTTM(
                train_set,
                rttm_file,
                data_folder,
                manual_annot_folder,
                i,
                skip_TNO,
            )
        if i == "dev":
            prepare_segs_for_RTTM(
                dev_set,
                rttm_file,
                data_folder,
                manual_annot_folder,
                i,
                skip_TNO,
            )
        if i == "eval":
            prepare_segs_for_RTTM(
                eval_set,
                rttm_file,
                data_folder,
                manual_annot_folder,
                i,
                skip_TNO,
            )

    # Create meta_files for splits
    if not os.path.exists(meta_data_dir):
        os.makedirs(meta_data_dir)

    for i in splits:
        rttm_file = ref_rttm_dir + "/fullref_ami_" + i + ".rttm"
        meta_filename_prefix = "ami_" + i
        prepare_metadata(
            rttm_file,
            meta_data_dir,
            data_folder,
            meta_filename_prefix,
            max_subseg_dur,
            overlap,
            mic_type,
        )

    save_opt_file = os.path.join(save_folder, opt_file)
    save_pkl(conf, save_opt_file)
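
A minimal usage sketch with non-default subsegment settings (paths and parameter values are hypothetical):

# Hypothetical usage of prepare_ami; all paths are placeholders.
prepare_ami(
    data_folder="amicorpus",
    manual_annot_folder="ami_public_manual",
    save_folder="results/save",
    ref_rttm_dir="results/save/ref_rttms",
    meta_data_dir="results/save/metadata",
    split_type="full_corpus",
    # Shorter subsegments with 1 s overlap (illustrative values).
    max_subseg_dur=2.0,
    overlap=1.0,
)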
Example #5
def create_csv(
    wav_lst,
    csv_file,
    uppercase,
    data_folder,
    phn_set,
    kaldi_lab=None,
    kaldi_lab_opts=None,
    kaldi_lab_dir=None,
):
    """
    Creates the csv file given a list of wav files.

    Arguments
    ---------
    wav_lst : list
        The list of wav files of a given data split.
    csv_file : str
        The path of the output csv file.
    uppercase : bool
        Whether this is the uppercase version of timit.
    data_folder : str
        The location of the data.
    phn_set : {60, 48, 39}
        The phoneme set to use in the phn label.
    kaldi_lab : str, optional
        Default: None
        The path of the kaldi labels (optional).
    kaldi_lab_opts : str, optional
        Default: None
        A string containing the options used to compute the labels.
    kaldi_lab_dir : str, optional
        Default: None
        The directory where the kaldi labels are stored as pkl files.
        Required when kaldi_lab is set.

    Returns
    -------
    None
    """

    # Log the creation of the csv file
    msg = "Creating csv lists in %s..." % (csv_file)
    logger.info(msg)

    # Reading kaldi labels if needed:
    snt_no_lab = 0
    missing_lab = False

    if kaldi_lab is not None:

        lab = read_kaldi_lab(
            kaldi_lab,
            kaldi_lab_opts,
        )

        if not os.path.exists(kaldi_lab_dir):
            os.makedirs(kaldi_lab_dir)

    csv_lines = [[
        "ID",
        "duration",
        "wav",
        "wav_format",
        "wav_opts",
        "spk_id",
        "spk_id_format",
        "spk_id_opts",
        "phn",
        "phn_format",
        "phn_opts",
        "wrd",
        "wrd_format",
        "wrd_opts",
        "ground_truth_phn_ends",
        "ground_truth_phn_ends_format",
        "ground_truth_phn_ends_opts",
    ]]

    if kaldi_lab is not None:
        csv_lines[0].append("kaldi_lab")
        csv_lines[0].append("kaldi_lab_format")
        csv_lines[0].append("kaldi_lab_opts")

    # Processing all the wav files in the list
    for wav_file in wav_lst:

        # Reset the missing-label flag for each wav file
        missing_lab = False
        # Getting sentence and speaker ids
        spk_id = wav_file.split("/")[-2]
        snt_id = wav_file.split("/")[-1].replace(".wav", "")
        snt_id = spk_id + "_" + snt_id

        if kaldi_lab is not None:
            if snt_id not in lab.keys():
                missing_lab = True
                msg = ("The sentence %s does not have a corresponding "
                       "kaldi label" % (snt_id))

                logger.info(msg)
                snt_no_lab = snt_no_lab + 1
            else:
                snt_lab_path = os.path.join(kaldi_lab_dir, snt_id + ".pkl")
                save_pkl(lab[snt_id], snt_lab_path)

            # If too many kaldi labels are missing, log an error
            if snt_no_lab / len(wav_lst) > 0.05:
                err_msg = ("Too many sentences do not have the "
                           "corresponding kaldi label. Please check data and "
                           "kaldi labels (check %s and %s)." %
                           (data_folder, kaldi_lab))
                logger.error(err_msg)

        if missing_lab:
            continue

        # Reading the signal (to retrieve duration in seconds)
        signal = read_audio(wav_file)
        duration = len(signal) / SAMPLERATE

        # Retrieving words and check for uppercase
        if uppercase:
            wrd_file = wav_file.replace(".WAV", ".WRD")
        else:
            wrd_file = wav_file.replace(".wav", ".wrd")
        if not os.path.exists(wrd_file):
            err_msg = "the wrd file %s does not exist!" % (wrd_file)
            raise FileNotFoundError(err_msg)

        with open(wrd_file) as f:
            words = " ".join(line.rstrip("\n").split(" ")[2] for line in f)

        # Retrieving phonemes
        if uppercase:
            phn_file = wav_file.replace(".WAV", ".PHN")
        else:
            phn_file = wav_file.replace(".wav", ".phn")

        if not os.path.exists(phn_file):
            err_msg = "the phn file %s does not exist!" % (phn_file)
            raise FileNotFoundError(err_msg)

        # Getting the phoneme and ground truth ends lists from the phn files
        phonemes, ends = get_phoneme_lists(phn_file, phn_set)

        # Composition of the csv_line
        csv_line = [
            snt_id,
            str(duration),
            wav_file,
            "wav",
            "",
            spk_id,
            "string",
            "",
            str(phonemes),
            "string",
            "",
            str(words),
            "string",
            "label:False",
            str(ends),
            "string",
            "label:False",
        ]

        if kaldi_lab is not None:
            csv_line.append(snt_lab_path)
            csv_line.append("pkl")
            csv_line.append("")

        # Adding this line to the csv_lines list
        csv_lines.append(csv_line)

    # Writing the csv lines
    _write_csv(csv_lines, csv_file)
    msg = "%s sucessfully created!" % (csv_file)
    logger.info(msg)
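
The _write_csv helper is not shown above; here is a minimal sketch of what it is assumed to do (write the accumulated header and rows with Python's csv module):

import csv

def _write_csv(csv_lines, csv_file):
    # Assumed behavior: write the header row plus one row per utterance.
    with open(csv_file, mode="w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        for line in csv_lines:
            writer.writerow(line)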
Example #6
def prepare_timit(
    data_folder,
    splits,
    save_folder,
    kaldi_ali_tr=None,
    kaldi_ali_dev=None,
    kaldi_ali_test=None,
    kaldi_lab_opts=None,
    phn_set=39,
    uppercase=False,
):
    """
    Prepares the csv files for the TIMIT dataset.

    Arguments
    ---------
    data_folder : str
        Path to the folder where the original TIMIT dataset is stored.
    splits : list
        List of splits to prepare from ['train', 'dev', 'test']
    save_folder : str
        The directory where to store the csv files.
    kaldi_ali_tr : str, optional
        Default: None
        When set, this is the directory where the kaldi
        training alignments are stored. They will be automatically converted
        into pkl for easier use within speechbrain.
    kaldi_ali_dev : str, optional
        Default: None
        When set, this is the path to the directory where the
        kaldi dev alignments are stored.
    kaldi_ali_test : str, optional
        Default: None
        When set, this is the path to the directory where the
        kaldi test alignments are stored.
    kaldi_lab_opts : str, optional
        Default: None
        A string containing the options used to compute the kaldi labels.
    phn_set : {60, 48, 39}, optional
        Default: 39
        The phoneme set to use in the phn label.
        It could be composed of 60, 48, or 39 phonemes.
    uppercase : bool, optional
        Default: False
        This option must be True when the TIMIT dataset
        is in the upper-case version.

    Example
    -------
    >>> from recipes.TIMIT.timit_prepare import prepare_timit
    >>> data_folder = 'datasets/TIMIT'
    >>> splits = ['train', 'dev', 'test']
    >>> save_folder = 'TIMIT_prepared'
    >>> prepare_timit(data_folder, splits, save_folder)
    """
    conf = {
        "data_folder": data_folder,
        "splits": splits,
        "kaldi_ali_tr": kaldi_ali_tr,
        "kaldi_ali_dev": kaldi_ali_dev,
        "kaldi_ali_test": kaldi_ali_test,
        "save_folder": save_folder,
        "phn_set": phn_set,
        "uppercase": uppercase,
    }

    # Getting speaker dictionary
    dev_spk, test_spk = _get_speaker()

    # Avoid calibration sentences
    avoid_sentences = ["sa1", "sa2"]

    # Setting file extension.
    extension = [".wav"]

    # Checking TIMIT_uppercase
    if uppercase:
        avoid_sentences = [item.upper() for item in avoid_sentences]
        extension = [item.upper() for item in extension]
        dev_spk = [item.upper() for item in dev_spk]
        test_spk = [item.upper() for item in test_spk]

    # Setting the save folder
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    # Setting output files
    save_opt = os.path.join(save_folder, OPT_FILE)
    save_csv_train = os.path.join(save_folder, TRAIN_CSV)
    save_csv_dev = os.path.join(save_folder, DEV_CSV)
    save_csv_test = os.path.join(save_folder, TEST_CSV)

    # Check if this phase is already done (if so, skip it)
    if skip(splits, save_folder, conf):
        logger.info("Skipping preparation, completed in previous run.")
        return

    # Additional checks to make sure the data folder contains TIMIT
    _check_timit_folders(uppercase, data_folder)

    msg = "Creating csv file for the TIMIT Dataset.."
    logger.info(msg)

    # Creating csv file for training data
    if "train" in splits:

        # Checking TIMIT_uppercase
        if uppercase:
            match_lst = extension + ["TRAIN"]
        else:
            match_lst = extension + ["train"]

        wav_lst_train = get_all_files(
            data_folder,
            match_and=match_lst,
            exclude_or=avoid_sentences,
        )

        create_csv(
            wav_lst_train,
            save_csv_train,
            uppercase,
            data_folder,
            phn_set,
            kaldi_lab=kaldi_ali_tr,
            kaldi_lab_opts=kaldi_lab_opts,
            # create_csv requires kaldi_lab_dir when kaldi_lab is set;
            # the subfolder layout below is an assumption.
            kaldi_lab_dir=os.path.join(save_folder, "kaldi_labels", "train"),
        )

    # Creating csv file for dev data
    if "dev" in splits:

        # Checking TIMIT_uppercase
        if uppercase:
            match_lst = extension + ["TEST"]
        else:
            match_lst = extension + ["test"]

        wav_lst_dev = get_all_files(
            data_folder,
            match_and=match_lst,
            match_or=dev_spk,
            exclude_or=avoid_sentences,
        )

        create_csv(
            wav_lst_dev,
            save_csv_dev,
            uppercase,
            data_folder,
            phn_set,
            kaldi_lab=kaldi_ali_dev,
            kaldi_lab_opts=kaldi_lab_opts,
            # Assumed layout (see the note in the train call above).
            kaldi_lab_dir=os.path.join(save_folder, "kaldi_labels", "dev"),
        )

    # Creating csv file for test data
    if "test" in splits:

        # Checking TIMIT_uppercase
        if uppercase:
            match_lst = extension + ["TEST"]
        else:
            match_lst = extension + ["test"]

        wav_lst_test = get_all_files(
            data_folder,
            match_and=match_lst,
            match_or=test_spk,
            exclude_or=avoid_sentences,
        )

        create_csv(
            wav_lst_test,
            save_csv_test,
            uppercase,
            data_folder,
            phn_set,
            kaldi_lab=kaldi_ali_test,
            kaldi_lab_opts=kaldi_lab_opts,
            # Assumed layout (see the note in the train call above).
            kaldi_lab_dir=os.path.join(save_folder, "kaldi_labels", "test"),
        )

    # saving options
    save_pkl(conf, save_opt)
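
A minimal usage sketch with kaldi alignments enabled (paths and the kaldi_lab_opts string are hypothetical):

# Hypothetical usage of prepare_timit; all paths are placeholders.
prepare_timit(
    data_folder="datasets/TIMIT",
    splits=["train", "dev", "test"],
    save_folder="TIMIT_prepared",
    kaldi_ali_tr="kaldi/ali_train",
    kaldi_ali_dev="kaldi/ali_dev",
    kaldi_ali_test="kaldi/ali_test",
    # Options string forwarded to the kaldi label reader (assumed value).
    kaldi_lab_opts="ali-to-pdf",
    phn_set=39,
)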