def test_drop_freq():
    from glob import glob

    for filename in glob(os.path.join(output_folder, "save", "*.flac")):
        expected_file = filename.replace("results", "expected")
        actual = read_audio(filename)
        expected = read_audio(expected_file)
        assert actual.allclose(expected)
Example #2
def create_json(wav_list, json_file):
    """
    Creates the json file given a list of wav files.

    Arguments
    ---------
    wav_list : list of str
        The list of wav files.
    json_file : str
        The path of the output json file
    """
    # Processing all the wav files in the list
    json_dict = {}
    for wav_file in wav_list:

        # Reading the signal (to retrieve duration in seconds)
        signal = read_audio(wav_file)
        duration = signal.shape[0] / SAMPLERATE

        # Manipulate path to get relative path and uttid
        path_parts = wav_file.split(os.path.sep)
        uttid, _ = os.path.splitext(path_parts[-1])
        relative_path = os.path.join("{data_root}", *path_parts[-5:])

        # Create entry for this utterance
        json_dict[uttid] = {"wav": relative_path, "length": duration}

    # Writing the dictionary to the json file
    with open(json_file, mode="w") as json_f:
        json.dump(json_dict, json_f, indent=2)

    logger.info(f"{json_file} successfully created!")
Example #3
def create_json(wav_lst, json_file, clean_folder, txt_folder, lexicon):
    """
    Creates the json file given a list of wav files.

    Arguments
    ---------
    wav_lst : list
        The list of wav files.
    json_file : str
        The path of the output json file
    clean_folder : str
        The location of parallel clean samples.
    txt_folder : str
        The location of the transcript files.
    lexicon : dict
        A mapping from each word to its space-separated phone sequence.
    """
    logger.debug(f"Creating json lists in {json_file}")

    # Processing all the wav files in the list
    json_dict = {}
    for wav_file in wav_lst:  # ex:p203_122.wav

        # Example wav_file: p232_001.wav
        noisy_path, filename = os.path.split(wav_file)
        _, noisy_dir = os.path.split(noisy_path)
        _, clean_dir = os.path.split(clean_folder)
        noisy_rel_path = os.path.join("{data_root}", noisy_dir, filename)
        clean_rel_path = os.path.join("{data_root}", clean_dir, filename)

        # Reading the signal (to retrieve duration in seconds)
        signal = read_audio(wav_file)
        duration = signal.shape[0] / SAMPLERATE

        # Read text
        snt_id = filename.replace(".wav", "")
        with open(os.path.join(txt_folder, snt_id + ".txt")) as f:
            word_string = f.read()
        word_string = remove_punctuation(word_string).strip().upper()
        phones = [
            phn for word in word_string.split()
            for phn in lexicon[word].split()
        ]

        # Remove duplicate phones
        phones = [i for i, j in zip(phones, phones[1:] + [None]) if i != j]
        phone_string = " ".join(phones)

        json_dict[snt_id] = {
            "noisy_wav": noisy_rel_path,
            "clean_wav": clean_rel_path,
            "length": duration,
            "words": word_string,
            "phones": phone_string,
        }

    # Writing the json lines
    with open(json_file, mode="w") as json_f:
        json.dump(json_dict, json_f, indent=2)

    logger.info(f"{json_file} successfully created!")
def generate_silence_data(num_known_samples_per_split,
                          splits,
                          data_folder,
                          percentage_silence=26):
    """Generates silence samples.

    Arguments
    ---------
    num_known_samples_per_split: dict
        Total number of samples of known words for each split (i.e. set).
    splits: dict
        The training, validation and test sets.
    data_folder: str
        Path to the dataset.
    percentage_silence: int
        How many silence samples to generate, relative to the total number of known words.
    """
    for split in splits:
        num_silence_samples = int(
            (percentage_silence / 100.0) * num_known_samples_per_split[split])

        # Fetch all background noise wav files used to generate silence samples
        search_path = os.path.join(data_folder, "_background_noise_", "*.wav")
        silence_paths = []
        for wav_path in glob.glob(search_path):
            silence_paths.append(wav_path)

        # Generate random silence samples
        # Assumes that the pytorch seed has been defined in the HyperPyYaml file
        num_silence_samples_per_path = int(num_silence_samples /
                                           len(silence_paths))
        for silence_path in silence_paths:
            signal = read_audio(silence_path)
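            # Draw random start indices such that a 1-second window
            # (16000 samples at 16 kHz) always fits inside the noise file.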
            random_starts = ((torch.rand(num_silence_samples_per_path) *
                              (signal.shape[0] - 16001)).type(
                                  torch.int).tolist())

            for i, random_start in enumerate(random_starts):
                splits[split]["ID"].append(
                    re.sub(
                        r".wav",
                        "/" + str(random_start) + "_" + str(i),
                        re.sub(r".+?(?=_background_noise_)", "", silence_path),
                    ))

                splits[split]["duration"].append(1.0)
                splits[split]["start"].append(random_start)
                splits[split]["stop"].append(random_start + 16000)
                splits[split]["wav"].append(silence_path)
                splits[split]["spk_id"].append(None)
                splits[split]["command"].append("silence")
                splits[split]["transcript"].append(None)
Example #5
def test_read_audio(tmpdir):
    from speechbrain.dataio.dataio import read_audio, write_audio

    test_waveform = torch.rand(16000)
    wavfile = os.path.join(tmpdir, "wave.wav")
    write_audio(wavfile, test_waveform, 16000)

    # dummy annotation
    for i in range(3):
        start = torch.randint(0, 8000, (1, )).item()
        stop = start + torch.randint(500, 1000, (1, )).item()
        wav_obj = {"wav": {"file": wavfile, "start": start, "stop": stop}}
        loaded = read_audio(wav_obj["wav"])
        assert loaded.allclose(test_waveform[start:stop], atol=1e-4)
def generalized_eigenvalue(audio_file, diffuse=True, show_plots=False):
    xs_speech = read_audio(audio_file)
    xs_speech = xs_speech.unsqueeze(0)

    stft = STFT(sample_rate=fs)
    cov = Covariance()
    gev = Gev()
    istft = ISTFT(sample_rate=fs)

    Xs = stft(xs_speech)

    SSs = cov(Xs)
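    # Note: the noise covariance NNs is estimated from the same noisy STFT
    # as SSs here; in a real setup it would come from noise-only segments.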
    NNs = cov(Xs)
    Ys_gev = gev(Xs, SSs, NNs)
    ys_gev = istft(Ys_gev)

    if show_plots:
        plt.figure(1)
        plt.title("Noisy signal at microphone 1")
        plt.imshow(
            torch.transpose(
                torch.log(Xs[0, :, :, 0, 0] ** 2 + Xs[0, :, :, 1, 0] ** 2), 1, 0
            ),
            origin="lower",
        )
        plt.figure(2)
        plt.title("Noisy signal at microphone 1")
        plt.plot(xs_speech.squeeze()[:, 0])
        plt.figure(3)
        plt.title("Beamformed signal")
        plt.imshow(
            torch.transpose(
                torch.log(
                    Ys_gev[0, :, :, 0, 0] ** 2 + Ys_gev[0, :, :, 1, 0] ** 2
                ),
                1,
                0,
            ),
            origin="lower",
        )
        plt.figure(4)
        plt.title("Beamformed signal")
        plt.plot(ys_gev.squeeze())
        plt.show()

    return ys_gev.squeeze()
Example #7
def create_json(wav_lst, json_file, clean_folder):
    """
    Creates the json file given a list of wav files.

    Arguments
    ---------
    wav_lst : list
        The list of wav files.
    json_file : str
        The path of the output json file
    clean_folder : str
        The location of parallel clean samples.
    """
    logger.debug(f"Creating json lists in {json_file}")

    # Processing all the wav files in the list
    json_dict = {}
    for wav_file in wav_lst:  # ex:p203_122.wav

        # Example wav_file: p232_001.wav
        noisy_path, filename = os.path.split(wav_file)
        _, noisy_dir = os.path.split(noisy_path)
        _, clean_dir = os.path.split(clean_folder)
        noisy_rel_path = os.path.join("{data_root}", noisy_dir, filename)
        clean_rel_path = os.path.join("{data_root}", clean_dir, filename)

        # Reading the signal (to retrieve duration in seconds)
        signal = read_audio(wav_file)
        duration = signal.shape[0] / SAMPLERATE

        # Read text
        snt_id = filename.replace(".wav", "")

        json_dict[snt_id] = {
            "noisy_wav": noisy_rel_path,
            "clean_wav": clean_rel_path,
            "length": duration,
        }

    # Writing the json lines
    with open(json_file, mode="w") as json_f:
        json.dump(json_dict, json_f, indent=2)

    logger.info(f"{json_file} successfully created!")
Example #8
def delay_and_sum(audio_file, show_plots=False):
    xs_speech = read_audio(audio_file)
    xs_speech = xs_speech.unsqueeze(0)

    stft = STFT(sample_rate=fs)
    cov = Covariance()
    gccphat = GccPhat()
    delaysum = DelaySum()
    istft = ISTFT(sample_rate=fs)

    Xs = stft(xs_speech)
    XXs = cov(Xs)
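    # GCC-PHAT estimates the time differences of arrival (TDOAs) between
    # microphones; DelaySum uses them to steer the beamformer.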
    tdoas = gccphat(XXs)
    Ys_ds = delaysum(Xs, tdoas)
    ys_ds = istft(Ys_ds)

    if show_plots:
        plt.figure(1)
        plt.title("Noisy signal at microphone 1")
        plt.imshow(
            torch.transpose(
                torch.log(Xs[0, :, :, 0, 0]**2 + Xs[0, :, :, 1, 0]**2), 1, 0),
            origin="lower",
        )
        plt.figure(2)
        plt.title("Noisy signal at microphone 1")
        plt.plot(xs_speech.squeeze()[:, 0])
        plt.figure(3)
        plt.title("Beamformed signal")
        plt.imshow(
            torch.transpose(
                torch.log(Ys_ds[0, :, :, 0, 0]**2 + Ys_ds[0, :, :, 1, 0]**2),
                1,
                0,
            ),
            origin="lower",
        )
        plt.figure(4)
        plt.title("Beamformed signal")
        plt.plot(ys_ds.squeeze())

    return ys_ds.squeeze()
Example #9
def create_json(
    wav_lst,
    json_file,
    uppercase,
    phn_set,
):
    """
    Creates the json file given a list of wav files.

    Arguments
    ---------
    wav_lst : list
        The list of wav files of a given data split.
    json_file : str
        The path of the output json file.
    uppercase : bool
        Whether this is the uppercase version of timit.
    phn_set : {60, 48, 39}, optional
        Default: 39
        The phoneme set to use in the phn label.
    """

    # Adding some Prints
    msg = "Creating %s..." % (json_file)
    logger.info(msg)
    json_dict = {}

    for wav_file in wav_lst:

        # Getting sentence and speaker ids
        spk_id = wav_file.split("/")[-2]
        snt_id = wav_file.split("/")[-1].replace(".wav", "")
        snt_id = spk_id + "_" + snt_id

        # Reading the signal (to retrieve duration in seconds)
        signal = read_audio(wav_file)
        duration = len(signal) / SAMPLERATE

        # Retrieving words and check for uppercase
        if uppercase:
            wrd_file = wav_file.replace(".WAV", ".WRD")
        else:
            wrd_file = wav_file.replace(".wav", ".wrd")

        if not os.path.exists(os.path.dirname(wrd_file)):
            err_msg = "the wrd file %s does not exists!" % (wrd_file)
            raise FileNotFoundError(err_msg)

        words = [line.rstrip("\n").split(" ")[2] for line in open(wrd_file)]
        words = " ".join(words)

        # Retrieving phonemes
        if uppercase:
            phn_file = wav_file.replace(".WAV", ".PHN")
        else:
            phn_file = wav_file.replace(".wav", ".phn")

        if not os.path.exists(os.path.dirname(phn_file)):
            err_msg = "the wrd file %s does not exists!" % (phn_file)
            raise FileNotFoundError(err_msg)

        # Getting the phoneme and ground truth ends lists from the phn files
        phonemes, ends = get_phoneme_lists(phn_file, phn_set)

        json_dict[snt_id] = {
            "wav": wav_file,
            "duration": duration,
            "spk_id": spk_id,
            "phn": phonemes,
            "wrd": words,
            "ground_truth_phn_ends": ends,
        }

    # Writing the dictionary to the json file
    with open(json_file, mode="w") as json_f:
        json.dump(json_dict, json_f, indent=2)

    logger.info(f"{json_file} successfully created!")
def create_json(metadata, audio_data_folder, folds_list, json_file):
    """
    Creates the json file given a list of wav files.
    Arguments
    ---------
    metadata: dict
        A dictionary containing the UrbanSound8k metadata file modified for
        SpeechBrain, such that keys are IDs (which are the .wav file names without the file extension).
    audio_data_folder : str
        The path to the folder containing the audio files.
    folds_list : list of int
        The list of folds [1,10] to include in this batch
    json_file : str
        The path of the output json file
    """
    # Processing all the wav files in the list
    json_dict = {}

    for ID, sample_metadata in metadata.items():
        fold_num = int(sample_metadata["fold"])
        if fold_num in folds_list:
            # Reading the signal (to retrieve duration in seconds)
            wav_file = os.path.join(
                os.path.abspath(audio_data_folder),
                "fold" + str(fold_num) + "/",
                sample_metadata["slice_file_name"],
            )
            try:

                signal = read_audio(wav_file)
                file_info = torchaudio.info(wav_file)

                # If we're using sox/soundfile backend, file_info will have the old type
                if isinstance(file_info,
                              torchaudio.backend.common.AudioMetaData):
                    duration = signal.shape[0] / file_info.sample_rate
                else:
                    duration = signal.shape[0] / file_info[0].rate

                # Create entry for this sample ONLY if we have successfully read-in the file using SpeechBrain/torchaudio
                json_dict[ID] = {
                    "wav": sample_metadata["slice_file_name"],
                    "classID": int(sample_metadata["classID"]),
                    "class_string": sample_metadata["class_string"],
                    "salience": int(sample_metadata["salience"]),
                    "fold": sample_metadata["fold"],
                    "duration": duration,
                }
            except Exception:
                print(
                    f"There was a problem reading the file: {wav_file}. Skipping it."
                )
                logger.exception(
                    f"There was a problem reading the file: {wav_file}. Skipping it."
                )

    # Writing the dictionary to the json file
    # Need to make sure sub folder "manifest" exists, if not create it
    parent_dir = os.path.dirname(json_file)
    if not os.path.exists(parent_dir):
        os.mkdir(parent_dir)

    with open(json_file, mode="w") as json_f:
        json.dump(json_dict, json_f, indent=2)

    logger.info(f"{json_file} successfully created!")
Example #11
def create_csv(
    wav_lst,
    csv_file,
    uppercase,
    data_folder,
    phn_set,
    kaldi_lab=None,
    kaldi_lab_opts=None,
    kaldi_lab_dir=None,
):
    """
    Creates the csv file given a list of wav files.

    Arguments
    ---------
    wav_lst : list
        The list of wav files of a given data split.
    csv_file : str
        The path of the output csv file
    uppercase : bool
        Whether this is the uppercase version of timit.
    data_folder : str
        The location of the data.
    phn_set : {60, 48, 39}, optional
        Default: 39
        The phoneme set to use in the phn label.
    kaldi_lab : str, optional
        Default: None
        The path of the kaldi labels (optional).
    kaldi_lab_opts : str, optional
        Default: None
        A string containing the options used to compute the labels.
    kaldi_lab_dir : str, optional
        Default: None
        The directory where the kaldi labels are saved.

    Returns
    -------
    None
    """

    # Adding some Prints
    msg = "Creating csv lists in  %s..." % (csv_file)
    logger.info(msg)

    # Reading kaldi labels if needed:
    snt_no_lab = 0
    missing_lab = False

    if kaldi_lab is not None:

        lab = read_kaldi_lab(
            kaldi_lab,
            kaldi_lab_opts,
        )

        if not os.path.exists(kaldi_lab_dir):
            os.makedirs(kaldi_lab_dir)

    csv_lines = [[
        "ID",
        "duration",
        "wav",
        "wav_format",
        "wav_opts",
        "spk_id",
        "spk_id_format",
        "spk_id_opts",
        "phn",
        "phn_format",
        "phn_opts",
        "wrd",
        "wrd_format",
        "wrd_opts",
        "ground_truth_phn_ends",
        "ground_truth_phn_ends_format",
        "ground_truth_phn_ends_opts",
    ]]

    if kaldi_lab is not None:
        csv_lines[0].append("kaldi_lab")
        csv_lines[0].append("kaldi_lab_format")
        csv_lines[0].append("kaldi_lab_opts")

    # Processing all the wav files in the list
    for wav_file in wav_lst:

        # Getting sentence and speaker ids
        spk_id = wav_file.split("/")[-2]
        snt_id = wav_file.split("/")[-1].replace(".wav", "")
        snt_id = spk_id + "_" + snt_id

        if kaldi_lab is not None:
            if snt_id not in lab.keys():
                missing_lab = True
                msg = ("The sentence %s does not have a corresponding "
                       "kaldi label" % (snt_id))

                logger.info(msg)
                snt_no_lab = snt_no_lab + 1
            else:
                snt_lab_path = os.path.join(kaldi_lab_dir, snt_id + ".pkl")
                save_pkl(lab[snt_id], snt_lab_path)

            # If too many kaldi labels are missing, log an error
            if snt_no_lab / len(wav_lst) > 0.05:
                err_msg = ("Too many sentences do not have the "
                           "corresponding kaldi label. Please check data and "
                           "kaldi labels (check %s and %s)." %
                           (data_folder, kaldi_lab))
                logger.error(err_msg)

        if missing_lab:
            missing_lab = False
            continue

        # Reading the signal (to retrieve duration in seconds)
        signal = read_audio(wav_file)
        duration = len(signal) / SAMPLERATE

        # Retrieving words and check for uppercase
        if uppercase:
            wrd_file = wav_file.replace(".WAV", ".WRD")
        else:
            wrd_file = wav_file.replace(".wav", ".wrd")
        if not os.path.exists(os.path.dirname(wrd_file)):
            err_msg = "the wrd file %s does not exists!" % (wrd_file)
            raise FileNotFoundError(err_msg)

        words = [line.rstrip("\n").split(" ")[2] for line in open(wrd_file)]
        words = " ".join(words)

        # Retrieving phonemes
        if uppercase:
            phn_file = wav_file.replace(".WAV", ".PHN")
        else:
            phn_file = wav_file.replace(".wav", ".phn")

        if not os.path.exists(os.path.dirname(phn_file)):
            err_msg = "the wrd file %s does not exists!" % (phn_file)
            raise FileNotFoundError(err_msg)

        # Getting the phoneme and ground truth ends lists from the phn files
        phonemes, ends = get_phoneme_lists(phn_file, phn_set)

        # Composition of the csv_line
        csv_line = [
            snt_id,
            str(duration),
            wav_file,
            "wav",
            "",
            spk_id,
            "string",
            "",
            str(phonemes),
            "string",
            "",
            str(words),
            "string",
            "label:False",
            str(ends),
            "string",
            "label:False",
        ]

        if kaldi_lab is not None:
            csv_line.append(snt_lab_path)
            csv_line.append("pkl")
            csv_line.append("")

        # Adding this line to the csv_lines list
        csv_lines.append(csv_line)

    # Writing the csv lines
    _write_csv(csv_lines, csv_file)
    msg = "%s sucessfully created!" % (csv_file)
    logger.info(msg)
Example #12
def get_wsj_files(wsj0root, output_dir, save_fs="wav8k", min_maxs=["min"]):
    """
    This function constructs the wsj0-2mix dataset out of wsj0 dataset.
    (We are assuming that we have the wav files and not the sphere format)

    Arguments:
        wsj0root (str): This string specifies the root folder for the wsj0 dataset.
        output_dir (str): The string that specifies the save folder.
        save_fs (str): The string that specifies the saving sampling frequency, in ['wav8k', 'wav16k']
        min_maxs (list): The list that contains the specification on whether we take min. or max. of signals
                         to construct the mixtures. example: ["min", "max"]
    """

    data_types = ["tr", "cv", "tt"]  # train, valid and test sets

    from oct2py import octave

    filedir = os.path.dirname(os.path.realpath(__file__))
    octave.addpath(
        filedir + "/meta"
    )  # add the matlab functions to octave dir here

    fs_read = 8000 if save_fs == "wav8k" else 16000

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if not os.path.exists(os.path.join(output_dir, save_fs)):
        os.mkdir(os.path.join(output_dir, save_fs))

    log_dir = os.path.join(output_dir, save_fs + "/mixture_definitions_log")
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    # get the text files in the current working directory
    filelinks = [
        "https://www.dropbox.com/s/u5gk5h3htzw4cgo/mix_2_spk_tr.txt?dl=1",
        "https://www.dropbox.com/s/s3s6311d95n4sip/mix_2_spk_cv.txt?dl=1",
        "https://www.dropbox.com/s/9kdxb2uz18a5k9d/mix_2_spk_tt.txt?dl=1",
    ]
    for filelink, data_type in zip(filelinks, data_types):
        filepath = os.path.join(
            filedir, "meta", "mix_2_spk_" + data_type + ".txt"
        )
        if not os.path.exists(filepath):
            download_file(filelink, filepath)

    inner_folders = ["s1", "s2", "mix"]
    for min_max in min_maxs:
        for data_type in data_types:
            save_dir = os.path.join(
                output_dir, save_fs + "/" + min_max + "/" + data_type
            )

            if not os.path.exists(
                os.path.join(output_dir, save_fs + "/" + min_max)
            ):
                os.mkdir(os.path.join(output_dir, save_fs + "/" + min_max))

            if not os.path.exists(save_dir):
                os.mkdir(save_dir)

            for inner_folder in inner_folders:
                if not os.path.exists(os.path.join(save_dir, inner_folder)):
                    os.mkdir(os.path.join(save_dir, inner_folder))

            TaskFile = os.path.join(
                filedir, "meta", "mix_2_spk_" + data_type + ".txt"
            )
            Source1File, Source2File, MixFile, C = arrange_task_files(
                TaskFile, min_max, data_type, log_dir
            )

            fid_s1 = open(Source1File, "w")
            fid_s2 = open(Source2File, "w")
            fid_m = open(MixFile, "w")

            num_files = len(C)

            print("{} \n".format(min_max + "_" + data_type))

            for i, line in tqdm(enumerate(C)):

                _, inwav1_dir, _, inwav1_name = line[0].split("/")
                _, inwav2_dir, _, inwav2_name = line[2].split("/")

                # write the log data to the log files
                fid_s1.write("{}\n".format(line[0]))
                fid_s2.write("{}\n".format(line[2]))

                inwav1_snr = line[1]
                inwav2_snr = line[3]

                mix_name = (
                    inwav1_name
                    + "_"
                    + str(inwav1_snr)
                    + "_"
                    + inwav2_name
                    + "_"
                    + str(inwav2_snr)
                )
                fid_m.write("{}\n".format(mix_name))

                fs, _ = wavfile.read(os.path.join(wsj0root, line[0]))
                s1 = read_audio(os.path.join(wsj0root, line[0]))
                s2 = read_audio(os.path.join(wsj0root, line[2]))

                # resample, determine levels for source 1
                s1_8k = signal.resample(s1, int((fs_read / fs) * len(s1)))
                out = octave.activlev(s1_8k, fs_read, "n")
                s1_8k, lev1 = out[:-1].squeeze(), out[-1]
                # print('lev1 {}'.format(lev1))

                # resample, determine levels for source 2
                s2_8k = signal.resample(s2, int((fs_read / fs) * len(s2)))
                out = octave.activlev(s2_8k, fs_read, "n")
                s2_8k, lev2 = out[:-1].squeeze(), out[-1]

                weight_1 = 10 ** (float(inwav1_snr) / 20)
                weight_2 = 10 ** (float(inwav2_snr) / 20)

                # apply the SNR gains and save the mixture at the chosen rate
                if save_fs == "wav8k":
                    s1_8k = weight_1 * s1_8k
                    s2_8k = weight_2 * s2_8k

                    scaling_8k, scaling16bit_8k = save_mixture(
                        s1_8k,
                        s2_8k,
                        min_max,
                        weight_1,
                        weight_2,
                        num_files,
                        lev1,
                        lev2,
                        save_fs,
                        output_dir,
                        data_type,
                        mix_name,
                        i,
                    )
                elif save_fs == "wav16k":
                    s1_16k = weight_1 * s1 / np.sqrt(lev1)
                    s2_16k = weight_2 * s2 / np.sqrt(lev2)

                    scaling_16k, scaling16bit_16k = save_mixture(
                        s1_16k,
                        s2_16k,
                        min_max,
                        weight_1,
                        weight_2,
                        num_files,
                        lev1,
                        lev2,
                        save_fs,
                        output_dir,
                        data_type,
                        mix_name,
                        i,
                    )
                else:
                    raise ValueError("Incorrect sampling frequency for saving")

            if save_fs == "wav8k":
                pickle.dump(
                    {
                        "scaling_8k": scaling_8k,
                        "scaling8bit_8k": scaling16bit_8k,
                    },
                    open(
                        output_dir
                        + "/"
                        + save_fs
                        + "/"
                        + min_max
                        + "/"
                        + data_type
                        + "/scaling.pkl",
                        "wb",
                    ),
                )
            elif save_fs == "wav16k":
                pickle.dump(
                    {
                        "scaling_16k": scaling_16k,
                        "scaling16bit_16k": scaling16bit_16k,
                    },
                    open(
                        output_dir
                        + "/"
                        + save_fs
                        + "/"
                        + min_max
                        + "/"
                        + data_type
                        + "/scaling.pkl",
                        "wb",
                    ),
                )
            else:
                raise ValueError("Incorrect sampling frequency for saving")
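As a side note on the mixing weights above: an SNR value in dB is converted to a linear amplitude gain with gain = 10 ** (snr_db / 20). A quick sanity check of that formula:

# +6 dB roughly doubles the amplitude; -6 dB roughly halves it.
assert abs(10 ** (6 / 20) - 2.0) < 0.01   # 1.995...
assert abs(10 ** (-6 / 20) - 0.5) < 0.01  # 0.501...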
Example #13
def create_csv(
    csv_file,
    wav_lst,
    seg_size=None,
    has_target=False,
    noise_csv=None,
    noisy_folder=None,
    noise_snr_low=0,
    noise_snr_high=0,
):
    """
    Creates the csv file given a list of wav files.

    Arguments
    ---------
    csv_file : str
        The path of the output csv file
    wav_lst : list
        The list of wav files of a given data split.
    seg_size : int
        Split the file into multiple fixed-length segments of this size (in seconds, as used below).
    has_target : bool
        Whether clean utterances are present in a similar directory.
    noise_csv : str
        A set of noise files to mix with the signals in `wav_lst`.
    noisy_folder : str
        A location for storing the mixed samples, if `noise_csv` is provided.
    noise_snr_low : float
        The lowest amplitude ratio to use when mixing `noise_csv`.
    noise_snr_high : float
        The highest amplitude ratio to use when mixing `noise_csv`.
    """

    if noise_csv and has_target:
        raise ValueError("Expected only one of `noise_csv` and `has_target`")

    logger.info("Creating csv list: %s" % csv_file)

    csv_lines = [["ID", "duration", "wav", "wav_format", "wav_opts"]]
    if noise_csv or has_target:
        csv_lines[0].extend(["target", "target_format", "target_opts"])

    if noise_csv:
        if not os.path.exists(noisy_folder):
            os.makedirs(noisy_folder)

        noise_adder = AddNoise(
            csv_file=noise_csv,
            snr_low=noise_snr_low,
            snr_high=noise_snr_high,
            pad_noise=True,
            normalize=True,
        )

    # Processing all the wav files in the list
    fileid = 0
    for wav_file in wav_lst:
        full_file_name = os.path.basename(wav_file)

        if has_target:
            fileid = full_file_name.split("_")[-1]
            target_folder = os.path.join(
                os.path.split(os.path.split(wav_file)[0])[0], "clean")
            target_file = os.path.join(target_folder, "clean_fileid_" + fileid)

        # Reading the signal (to retrieve duration in seconds)
        signal = read_audio(wav_file)
        duration = signal.shape[0] / SAMPLERATE

        if noise_csv:
            target = torch.Tensor(signal).unsqueeze(0)
            signal = noise_adder(target, torch.ones(1))
            filepath = os.path.join(noisy_folder, full_file_name)
            torchaudio.save(filepath, signal, SAMPLERATE)
            target_file = wav_file
            wav_file = filepath

        # Composition of the csv_line
        if not seg_size or duration < seg_size:
            csv_line = [full_file_name, str(duration), wav_file, "wav", ""]
            if noise_csv or has_target:
                csv_line.extend([target_file, "wav", ""])
            csv_lines.append(csv_line)

        else:
            for idx in range(int(duration // seg_size)):
                start = int(idx * seg_size * SAMPLERATE)
                stop = int((idx + 1) * seg_size * SAMPLERATE)
                csv_line = [
                    full_file_name + str(idx),
                    str(seg_size),
                    wav_file,
                    "wav",
                    "start:{} stop:{}".format(start, stop),
                ]
                if noise_csv or has_target:
                    csv_line.extend([
                        target_file,
                        "wav",
                        "start:{} stop:{}".format(start, stop),
                    ])

                # Adding this line to the csv_lines list
                csv_lines.append(csv_line)

    # Writing the csv lines
    _write_csv(csv_lines, csv_file)
    logger.info("%s successfully created!" % csv_file)
Example #14
def prepare_TAS(data_folder, save_folder, type, train_splits, skip_prep=False):
    """
    This function prepares the Timers and Such dataset.
    If the folder does not exist, the zip file will be extracted. If the zip file does not exist, it will be downloaded.

    data_folder : path to Timers and Such dataset.
    save_folder: path where to save the csv manifest files.
    type : one of the following:

      "direct":{input=audio, output=semantics}
      "multistage":{input=audio, output=semantics} (using ASR transcripts in the middle)
      "decoupled":{input=transcript, output=semantics} (using ground-truth transcripts)

    train_splits : list of splits to be joined to form train .csv
    skip_prep: If True, skip data preparation

    """
    if skip_prep:
        return
    if type == "decoupled":
        try:
            import inflect

            p = inflect.engine()
        except ModuleNotFoundError:
            logger.info(
                'Error: the inflect module must be installed to run the "decoupled" SLU recipe.'
            )
            logger.info("Install using `pip install inflect`.")
            raise

    # If the data folders do not exist, we need to extract the data
    if not os.path.isdir(os.path.join(data_folder, "train-synth")):
        # Check for zip file and download if it doesn't exist
        zip_location = os.path.join(data_folder, "timers-and-such.zip")
        if not os.path.exists(zip_location):
            url = "https://zenodo.org/record/4623772/files/timers-and-such-v1.0.zip?download=1"
            download_file(url, zip_location, unpack=True)
        else:
            logger.info("Extracting timers-and-such.zip...")
            shutil.unpack_archive(zip_location, data_folder)

    splits = [
        "train-real",
        "dev-real",
        "test-real",
        "train-synth",
        "dev-synth",
        "test-synth",
    ]
    ID_start = 0  # needed to have a unique ID for each audio
    for split in splits:
        new_filename = os.path.join(save_folder, split) + "-type=%s.csv" % type
        if os.path.exists(new_filename):
            continue
        logger.info("Preparing %s..." % new_filename)

        ID = []
        duration = []

        wav = []
        wav_format = []
        wav_opts = []

        spk_id = []
        spk_id_format = []
        spk_id_opts = []

        semantics = []
        semantics_format = []
        semantics_opts = []

        transcript = []
        transcript_format = []
        transcript_opts = []

        df = pd.read_csv(os.path.join(data_folder, split) + ".csv")
        for i in range(len(df)):
            ID.append(ID_start + i)
            signal = read_audio(os.path.join(data_folder, df.path[i]))
            duration.append(signal.shape[0] / 16000)

            wav.append(os.path.join(data_folder, df.path[i]))
            wav_format.append("wav")
            wav_opts.append(None)

            spk_id.append(df.speakerId[i])
            spk_id_format.append("string")
            spk_id_opts.append(None)

            transcript_ = df.transcription[i]
            if type == "decoupled":
                words = transcript_.split()
                for w in range(len(words)):
                    words[w] = words[w].upper()
                    # If the word is numeric, we need to convert it to letters, to match what the ASR would output.
                    if any(c.isdigit() for c in words[w]):
                        if "AM" in words[w] or "PM" in words[w]:
                            AM_or_PM = "A M" if "AM" in words[w] else "P M"
                            if ":" in words[w]:
                                hour = words[w].split(":")[0]
                                minute = (
                                    words[w].split(":")[1].split("AM")[0]
                                    if "AM" in words[w] else
                                    words[w].split(":")[1].split("PM")[0])
                                words[w] = (p.number_to_words(hour).upper() +
                                            " " +
                                            p.number_to_words(minute).upper() +
                                            " " + AM_or_PM)
                            else:
                                hour = (words[w].split("AM")[0]
                                        if "AM" in words[w] else
                                        words[w].split("PM")[0])
                                words[w] = (p.number_to_words(hour).upper() +
                                            " " + AM_or_PM)
                        else:
                            words[w] = p.number_to_words(words[w]).upper()
                transcript_ = " ".join(words).replace("-", " ")

            transcript.append(transcript_)
            transcript_format.append("string")
            transcript_opts.append(None)

            semantics_ = df.semantics[i].replace(
                ".3333333333333333",
                ".33")  # Fix formatting error in some labels
            if type == "direct" or type == "multistage" or type == "decoupled":
                semantics.append(semantics_)
            if type == "joint-transcript-semantics":
                semantics.append("{'transcript': '" + transcript_ + "'| " +
                                 semantics_[1:])
            if type == "joint-semantics-transcript":
                semantics.append(semantics_[:-1] + "| 'transcript': '" +
                                 transcript_ + "'}")
            semantics_format.append("string")
            semantics_opts.append(None)

        new_df = pd.DataFrame({
            "ID": ID,
            "duration": duration,
            "wav": wav,
            "spk_id": spk_id,
            "semantics": semantics,
            "transcript": transcript,
        })
        new_df.to_csv(new_filename, index=False)
        ID_start += len(df)

    # Merge train splits
    train_splits = [split + "-type=%s.csv" % type for split in train_splits]
    merge_csvs(save_folder, train_splits, "train-type=%s.csv" % type)

    # Create "all-real" split
    real_splits = [
        split + "-type=%s.csv" % type
        for split in ["train-real", "dev-real", "test-real"]
    ]
    merge_csvs(save_folder, real_splits, "all-real-type=%s.csv" % type)
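To illustrate the time-normalization branch above (assuming the inflect package is installed), converting a hypothetical token "3:30PM" proceeds roughly as:

import inflect

p = inflect.engine()
hour, minute = "3", "30"  # parsed from "3:30PM" as in the loop above
converted = (p.number_to_words(hour).upper() + " "
             + p.number_to_words(minute).upper() + " P M")
assert converted == "THREE THIRTY P M"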
Example #15
def create_csv(wav_lst, csv_file, clean_folder, txt_folder, lexicon):
    """
    Creates the csv file given a list of wav files.

    Arguments
    ---------
    wav_lst : list
        The list of wav files.
    csv_file : str
        The path of the output csv file
    clean_folder : str
        The location of parallel clean samples.
    txt_folder : str
        The location of the transcript files.
    """
    logger.debug(f"Creating csv lists in {csv_file}")

    csv_lines = [["ID", "duration"]]
    csv_lines[0].extend(["noisy_wav", "noisy_wav_format", "noisy_wav_opts"])
    csv_lines[0].extend(["clean_wav", "clean_wav_format", "clean_wav_opts"])
    csv_lines[0].extend(["wrd", "wrd_format", "wrd_opts"])
    csv_lines[0].extend(["phn", "phn_format", "phn_opts"])
    csv_lines[0].extend(["biphn", "biphn_format", "biphn_opts"])

    # Processing all the wav files in the list
    for wav_file in wav_lst:  # ex:p203_122.wav

        # Example wav_file: p232_001.wav
        snt_id = os.path.basename(wav_file).replace(".wav", "")
        clean_wav = os.path.join(clean_folder, snt_id + ".wav")

        # Reading the signal (to retrieve duration in seconds)
        signal = read_audio(wav_file)
        duration = signal.shape[0] / SAMPLERATE

        # Read text
        with open(os.path.join(txt_folder, snt_id + ".txt")) as f:
            words = f.read()
        words = remove_punctuation(words).strip().upper()
        phones = " ".join([lexicon[word] for word in words.split()])

        biphones = zip(["<B>"] + phones.split(), phones.split() + ["<E>"])
        biphones = " ".join(phn1 + phn2 for phn1, phn2 in biphones)

        # Composition of the csv_line
        csv_line = [snt_id, str(duration)]
        csv_line.extend([wav_file, "wav", ""])
        csv_line.extend([clean_wav, "wav", ""])
        csv_line.extend([words, "string", ""])
        csv_line.extend([phones, "string", ""])
        csv_line.extend([biphones, "string", ""])

        # Adding this line to the csv_lines list
        csv_lines.append(csv_line)

    # Writing the csv lines
    with open(csv_file, mode="w") as csv_f:
        csv_writer = csv.writer(csv_f,
                                delimiter=",",
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)

        for line in csv_lines:
            csv_writer.writerow(line)

    print(f"{csv_file} successfully created!")
Example #16
def prepare_SLURP(data_folder,
                  save_folder,
                  slu_type,
                  train_splits,
                  skip_prep=False):
    """
    This function prepares the SLURP dataset.
    If the folder does not exist, the zip file will be extracted. If the zip file does not exist, it will be downloaded.

    data_folder : path to SLURP dataset.
    save_folder: path where to save the csv manifest files.
    slu_type : one of the following:

      "direct":{input=audio, output=semantics}
      "multistage":{input=audio, output=semantics} (using ASR transcripts in the middle)
      "decoupled":{input=transcript, output=semantics} (using ground-truth transcripts)

    train_splits : list of splits to be joined to form train .csv
    skip_prep: If True, data preparation is skipped.
    """
    if skip_prep:
        return
    # If the data folders do not exist, we need to download/extract the data
    if not os.path.isdir(os.path.join(data_folder, "slurp_synth")):
        # Check for zip file and download if it doesn't exist
        zip_location = os.path.join(data_folder, "slurp_synth.tar.gz")
        if not os.path.exists(zip_location):
            url = "https://zenodo.org/record/4274930/files/slurp_synth.tar.gz?download=1"
            download_file(url, zip_location, unpack=True)
        else:
            print("Extracting slurp_synth...")
            shutil.unpack_archive(zip_location, data_folder)

    if not os.path.isdir(os.path.join(data_folder, "slurp_real")):
        # Check for zip file and download if it doesn't exist
        zip_location = os.path.join(data_folder, "slurp_real.tar.gz")
        if not os.path.exists(zip_location):
            url = "https://zenodo.org/record/4274930/files/slurp_real.tar.gz?download=1"
            download_file(url, zip_location, unpack=True)
        else:
            print("Extracting slurp_real...")
            shutil.unpack_archive(zip_location, data_folder)

    splits = [
        "train_real",
        "train_synthetic",
        "devel",
        "test",
    ]
    id = 0
    for split in splits:
        new_filename = (os.path.join(save_folder, split) +
                        "-type=%s.csv" % slu_type)
        if os.path.exists(new_filename):
            continue
        print("Preparing %s..." % new_filename)

        IDs = []
        duration = []

        wav = []
        wav_format = []
        wav_opts = []

        semantics = []
        semantics_format = []
        semantics_opts = []

        transcript = []
        transcript_format = []
        transcript_opts = []

        jsonl_path = os.path.join(data_folder, split + ".jsonl")
        if not os.path.isfile(jsonl_path):
            if split == "train_real":
                url_split = "train"
            else:
                url_split = split
            url = (
                "https://github.com/pswietojanski/slurp/raw/master/dataset/slurp/"
                + url_split + ".jsonl")
            download_file(url, jsonl_path, unpack=False)

        with jsonlines.open(jsonl_path) as reader:
            for obj in reader:
                scenario = obj["scenario"]
                action = obj["action"]
                sentence_annotation = obj["sentence_annotation"]
                num_entities = sentence_annotation.count("[")
                entities = []
                for slot in range(num_entities):
                    type = (sentence_annotation.split("[")[slot + 1].split("]")
                            [0].split(":")[0].strip())
                    filler = (sentence_annotation.split("[")[slot + 1].split(
                        "]")[0].split(":")[1].strip())
                    entities.append({"type": type, "filler": filler})
                for recording in obj["recordings"]:
                    IDs.append(id)
                    if "synthetic" in split:
                        audio_folder = "slurp_synth/"
                    else:
                        audio_folder = "slurp_real/"
                    path = os.path.join(data_folder, audio_folder,
                                        recording["file"])
                    signal = read_audio(path)
                    duration.append(signal.shape[0] / 16000)

                    wav.append(path)
                    wav_format.append("flac")
                    wav_opts.append(None)

                    transcript_ = obj["sentence"]
                    if slu_type == "decoupled":
                        transcript_ = transcript_.upper()
                    transcript.append(transcript_)
                    transcript_format.append("string")
                    transcript_opts.append(None)

                    semantics_dict = {
                        "scenario": scenario,
                        "action": action,
                        "entities": entities,
                    }
                    semantics_ = str(semantics_dict).replace(
                        ",", "|"
                    )  # Commas in dict will make using csv files tricky; replace with pipe.
                    semantics.append(semantics_)
                    semantics_format.append("string")
                    semantics_opts.append(None)
                    id += 1

        df = pd.DataFrame({
            "ID": IDs,
            "duration": duration,
            "wav": wav,
            "semantics": semantics,
            "transcript": transcript,
        })
        df.to_csv(new_filename, index=False)

    # Merge train splits
    train_splits = [
        split + "-type=%s.csv" % slu_type for split in train_splits
    ]
    merge_csvs(save_folder, train_splits, "train-type=%s.csv" % slu_type)
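To make the bracket parsing in the loop above concrete: SLURP's sentence_annotation marks entities as [type : filler], and the split-based extraction recovers them as dicts. A standalone example with a made-up annotation:

sentence_annotation = "set an alarm for [time : nine am]"
entities = []
for slot in range(sentence_annotation.count("[")):
    span = sentence_annotation.split("[")[slot + 1].split("]")[0]
    entities.append({
        "type": span.split(":")[0].strip(),
        "filler": span.split(":")[1].strip(),
    })
assert entities == [{"type": "time", "filler": "nine am"}]
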
dir_test = set(dir_test)
dir_test = [i[len("TIMIT_4_channels/test/") :] for i in dir_test]
dir_test = [i for i in dir_test if len(i) > 3]

for i in dir_test:
    os.makedirs("TIMIT_combined/test/" + i)

for i in range(len(train_df)):
    if i % 4 == 0:
        fname = (
            train_df["location"][i][len("TIMIT_4_channels/train/") :]
            .split(".")[0]
            .split("_")[0]
            + ".wav"
        )
        mic1 = read_audio(train_df["location"][i])
        mic2 = read_audio(train_df["location"][i + 1])
        mic3 = read_audio(train_df["location"][i + 2])
        mic4 = read_audio(train_df["location"][i + 3])
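        # Stack the four mono channels into a [time, channels] tensor.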
        sa = torch.stack((mic1, mic2, mic3, mic4)).transpose(0, 1)
        write_audio("TIMIT_combined/train/" + fname, sa, samplerate=fs)

for i in range(len(test_df)):
    if i % 4 == 0:
        fname = (
            test_df["location"][i][len("TIMIT_4_channels/test/") :]
            .split(".")[0]
            .split("_")[0]
            + ".wav"
        )
        mic1 = read_audio(test_df["location"][i])
        mic2 = read_audio(test_df["location"][i + 1])
        mic3 = read_audio(test_df["location"][i + 2])
        mic4 = read_audio(test_df["location"][i + 3])
        # Stack the four mono channels into a [time, channels] tensor.
        sa = torch.stack((mic1, mic2, mic3, mic4)).transpose(0, 1)
        write_audio("TIMIT_combined/test/" + fname, sa, samplerate=fs)
Example #18
def prepare_aishell(data_folder, save_folder, skip_prep=False):
    """
    This function prepares the AISHELL-1 dataset.
    If the folder does not exist, the zip file will be extracted. If the zip file does not exist, it will be downloaded.

    data_folder : path to AISHELL-1 dataset.
    save_folder: path where to store the manifest csv files.
    skip_prep: If True, skip data preparation.

    """
    if skip_prep:
        return

    # If the data folders do not exist, we need to extract the data
    if not os.path.isdir(os.path.join(data_folder, "data_aishell/wav")):
        # Check for zip file and download if it doesn't exist
        zip_location = os.path.join(data_folder, "data_aishell.tgz")
        if not os.path.exists(zip_location):
            url = "https://www.openslr.org/resources/33/data_aishell.tgz"
            download_file(url, zip_location, unpack=True)
        logger.info("Extracting data_aishell.tgz...")
        shutil.unpack_archive(zip_location, data_folder)
        wav_dir = os.path.join(data_folder, "data_aishell/wav")
        tgz_list = glob.glob(wav_dir + "/*.tar.gz")
        for tgz in tgz_list:
            shutil.unpack_archive(tgz, wav_dir)
            os.remove(tgz)

    # Create filename-to-transcript dictionary
    filename2transcript = {}
    with open(
        os.path.join(
            data_folder, "data_aishell/transcript/aishell_transcript_v0.8.txt"
        ),
        "r",
    ) as f:
        lines = f.readlines()
        for line in lines:
            key = line.split()[0]
            value = " ".join(line.split()[1:])
            filename2transcript[key] = value

    splits = [
        "train",
        "dev",
        "test",
    ]
    ID_start = 0  # needed to have a unique ID for each audio
    for split in splits:
        new_filename = os.path.join(save_folder, split) + ".csv"
        if os.path.exists(new_filename):
            continue
        logger.info("Preparing %s..." % new_filename)

        ID = []
        duration = []

        wav = []
        wav_format = []
        wav_opts = []

        # spk_id = []
        # spk_id_format = []
        # spk_id_opts = []

        transcript = []
        transcript_format = []
        transcript_opts = []

        all_wavs = glob.glob(
            os.path.join(data_folder, "data_aishell/wav")
            + "/"
            + split
            + "/*/*.wav"
        )
        for i in range(len(all_wavs)):
            filename = all_wavs[i].split("/")[-1].split(".wav")[0]
            if filename not in filename2transcript:
                continue
            transcript_ = filename2transcript[filename]
            transcript.append(transcript_)
            transcript_format.append("string")
            transcript_opts.append(None)

            ID.append(ID_start + i)

            signal = read_audio(all_wavs[i])
            duration.append(signal.shape[0] / 16000)

            wav.append(all_wavs[i])
            wav_format.append("wav")
            wav_opts.append(None)

            # spk_id.append(df.speakerId[i])
            # spk_id_format.append("string")
            # spk_id_opts.append(None)

        new_df = pd.DataFrame(
            {
                "ID": ID,
                "duration": duration,
                "wav": wav,
                "transcript": transcript,
            }
        )
        new_df.to_csv(new_filename, index=False)
        ID_start += len(all_wavs)
def prepare_FSC(data_folder, skip_prep=False):
    """
    This function prepares the Fluent Speech Commands dataset.

    data_folder : path to dataset.
    skip_prep: If True, skip data preparation

    """
    if skip_prep:
        return

    splits = [
        "train",
        "valid",
        "test",
    ]
    ID_start = 0  # needed to have a unique ID for each audio
    for split in splits:
        new_filename = os.path.join(data_folder, split) + ".csv"
        if os.path.exists(new_filename):
            continue
        logger.info("Preparing %s..." % new_filename)

        ID = []
        duration = []

        wav = []
        wav_format = []
        wav_opts = []

        spk_id = []
        spk_id_format = []
        spk_id_opts = []

        semantics = []
        semantics_format = []
        semantics_opts = []

        transcript = []
        transcript_format = []
        transcript_opts = []

        df = pd.read_csv(
            os.path.join(data_folder, "data", split) + "_data.csv")
        for i in range(len(df)):
            ID.append(ID_start + i)
            signal = read_audio(os.path.join(data_folder, df.path[i]))
            duration.append(signal.shape[0] / 16000)

            wav.append(os.path.join(data_folder, df.path[i]))
            wav_format.append("wav")
            wav_opts.append(None)

            spk_id.append(df.speakerId[i])
            spk_id_format.append("string")
            spk_id_opts.append(None)

            transcript_ = df.transcription[i]
            transcript.append(transcript_)
            transcript_format.append("string")
            transcript_opts.append(None)

            semantics_ = ('{"action:" "' + df.action[i] + '"| "object": "' +
                          df.object[i] + '"| "location": "' + df.location[i] +
                          '"}')
            semantics.append(semantics_)
            semantics_format.append("string")
            semantics_opts.append(None)

        new_df = pd.DataFrame({
            "ID": ID,
            "duration": duration,
            "wav": wav,
            "wav_format": wav_format,
            "wav_opts": wav_opts,
            "spk_id": spk_id,
            "spk_id_format": spk_id_format,
            "spk_id_opts": spk_id_opts,
            "semantics": semantics,
            "semantics_format": semantics_format,
            "semantics_opts": semantics_opts,
            "transcript": transcript,
            "transcript_format": transcript_format,
            "transcript_opts": transcript_opts,
        })
        new_df.to_csv(new_filename, index=False)
        ID_start += len(df)
Example #20
def audio_pipeline(wav):
    sig = read_audio(wav)
    return sig
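
In SpeechBrain recipes, a pipeline like this is usually registered as a dynamic item; a minimal sketch of the typical wiring (assuming the standard speechbrain.utils.data_pipeline decorators):

import speechbrain as sb
from speechbrain.dataio.dataio import read_audio

@sb.utils.data_pipeline.takes("wav")
@sb.utils.data_pipeline.provides("sig")
def audio_pipeline(wav):
    sig = read_audio(wav)
    return sig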