def save_spectrogram_tdsv():
    """Select text-specific utterances and perform STFT on each audio file.

    Spectrograms are grouped per speaker; the first 90% of speaker indices
    go to ``config.train_path`` and the rest to ``config.test_path``, each
    saved as ``speaker<idx>.npy``.
    Need : utterance data set (VCTK)
    """
    print("start text dependent utterance selection")
    os.makedirs(config.train_path, exist_ok=True)   # make folder to save train file
    os.makedirs(config.test_path, exist_ok=True)    # make folder to save test file

    # Count speakers across every source directory up front, so the
    # 90%/10% train/test split can be decided by speaker index alone.
    total_speaker_num = 0
    for audio_path in audio_paths:
        total_speaker_num += len(os.listdir(audio_path))
    train_speaker_num = (total_speaker_num // 10) * 9  # split total data 90% train and 10% test
    print("total speaker number : %d" % total_speaker_num)
    print("train : %d, test : %d" % (train_speaker_num, total_speaker_num - train_speaker_num))

    i = 0  # running speaker index; also embedded in the saved file name
    for audio_path in audio_paths:
        for folder in os.listdir(audio_path):
            speaker_path = os.path.join(audio_path, folder)  # path of each speaker
            print("%dth speaker processing..." % i)
            utterances_spec = []
            for utter_name in os.listdir(speaker_path):
                utter_path = os.path.join(speaker_path, utter_name)  # path of each utterance
                try:
                    utter, sr = librosa.core.load(utter_path, config.sr)               # load the utterance audio
                    utter_trim, index = librosa.effects.trim(utter, top_db=14)         # trim the beginning and end blank
                    if utter_trim.shape[0]/sr <= config.hop*(config.tdsv_frame+2):     # if trimmed file is too short, then pass
                        print(os.path.basename(utter_path), "voice trim fail")
                        continue

                    S = librosa.core.stft(y=utter_trim, n_fft=config.nfft,
                                          win_length=int(config.window * sr), hop_length=int(config.hop * sr))  # perform STFT
                    S = keyword_spot(S)          # keyword spot (for now, just slice last 80 frames which contains "Call Stella")
                    utterances_spec.append(S)    # make spectrograms list
                except Exception as e:
                    # BUG FIX: was a bare `except: continue`, which silently
                    # swallowed *every* error (even KeyboardInterrupt). Skip
                    # only genuinely unreadable files, and say why.
                    print(os.path.basename(utter_path), "skipped:", e)
                    continue

            utterances_spec = np.array(utterances_spec)  # list to numpy array
            print(utterances_spec.shape)
            if i < train_speaker_num:  # save spectrogram as numpy file
                np.save(os.path.join(config.train_path, "speaker%d.npy" % i), utterances_spec)
            else:
                np.save(os.path.join(config.test_path, "speaker%d.npy" % (i - train_speaker_num)), utterances_spec)
            i += 1
Example #2
0
def save_spectrogram_tdsv():
    """Build the text-dependent spectrogram dataset.

    Takes each speaker's first ("...001") utterance, trims silence,
    runs an STFT, keeps the keyword frames, then shuffles the stack
    and writes a 90%/10% split to ``train.npy`` / ``test.npy``.
    Need : utterance data set (VTCK)
    """
    print("start text dependent utterance selection")
    # Create the output folders for the train / test numpy files.
    os.makedirs(config.train_path, exist_ok=True)
    os.makedirs(config.test_path, exist_ok=True)

    specs = []  # one keyword spectrogram per usable speaker
    for speaker_folder in os.listdir(audio_path):
        speaker_dir = os.path.join(audio_path, speaker_folder)
        first_utt = sorted(os.listdir(speaker_dir))[0]
        utter_path = os.path.join(speaker_dir, first_utt)
        stem = os.path.splitext(os.path.basename(utter_path))[0]
        if stem[-3:] != '001':
            # Speaker is missing the text-dependent "...001" recording.
            print(os.path.basename(utter_path)[:4], "001 file doesn't exist")
            continue

        # Load the audio, then strip leading/trailing silence.
        utter, sr = librosa.core.load(utter_path, config.sr)
        utter_trim, index = librosa.effects.trim(utter, top_db=14)
        min_seconds = config.hop * (config.tdsv_frame + 2)
        if utter_trim.shape[0] / sr <= min_seconds:
            # Not enough audio left to cover the keyword frames.
            print(os.path.basename(utter_path), "voice trim fail")
            continue

        # STFT, then keep only the frames that contain the keyword
        # ("Call Stella").
        S = librosa.core.stft(y=utter_trim,
                              n_fft=config.nfft,
                              win_length=int(config.window * sr),
                              hop_length=int(config.hop * sr))
        specs.append(keyword_spot(S))

    specs = np.array(specs)
    np.random.shuffle(specs)  # shuffle across speakers before splitting
    total_num = specs.shape[0]
    train_num = (total_num // 10) * 9  # 90% train, 10% test
    print("selection is end")
    print("total utterances number : %d" % total_num, ", shape : ",
          specs.shape)
    print("train : %d, test : %d" % (train_num, total_num - train_num))
    np.save(os.path.join(config.train_path, "train.npy"), specs[:train_num])
    np.save(os.path.join(config.test_path, "test.npy"), specs[train_num:])
Example #3
0
def save_spectrogram_tdsv(path, data_type):
    """Compute keyword spectrograms for every speaker folder under *path*.

    Each speaker's alphabetically-first utterance is loaded, silence-trimmed,
    transformed with an STFT, reduced to the keyword frames, and the shuffled
    stack is saved as ``<path>/<data_type>.npy``.
    """
    print('Preprocess ' + data_type)
    specs = []
    for entry in os.listdir(path):
        speaker_dir = os.path.join(path, entry)
        if not os.path.isdir(speaker_dir):
            continue  # ignore stray files sitting next to the speaker folders
        first_audio = sorted(os.listdir(speaker_dir))[0]
        utter_path = os.path.join(speaker_dir, first_audio)
        stem = os.path.splitext(os.path.basename(utter_path))[0]
        if config.train and stem[-3:] != '001':
            # In training mode only the text-dependent "...001" file counts.
            print(os.path.basename(utter_path)[:4], "001 file doesn't exist")
            continue

        # Load the utterance and trim leading/trailing silence.
        utter, sr = librosa.core.load(utter_path, config.sr)
        utter_trim, index = librosa.effects.trim(utter, top_db=14)
        if utter_trim.shape[0] / sr <= config.hop * (config.tdsv_frame + 2):
            # Too little audio remains after trimming — skip this speaker.
            print(os.path.basename(utter_path), "voice trim fail")
            continue

        S = librosa.core.stft(y=utter_trim,
                              n_fft=config.nfft,
                              win_length=int(config.window * sr),
                              hop_length=int(config.hop * sr))
        specs.append(keyword_spot(S))  # keep only the keyword frames

    specs = np.array(specs)
    np.random.shuffle(specs)  # shuffle spectrograms across speakers
    total_num = specs.shape[0]
    print("Speaker number : %d" % total_num, ", shape : ",
          specs.shape)

    np.save(os.path.join(path, data_type + ".npy"), specs)