Example #1
def _process_utterance(out_dir, in_dir, source_wav_name, target_wav_name,
                       emotion_id):
    '''Preprocesses a single source/target utterance pair.

    This writes the target linear-scale spectrogram and the source/target mel-scale
    spectrograms to disk and returns a tuple to write to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      in_dir: The directory containing the input wav files
      source_wav_name: Filename of the source wav (relative to in_dir)
      target_wav_name: Filename of the target wav (relative to in_dir)
      emotion_id: Numeric emotion id for this utterance pair

    Returns:
      An (emotion_id, t_spectrogram_filename, smel_filename, tmel_filename,
      s_n_frames, t_n_frames) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    source_wav = audio.load_wav(os.path.join(in_dir, source_wav_name))
    target_wav = audio.load_wav(os.path.join(in_dir, target_wav_name))

    if hparams.rescaling:
        source_wav = source_wav / np.abs(
            source_wav).max() * hparams.rescaling_max
        target_wav = target_wav / np.abs(
            target_wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    #s_spectrogram = audio.spectrogram(source_wav).astype(np.float32)
    t_spectrogram = audio.spectrogram(target_wav).astype(np.float32)

    # Compute a mel-scale spectrogram from the wav:
    smel_spectrogram = audio.melspectrogram(source_wav).astype(np.float32)
    tmel_spectrogram = audio.melspectrogram(target_wav).astype(np.float32)
    s_n_frames = smel_spectrogram.shape[1]
    t_n_frames = tmel_spectrogram.shape[1]

    # Write the spectrograms to disk:
    #s_spectrogram_filename = 'source-spec-{}.npy'.format(source_wav_name)
    t_spectrogram_filename = 'target-spec-{}.npy'.format(
        target_wav_name.replace('.wav', ''))
    smel_filename = 'source-mel-{}.npy'.format(
        source_wav_name.replace('.wav', ''))
    tmel_filename = 'target-mel-{}.npy'.format(
        target_wav_name.replace('.wav', ''))
    #np.save(os.path.join(out_dir, s_spectrogram_filename), s_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, t_spectrogram_filename),
            t_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, smel_filename),
            smel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, tmel_filename),
            tmel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (emotion_id, t_spectrogram_filename, smel_filename, tmel_filename,
            s_n_frames, t_n_frames)
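
A hypothetical driver sketch showing how the tuples returned above might be collected into a train.txt metadata file; the pairs listing, directory layout, and the way _process_utterance is imported are assumptions, not part of the original project.

import os

def build_metadata(in_dir, out_dir, pairs):
    """pairs: iterable of (source_wav_name, target_wav_name, emotion_id)."""
    os.makedirs(out_dir, exist_ok=True)
    rows = [_process_utterance(out_dir, in_dir, src, tgt, emo)
            for src, tgt, emo in pairs]
    # One pipe-separated line per utterance, mirroring the usual train.txt format.
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        for row in rows:
            f.write('|'.join(str(x) for x in row) + '\n')
    return rows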
Example #2
def load_one_person_m_utterances(speaker_dir: Path,
                                 utterances_num,
                                 extension="wav"):
    """
    Return utterances_num audio clips for one speaker; each clip has been sliced.
    Return format:
    [
        [bsz1,n_frames,n_mels],
        [bsz2,n_frames,n_mels],
        ...
    ]
    """
    wav_path_cl = [
        wav_path for wav_path in speaker_dir.glob("*.{}".format(extension))
    ]
    wav_path_cl_samples = random.sample(
        wav_path_cl, utterances_num)  # randomly sample utterances_num audio clips

    utterances_mels_cl = []
    for fp in wav_path_cl_samples:
        wav, sr = audio.load_wav(str(fp))
        bsz_n_frames_mel = slice_utterance_mel(
            wav)  # shape = [bsz, n_frames, n_mels]; bsz is not a fixed value
        utterances_mels_cl.append(bsz_n_frames_mel)

    return utterances_mels_cl
Example #3
def load_multi_person_one_utterances(read_dir: Path,
                                     n_speaker,
                                     extension="wav"):
    """
    Get n speakers and randomly pick one utterance from each.
    :param read_dir: directory containing the speaker folders
    :param n_speaker: number of speakers
    :return:
    [
        [bsz1,n_frames,n_mels],
        [bsz2,n_frames,n_mels],
        ...
    ]
    """
    speaker_dirs = [speak_dir for speak_dir in read_dir.glob("*")]
    n_speaker_dirs = random.sample(speaker_dirs, n_speaker)

    n_speaker_utterance_mel_cl = []
    for speaker_dir in n_speaker_dirs:
        utterancs_wavs_path = [
            str(wav_path) for wav_path in Path(str(speaker_dir)).glob(
                "*.{}".format(extension))
        ]
        wav_path = random.sample(utterancs_wavs_path, 1)[0]  # randomly sample one audio clip
        wav, sr = audio.load_wav(wav_path)
        bsz_n_frames_mel = slice_utterance_mel(
            wav)  # shape = [bsz, n_frames, n_mels]; bsz is not a fixed value
        n_speaker_utterance_mel_cl.append(bsz_n_frames_mel)

    return n_speaker_utterance_mel_cl
Example #4
def gen_samples(out_dir, wav_path, n_samples):
    wav = audio.load_wav(wav_path)
    hop_size = hparams.hop_length
    seg_len = hparams.seg_len
    spec_len = hparams.spec_len
    # not sure why we have to subtract 1 here
    wav_len = wav.shape[0] // hop_size * hop_size - 1
    wav = wav[:wav_len]
    spec = audio.spectrogram(wav)
    mel = audio.melspectrogram(wav)
    max_val = spec.shape[1] - 1 - spec_len
    if max_val < 0:
        return []
    idx = np.random.randint(0, max_val, size=(n_samples))
    d = []
    i = 0
    for offset in idx:
        i += 1
        w = wav[offset * hop_size:offset * hop_size + seg_len]
        s = spec[:, offset:offset + spec_len]
        m = mel[:, offset:offset + spec_len]
        wav_name = wav_path.split('/')[-1].split('.')[0]
        file_path = "{0}/{1}_{2:03d}.npz".format(out_dir, wav_name, i)
        np.savez(file_path, wav=w, spec=s, mel=m)
        d.append(file_path)
    return d
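
The crop indices above line up in time only if the wav segment length matches the spectrogram crop. A small self-contained check; hop_size, spec_len, and seg_len below are illustrative assumptions, not the project's hparams.

hop_size = 256                  # assumed hop length in samples
spec_len = 128                  # assumed spectrogram frames per training sample
seg_len = spec_len * hop_size   # wav samples per training sample under that assumption

offset = 10                     # a frame offset as drawn by np.random.randint above
wav_start = offset * hop_size
wav_end = wav_start + seg_len
# The wav slice covers exactly spec_len frames, matching spec[:, offset:offset + spec_len].
assert (wav_end - wav_start) // hop_size == spec_len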
Example #5
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'vctk-spec-%05d.npy' % index
    mel_filename = 'vctk-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return spectrogram_filename, mel_filename, n_frames, text, speaker_id
Example #6
def gen_data(audio_path, full_frames):
    wav = audio.load_wav(audio_path, 16000)
    mel = audio.melspectrogram(wav)
    print(mel.shape)

    if np.isnan(mel.reshape(-1)).sum() > 0:
        raise ValueError(
            'Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again'
        )

    mel_chunks = []
    mel_idx_multiplier = 80. / fps
    i = 0
    while 1:
        start_idx = int(i * mel_idx_multiplier)
        if start_idx + mel_step_size > len(mel[0]):
            mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:])
            break
        mel_chunks.append(mel[:, start_idx:start_idx + mel_step_size])
        i += 1

    print("Length of mel chunks: {}".format(len(mel_chunks)))

    full_frames = full_frames[:len(mel_chunks)]

    gen = datagen(full_frames.copy(), mel_chunks)
    return gen
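
A worked example of the chunking arithmetic above. fps and mel_step_size are module-level globals in the original; the values here are assumptions, and 80./fps reflects the 80 mel frames per second produced by this audio front end.

fps = 25                  # assumed video frame rate
mel_step_size = 16        # assumed number of mel frames per chunk
total_mel_frames = 200    # pretend the utterance produced 200 mel frames

mel_idx_multiplier = 80. / fps   # 80 mel frames per second of audio -> 3.2 per video frame
chunks, i = [], 0
while True:
    start_idx = int(i * mel_idx_multiplier)
    if start_idx + mel_step_size > total_mel_frames:
        chunks.append((total_mel_frames - mel_step_size, total_mel_frames))  # final chunk, right-aligned
        break
    chunks.append((start_idx, start_idx + mel_step_size))
    i += 1

print(len(chunks), chunks[:3])   # 59 chunks: (0, 16), (3, 19), (6, 22), ...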
Example #7
def _process_utterance(out_dir, index, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    # Write the spectrograms to disk:
    audio_filename = 'ljspeech-audio-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    # np.save(os.path.join(out_dir, audio_filename),
    #         out.astype(out_dtype), allow_pickle=False)
    # np.save(os.path.join(out_dir, mel_filename),
    #         mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text)
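
For reference, a minimal numpy sketch of standard mu-law companding and quantization, the kind of transform P.mulaw / P.mulaw_quantize apply above (assuming P is a mu-law preprocessing module; the exact mu convention and rounding in the real library may differ).

import numpy as np

def mulaw(x, quantize_channels=256):
    # Compress [-1, 1] audio with logarithmic resolution (ITU-T G.711 style).
    mu = quantize_channels - 1
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)

def mulaw_quantize(x, quantize_channels=256):
    # Map the companded signal onto integer bins in [0, quantize_channels).
    mu = quantize_channels - 1
    y = mulaw(x, quantize_channels)
    return ((y + 1) / 2 * mu + 0.5).astype(np.int64)

print(mulaw_quantize(np.array([-1.0, 0.0, 1.0])))   # [  0 128 255]: silence sits in the middle bin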
Example #8
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'nikl-single-spec-%05d.npy' % index
    mel_filename = 'nikl-single-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
Example #9
def extract_MFCC_and_text(wav_file_path, mfcc_dir):

    wav_filenames = glob.glob(wav_file_path)

    for wav_fname in wav_filenames:
        text_filename = wav_fname.replace(".WAV.wav", ".TXT")
        fullname = wav_fname.split('/')[-1]
        fname = fullname.split('.')[0]

        # Process the text: remove the first two numbers from the text file
        with open(text_filename, 'r') as file:
            sentence = file.read()
        sentence = sentence.split()[2:] + ['\n']
        sentence = ' '.join(sentence).lower()
        # Write the processed text to the mfcc directory
        text_fname = mfcc_dir + '/' + fname + '.txt'
        with open(text_fname, "w") as file:
            file.write(sentence)

        # Generate the MFCC features
        wav = audio.load_wav(wav_fname)
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
        mspec_fname = mfcc_dir + '/' + fname
        np.save(mspec_fname, mel_spectrogram,
                allow_pickle=False)  #generates features of shape: L x 80

    return
Example #10
def _process_utterance(out_dir, wav_path):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    return mel_spectrogram.astype(np.float32)
Example #11
def main():
    # This part processes a dataset in LJSpeech format
    a = open(meta_path, 'r').readlines()
    b = []
    i = 0
    while i < len(a):
        t = a[i][0:6]
        b.append(t)
        i += 2
    print(b[:2])
    a = b
    # a = [i.strip().split('|')[0] for i in a]
    cnt = 0
    cnt_list = []
    bad_cnt = 0
    bad_list = []
    for fname in tqdm(a):
        try:
            # Extract the acoustic features
            wav_f = os.path.join(wav_dir, fname + '.wav')
            wav_arr = load_wav(wav_f)
            mfcc_feats = wav2unnormalized_mfcc(wav_arr)
            mel_feats = wav2normalized_db_mel(wav_arr)
            spec_feats = wav2normalized_db_spec(wav_arr)
            
            # Verify that the acoustic features were extracted correctly
            save_name = fname + '.npy'
            save_mel_rec_name = fname + '_mel_rec.wav'
            save_spec_rec_name = fname + '_spec_rec.wav'
            # This load can occasionally fail; the reason is unclear, possibly transient server issues
            ppg_already_feats = np.load(os.path.join(ppg_dir, save_name))

            assert ppg_already_feats.shape[0] == mfcc_feats.shape[0]
            assert mfcc_feats.shape[0] == mel_feats.shape[0] and mel_feats.shape[0] == spec_feats.shape[0]
            write_wav(os.path.join(rec_wav_dir, save_mel_rec_name), normalized_db_mel2wav(mel_feats))
            write_wav(os.path.join(rec_wav_dir, save_spec_rec_name), normalized_db_spec2wav(spec_feats))
            
            # Save the acoustic features
            mfcc_save_name = os.path.join(mfcc_dir, save_name)
            mel_save_name = os.path.join(mel_dir, save_name)
            spec_save_name = os.path.join(spec_dir, save_name)
            np.save(mfcc_save_name, mfcc_feats)
            np.save(mel_save_name, mel_feats)
            np.save(spec_save_name, spec_feats)

            f_good_meta.write(fname + '\n')
            cnt_list.append(fname)
            cnt += 1
        except Exception:
            bad_list.append(fname)
            bad_cnt += 1
        
        # print(cnt)
        # break

    print(cnt)
    print('bad:', bad_cnt)
    print(bad_list)

    return
Example #12
def main():
    # This part processes a dataset in LibriSpeech format.
    for first_dir in server_common_data_list:
        for second_dir in os.listdir(os.path.join(wav_dir, first_dir)):
            second_wav_dir = os.path.join(os.path.join(wav_dir,first_dir),second_dir)
            wav_files = [os.path.join(second_wav_dir, f) for f in os.listdir(second_wav_dir) if f.endswith('.wav')]
            cnt = 0
            for wav_f in wav_files:
                try:
                    wav_arr = load_wav(wav_f, sr=hparams['sample_rate'])
                    mfcc_feats = wav2mfcc_v2(wav_arr, sr=hparams['sample_rate'],
                                            n_mfcc=hparams['n_mfcc'], n_fft=hparams['n_fft'],
                                            hop_len=hparams['hop_length'], win_len=hparams['win_length'],
                                            window=hparams['window'], num_mels=hparams['num_mels'],
                                            center=hparams['center'])
                    save_name = wav_f.split('/')[-1].split('.')[0] + '.npy'
                    save_name = os.path.join(mfcc_dir, save_name)
                    np.save(save_name, mfcc_feats)
                    cnt += 1
                    print(cnt)
                except Exception:
                    print(wav_f)
                # break
            # break
        # break
        # After extraction, the contents of the 3 folders must be manually mv'd into a single folder,
        # matching the 2338 folders used for the ppg features
    return
Example #13
def process(info_dict):
    wav_path = os.path.join(hp.data_path, "Wave")
    wav_file_name = os.path.join(wav_path, info_dict["sentence_id"]+".wav")
    wav = audio.load_wav(wav_file_name)
    mel = audio.melspectrogram(wav).T
    mel_file_path = os.path.join(hp.mel_path, info_dict["sentence_id"]+".npy")
    np.save(mel_file_path, mel)

    phone_idx = info_dict["sentence_id"] + "|"
    for phone_duration in info_dict["alignment"]:
        phone_idx += str(phone_map[phone_duration[0]]) + " "

    duration_idx = info_dict["sentence_id"] + "|"
    length_mel = mel.shape[0]
    length_phone_list = len(info_dict["alignment"])
    cur_pointer = 0
    for frame_id in range(length_mel):
        added = False
        cur_time = hp.frame_length_ms / 2 + frame_id * hp.frame_shift_ms
        cur_time = cur_time / 1000.0
        for i in range(cur_pointer, length_phone_list):
            if cur_time >= info_dict["alignment"][i][1][0] and cur_time < info_dict["alignment"][i][1][1]:
                phone_id = phone_map[info_dict["alignment"][i][0]]
                duration_idx += str(phone_id) + " "
                cur_pointer = i
                added = True
                break
        if not added:
            phone_id = phone_map[info_dict["alignment"][cur_pointer][0]]
            duration_idx += str(phone_id) + " "

    return phone_idx[:-1], duration_idx[:-1]
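
The frame-to-phone assignment above keys on the frame-center time. A small illustrative calculation; frame_length_ms and frame_shift_ms are assumed values, the real ones come from hp.

frame_length_ms = 50.0    # assumed analysis window length
frame_shift_ms = 12.5     # assumed hop between frames

def frame_center_seconds(frame_id):
    # Same formula as cur_time in process(): half the window length plus frame_id hops.
    return (frame_length_ms / 2 + frame_id * frame_shift_ms) / 1000.0

# Frame 0 is centered at 25 ms, frame 1 at 37.5 ms, frame 80 at 1.025 s; each frame
# gets the phone whose [start, end) alignment interval contains this center time.
print([frame_center_seconds(i) for i in (0, 1, 80)])   # [0.025, 0.0375, 1.025]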
Example #14
 def __getitem__(self, index):
     # Read audio
     filename = self.audio_files[index]
     wav = deepaudio.load_wav(filename)
     # load in raw_audio via utils
     raw_audio, _ = utils.load_wav_to_torch(filename)
     # convert wav to numpy
     audio = torch.from_numpy(wav)
     # take segment
     if audio.size(0) >= self.segment_length:
         max_audio_start = audio.size(0) - self.segment_length
         audio_start = random.randint(0, max_audio_start)
         audio = audio[audio_start:audio_start + self.segment_length]
         # update raw audio as well
         raw_audio = raw_audio[audio_start:audio_start +
                               self.segment_length]
     else:
         audio = torch.nn.functional.pad(
             audio, (0, self.segment_length - audio.size(0)),
             'constant').data
         # pad raw audio as well
         raw_audio = torch.nn.functional.pad(
             raw_audio, (0, self.segment_length - raw_audio.size(0)),
             'constant').data
     # compute mel
     mel = deepaudio.melspectrogram(audio.numpy())
     # convert mel to torch
     mel = torch.from_numpy(mel)
     audio = utils.mu_law_encode(raw_audio / utils.MAX_WAV_VALUE,
                                 self.mu_quantization)
     return (mel, audio)
Example #15
def test():
    wavs_path = os.path.join("data", "LJSpeech-1.1")
    wavs_path = os.path.join(wavs_path, "wavs")
    wav_path = os.path.join(wavs_path, "LJ001-0001.wav")
    wav = audio.load_wav(wav_path)
    mel_spec = audio.melspectrogram(wav)
    wav_after_inv = audio.inv_mel_spectrogram(mel_spec)
    audio.save_wav(wav_after_inv, "test.wav")
Example #16
    def __getitem__(self, idx):
        while 1:
            idx = random.randint(0, len(self.all_videos) - 1)
            vidname = self.all_videos[idx]
            img_names = list(glob(join(vidname, '*.jpg')))
            if len(img_names) <= 3 * syncnet_T:
                continue
            
            img_name = random.choice(img_names)
            wrong_img_name = random.choice(img_names)
            while wrong_img_name == img_name:
                wrong_img_name = random.choice(img_names)

            window_fnames = self.get_window(img_name)
            wrong_window_fnames = self.get_window(wrong_img_name)
            if window_fnames is None or wrong_window_fnames is None:
                continue

            window = self.read_window(window_fnames)
            if window is None:
                continue

            wrong_window = self.read_window(wrong_window_fnames)
            if wrong_window is None:
                continue

            try:
                wavpath = join(vidname, "audio.wav")
                if wavpath not in self.shared_dict:
                    wav = audio.load_wav(wavpath, hparams.sample_rate)
                    orig_mel = audio.melspectrogram(wav).T
                    self.shared_dict[wavpath] = orig_mel
                else:
                    orig_mel = self.shared_dict[wavpath]
            except Exception as e:
                continue

            mel = self.crop_audio_window(orig_mel.copy(), img_name)
            
            if (mel.shape[0] != syncnet_mel_step_size):
                continue

            indiv_mels = self.get_segmented_mels(orig_mel.copy(), img_name)
            if indiv_mels is None: continue

            window = self.prepare_window(window)
            y = window.copy()
            window[:, :, window.shape[2]//2:] = 0.

            wrong_window = self.prepare_window(wrong_window)
            x = np.concatenate([window, wrong_window], axis=0)

            x = torch.FloatTensor(x)
            mel = torch.FloatTensor(mel.T).unsqueeze(0)
            indiv_mels = torch.FloatTensor(indiv_mels).unsqueeze(1)
            y = torch.FloatTensor(y)
            # print(x.shape)
            return x, indiv_mels, mel, y
Example #17
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, f0_filename, sp_filename,
      ap_filename, world_frames, text) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:

    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    #world parameters
    f0, sp, ap = audio.world(wav, hparams.sample_rate)
    f0 = (f0 / hparams.f0_norm).astype(np.float32)  #normalize
    sp = audio._normalize(sp).astype(np.float32)
    ap = ap.astype(np.float32)  # ap only takes values in [0, 1], so no normalization is needed
    world_frames = f0.shape[0]

    # Write the spectrograms to disk:
    spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index

    f0_filename = 'ljspeech-f0-%05d.npy' % index
    sp_filename = 'ljspeech-sp-%05d.npy' % index
    ap_filename = 'ljspeech-ap-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, f0_filename), f0, allow_pickle=False)
    np.save(os.path.join(out_dir, sp_filename), sp, allow_pickle=False)
    np.save(os.path.join(out_dir, ap_filename), ap, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, f0_filename,
            sp_filename, ap_filename, world_frames, text)
Example #18
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    try:
        # Compute the linear-scale spectrogram from the wav:
        spectrogram = audio.spectrogram(wav).astype(np.float32)
        n_frames = spectrogram.shape[1]

        # Compute a mel-scale spectrogram from the wav:
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    except Exception as e:
        print("Problem with :", wav_path)
        print(e)
        raise

    # Write the spectrograms to disk:
    spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # mdda added START :
    wav_filename = mel_filename.replace('-mel-', '-audio-')
    #wav_samples = hparams.fft_size + (n_frames-1)*hparams.hop_size  # No : 3 extra frames added : Don't bother chomping
    np.save(os.path.join(out_dir, wav_filename),
            wav.astype(np.float32),
            allow_pickle=False)
    spectrogramraw_filename = 'ljspeech-specraw-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogramraw_filename),
            spectrogram_raw(wav).T,
            allow_pickle=False)
    # mdda added END

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
Example #19
 def __getitem__(self, index):
     text = self.metadata.iloc[index]['text']
     wav_filename = self.metadata.iloc[index]['wav']
     audio, _ = load_wav(f'{self.path}/wavs/{wav_filename}.wav')
     if self.text_transforms:
         text = self.text_transforms(text)
     if self.audio_transforms:
         audio = self.audio_transforms(audio)
     return text, audio
Example #20
def _process_utterance(out_dir, out_path, wav_path, text, stft):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel-scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      out_path: Path used to derive the mel filename and output subdirectories
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file
      stft: STFT helper used to compute the mel spectrogram

    Returns:
      A (mel_filename, n_frames, text) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    wav = wav / np.abs(wav).max() * 0.999
    #stft = audio.taco_stft()

    # Trim the silence at the end of the audio file.
    wav = librosa.effects.trim(wav,
                               top_db=23,
                               frame_length=1024,
                               hop_length=256)[0]

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav,
                                           stft).numpy().astype(np.float32)

    # Write the spectrograms to disk:
    # spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    parts = out_path.strip().split('/')
    mel_filename = parts[4] + parts[5] + parts[6]
    o_path = os.path.join(parts[0], parts[1], parts[4])

    #    print(o_path)
    #    mel_filename = 'nam_speech-mel-%05d.npy' % index
    #  print(out_path)

    if (not os.path.exists(o_path)):
        os.mkdir(o_path)
    o_path = os.path.join(o_path, parts[5])
    if (not os.path.exists(o_path)):
        os.mkdir(o_path)
    o_path = os.path.join(o_path, parts[6])

    np.save(o_path, mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    # return (spectrogram_filename, mel_filename, n_frames, text)
    return (mel_filename, n_frames, text)
Example #21
 def cache_spectrograms(self):
     wav_filenames = self.metadata['wav']
     spectrograms_path = f'{self.path}/spectrograms'
     if not os.path.exists(spectrograms_path):
         os.makedirs(spectrograms_path)
         print('Building Cache..')
         for name in tqdm(wav_filenames, total=len(wav_filenames)):
             audio, _ = load_wav(f'{self.path}/wavs/{name}.wav')
             S = self.audio_transforms(audio)
             np.save(f'{spectrograms_path}/{name}.npy', S)
Example #22
def _process_utterance_single(out_dir, text, wav_path, hparams=hparams):
    # modified version of LJSpeech _process_utterance
    audio.set_hparams(hparams)

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    sr = hparams.sample_rate
    # Added from the multispeaker version
    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")
    if not exists(lab_path):
        lab_path = os.path.splitext(wav_path)[0] + '.lab'

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        wav = clean_by_phoneme(labels, wav, sr)
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        if hparams.process_only_htk_aligned:
            return None
        wav, _ = librosa.effects.trim(wav, top_db=15)
    # End added from the multispeaker version

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    if hparams.max_audio_length != 0 and librosa.core.get_duration(
            y=wav, sr=sr) > hparams.max_audio_length:
        return None
    if hparams.min_audio_length != 0 and librosa.core.get_duration(
            y=wav, sr=sr) < hparams.min_audio_length:
        return None

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    # Get filename from wav_path
    wav_name = os.path.basename(wav_path)
    wav_name = os.path.splitext(wav_name)[0]
    spectrogram_filename = 'spec-{}.npy'.format(wav_name)
    mel_filename = 'mel-{}.npy'.format(wav_name)
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
Example #23
def _process_utterance(out_dir, text, wav_path, speaker_id=None):

    # check whether singlespeaker_mode
    if speaker_id is None:
        return _process_utterance_single(out_dir, text, wav_path)
    # modified version of VCTK _process_utterance
    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")
    if not exists(lab_path):
        lab_path = os.path.splitext(wav_path)[0] + '.lab'

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        if hparams.process_only_htk_aligned:
            return None
        wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    # Get filename from wav_path
    wav_name = os.path.basename(wav_path)
    wav_name = os.path.splitext(wav_name)[0]

    # Prefix with speaker_id in case wav files from different speakers share
    # the same naming format, e.g. Recording0.wav
    spectrogram_filename = 'spec-{}-{}.npy'.format(speaker_id, wav_name)
    mel_filename = 'mel-{}-{}.npy'.format(speaker_id, wav_name)
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text, speaker_id)
Example #24
def _process_utterance(out_dir, index, wav_path, text, phone):
    '''Preprocesses a single utterance audio/text pair.

    This writes the linear-scale spectrogram plus either the mel spectrogram or the
    WORLD features to disk and returns a tuple to write to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file
      phone: The phoneme sequence corresponding to the text

    Returns:
      A (spectrogram_filename, encoded_filename, n_frames, text, phone) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    if hparams.vocoder == "world":
        spectrogram = audio.spectrogram(wav).astype(np.float32)

        f0, sp, ap = pw.wav2world(wav.astype(np.double), hparams.sample_rate)
        ap_coded = pw.code_aperiodicity(ap, hparams.sample_rate)
        sp_coded = pw.code_spectral_envelope(sp, hparams.sample_rate, hparams.coded_env_dim)

        world_spec = np.hstack([f0[:, np.newaxis], sp_coded, ap_coded])
        n_frames = world_spec.shape[0]
        spectrogram_filename = 'synpaflex-spec-%05d.npy' % index
        encoded_filename = 'synpaflex-world-%05d.npy' % index
        np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
        np.save(os.path.join(out_dir, encoded_filename), world_spec, allow_pickle=False)

    else:
        # Compute the linear-scale spectrogram from the wav:
        spectrogram = audio.spectrogram(wav).astype(np.float32)
        n_frames = spectrogram.shape[1]

        # Compute a mel-scale spectrogram from the wav:
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

        # Write the spectrograms to disk:
        spectrogram_filename = 'synpaflex-spec-%05d.npy' % index
        encoded_filename = 'synpaflex-mel-%05d.npy' % index
        np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
        np.save(os.path.join(out_dir, encoded_filename), mel_spectrogram.T, allow_pickle=False)


    # Return a tuple describing this training example:
    return (spectrogram_filename, encoded_filename, n_frames, text, phone)
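
A shape walk-through for the WORLD branch above. All sizes are illustrative assumptions; pyworld derives the actual spectral-envelope and aperiodicity dimensions from the FFT size and sample rate.

import numpy as np

n_frames = 400            # analysis frames produced by pw.wav2world
coded_env_dim = 60        # assumed hparams.coded_env_dim
coded_ap_dim = 1          # assumed size of the coded aperiodicity

f0 = np.zeros(n_frames)                        # one F0 value per frame
sp_coded = np.zeros((n_frames, coded_env_dim))
ap_coded = np.zeros((n_frames, coded_ap_dim))

world_spec = np.hstack([f0[:, np.newaxis], sp_coded, ap_coded])
assert world_spec.shape == (n_frames, 1 + coded_env_dim + coded_ap_dim)   # (400, 62)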
Example #25
def _extract_mel(wav_path):
    # Load the audio to a numpy array. Resampled if needed.
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    assert len(out) // N == audio.get_hop_size()

    timesteps = len(out)

    return out, mel_spectrogram, timesteps, out_dtype
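
A self-contained numeric sketch of the padding/trimming invariant asserted above. The hop size and the left/right pad amounts are illustrative assumptions; the real values come from audio.get_hop_size() and audio.lws_pad_lr.

import numpy as np

hop_size = 256                       # assumed hop size
n_mel_frames = 50                    # N in the code above
out = np.zeros(12700)                # quantized signal before padding (illustrative length)
l, r = 512, 512                      # assumed pads, as returned by lws_pad_lr

out = np.pad(out, (l, r), mode="constant")
assert len(out) >= n_mel_frames * hop_size

out = out[:n_mel_frames * hop_size]  # trim so the audio length is an exact multiple
assert len(out) % hop_size == 0
assert len(out) // n_mel_frames == hop_size   # exactly hop_size samples per mel frame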
Example #26
    def __getitem__(self, index):
        text = self.metadata.iloc[index]['text']
        filename = self.metadata.iloc[index]['wav']
        if self.text_transforms:
            text = self.text_transforms(text)
        if self.cache:
            audio = np.load(f'{self.path}/spectrograms/{filename}.npy')
            return text, audio

        audio, _ = load_wav(f'{self.path}/wavs/{filename}.wav')
        if self.audio_transforms:
            audio = self.audio_transforms(audio)
        return text, audio
Example #27
def main():
    # This part processes a dataset in LibriSpeech format.
    for first_dir in os.listdir(wav_dir):
        for second_dir in os.listdir(os.path.join(wav_dir, first_dir)):
            for third_dir in os.listdir(
                    os.path.join(os.path.join(wav_dir, first_dir),
                                 second_dir)):
                third_mfcc_dir = os.path.join(
                    os.path.join(os.path.join(mfcc_dir, first_dir),
                                 second_dir), third_dir)
                third_mel_dir = os.path.join(
                    os.path.join(os.path.join(mel_dir, first_dir), second_dir),
                    third_dir)
                third_spec_dir = os.path.join(
                    os.path.join(os.path.join(spec_dir, first_dir),
                                 second_dir), third_dir)
                third_wav_dir = os.path.join(
                    os.path.join(os.path.join(wav_dir, first_dir), second_dir),
                    third_dir)
                #print('Now in the '+mfcc_dir+' from '+ third_wav_dir)
                if not os.path.exists(third_mfcc_dir):
                    os.makedirs(third_mfcc_dir)

                wav_files = [
                    os.path.join(third_wav_dir, f)
                    for f in os.listdir(third_wav_dir) if f.endswith('.wav')
                ]
                print('Extracting MFCC from {} to {}...'.format(
                    third_wav_dir, third_mfcc_dir))
                cnt = 0
                for wav_f in wav_files:
                    wav_arr = load_wav(wav_f, sr=hparams['sample_rate'])
                    mfcc_feats = wav2unnormalized_mfcc(wav_arr)
                    mel_feats = wav2normalized_db_mel(wav_arr)
                    spec_feats = wav2normalized_db_spec(wav_arr)

                    save_name = wav_f.split('/')[-1].split('.')[0] + '.npy'
                    mfcc_save_name = os.path.join(third_mfcc_dir, save_name)
                    mel_save_name = os.path.join(third_mel_dir, save_name)
                    spec_save_name = os.path.join(third_spec_dir, save_name)
                    np.save(mfcc_save_name, mfcc_feats)
                    np.save(mel_save_name, mel_feats)
                    np.save(spec_save_name, spec_feats)
                    cnt += 1
                    print(cnt)
        #             break
        #         break
        #     break
        # break
        # After extraction, the contents of the 3 folders must be manually mv'd into a single folder,
        # matching the 2338 folders used for the ppg features
    return
Example #28
def process_video_file(vfile, args, split):
    video_stream = cv2.VideoCapture(vfile)
    frames = []
    while 1:
        still_reading, frame = video_stream.read()
        if not still_reading:
            video_stream.release()
            break
        frames.append(frame)
    mid_frames = []
    ss = 0.
    es = (ss + (window_size / 1000.))

    while int(es * fps) <= len(frames):
        mid_second = (ss + es) / 2.
        mid_frames.append(frames[int(mid_second * fps)])

        ss += (video_step_size_in_ms / 1000.)
        es = (ss + (window_size / 1000.))

    dst_subdir = path.join(
        vfile.split('/')[-2],
        vfile.split('/')[-1].split('.')[0])
    fulldir = path.join(args.final_data_root, split, dst_subdir)
    os.makedirs(fulldir, exist_ok=True)
    wavpath = path.join(fulldir, 'audio.wav')

    command = template.format(vfile, sr, wavpath)
    subprocess.call(command, shell=True)

    specpath = path.join(fulldir, 'mels.npz')

    if path.isfile(wavpath):
        wav = audio.load_wav(wavpath, sr)

        spec = audio.melspectrogram(wav)
        np.savez_compressed(specpath, spec=spec)
    else:
        return

    for i, f in enumerate(mid_frames):
        face, valid_frame = face_detect(f)

        if not valid_frame:
            continue

        resized_face = cv2.resize(face, (args.img_size, args.img_size))

        cv2.imwrite(path.join(fulldir, '{}.jpg'.format(i)), resized_face)
Example #29
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate
    filename = os.path.basename(wav_path).replace('.wav', '')

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=25)
    # Librosa trim seems to cut off the ending part of speech
    else:
        wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Save trimmed wav
    save_wav_path = re.sub('wav48', 'wav_trim_22050', wav_path)
    dir = os.path.dirname(save_wav_path)
    if not os.path.exists(dir):
        os.system('mkdir {} -p'.format(dir))
    audio.save_wav(wav, save_wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = '{}-spec.npy'.format(filename)
    mel_filename = '{}-mel.npy'.format(filename)
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text, speaker_id)
Example #30
def testing_performance(orig, synth, rate):
    
    # Naming wavs
    synth_ts = synth[:-4] + '_' + str(rate) + '.wav'
    librosa_synth_ts_name = synth_ts[:-4] + '_librosa.wav'

    # Load wavs to be compared
    orig = audio.load_wav(orig)
    synth = audio.load_wav(synth)
    synth_ts = audio.load_wav(synth_ts)
    
    # Compare results to Librosa timestretch
    librosa_synth_ts = librosa.effects.time_stretch(synth, rate)
    audio.save_wav(librosa_synth_ts, librosa_synth_ts_name)
    
    # Calculate the MSE between them
    min_len = min(len(orig), len(synth))
    model_err = ((orig[:min_len] - synth[:min_len]) ** 2).mean()

    min_len = min(len(librosa_synth_ts), len(synth_ts))
    ts_err = ((librosa_synth_ts[:min_len] - synth_ts[:min_len]) ** 2).mean()
    
    print('model reconstruction error {:.4f}'.format(model_err))
    print('time stretching error {:.4f}'.format(ts_err))
Example #31
def get_crp(filestr, window_length = None, threshold = None, use_matlab = False):
    """
    Extract crp.

    """
    if not use_matlab:
        return audio.load_chroma(filestr, crp = True, window_length = window_length,
                                 threshold = threshold).T
    data, info = audio.load_wav(filestr)
    commands = ["crp", data, info['fs']]
    if window_length is not None:
        commands.append(window_length)
        if threshold is not None:
            commands.append(threshold)
    return ipc.get_response(commands) 
Example #32
            page.update(x, state[x])

        self.aserver.write_buf(self.root_page.render(force=True))

        # Update LED state.
        # State is stored in column-major order so we have to transpose into
        # row-major order to suit /grid/led/map
        m = ([1 << self.selected_dur, 1 << self.selected_var] + 
                            [sum(((col >> nrow) & 1) << ncol 
                                for ncol, col in enumerate(state))
                            for nrow in range(ROWS - 2)])

        self.led_map(0, 0, m)

aserver = audio.AudioServer(BPM)
aserver.start()

instruments = [audio.load_wav('samples/%d.wav' % p) for p in range(1, 8)]

app = Drilldown(aserver, instruments, monome.find_any_monome())
app.start()

app.led_all(0)
try:
    while True:
        time.sleep(5)
except KeyboardInterrupt:
    app.led_all(0)
    app.close()