Example 1
def speaker_f0(wav, sr=24000, lo=100, hi=600):
    # RAPT with otype=2 returns log-F0 per hop; -1e10 marks unvoiced frames.
    f0_rapt = sptk.rapt(wav.astype(np.float32) * 32768, sr, 256, min=lo, max=hi, otype=2)
    index_nonzero = (f0_rapt != -1e10)  # voiced frames only
    mean_f0, std_f0 = np.mean(f0_rapt[index_nonzero]), np.std(f0_rapt[index_nonzero])
    f0_norm = speaker_normalization(f0_rapt, index_nonzero, mean_f0, std_f0)

    return f0_norm
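Note: every snippet on this page calls pysptk's `sptk.rapt`, which expects a float32 waveform scaled to the int16 range (hence the `* 32768`) and, with otype=2, returns one log-F0 value per hop with -1e10 marking unvoiced frames. The recurring helper `speaker_normalization` is never shown; a minimal sketch consistent with how Examples 1, 3, 6, 8, and 9 call it (z-score the voiced log-F0 frames, squash them into [0, 1], and leave unvoiced frames at the sentinel) could look like this:

import numpy as np
from pysptk import sptk

def speaker_normalization(f0, index_nonzero, mean_f0, std_f0):
    # Hypothetical helper: z-score the voiced log-F0 frames, then map them
    # to [0, 1]; frames where index_nonzero is False keep the -1e10 sentinel.
    f0 = f0.astype(float).copy()
    f0[index_nonzero] = (f0[index_nonzero] - mean_f0) / std_f0 / 4.0
    f0[index_nonzero] = np.clip(f0[index_nonzero], -1.0, 1.0)
    f0[index_nonzero] = (f0[index_nonzero] + 1.0) / 2.0  # [-1, 1] -> [0, 1]
    return f0

Dividing the z-score by 4 before clipping keeps roughly four standard deviations of pitch variation inside the [0, 1] range.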
Example 2
    def get_f0(self,
               audio,
               f0_mean=None,
               f0_var=None,
               sampling_rate=22050,
               frame_length=1024,
               hop_length=256,
               f0_min=80,
               f0_max=880,
               harm_thresh=0.25,
               mel_fmin=70.0):
        # Alternative YIN-based extraction, kept for reference:
        # f0, harmonic_rates, argmins, times = compute_yin(
        #     audio, sampling_rate, frame_length, hop_length, f0_min, f0_max,
        #     harm_thresh, mel_fmin)
        f0 = sptk.rapt(audio * 32768,
                       sampling_rate,
                       hop_length,
                       min=f0_min,
                       max=f0_max,
                       otype=2)

        # otype=2 yields log-F0 with -1e10 on unvoiced frames; clipping to
        # [0, f0_max] collapses that sentinel to 0.
        f0 = np.clip(f0, 0, f0_max)

        # Shift the voiced frames up, then shift everything down: voiced
        # values are unchanged while unvoiced frames land at -10.
        index_nonzero = np.nonzero(f0)
        f0[index_nonzero] += 10.0
        f0 -= 10.0

        if f0_mean is None:
            f0_mean = np.mean(f0[index_nonzero])
        if f0_var is None:
            f0_var = np.std(f0[index_nonzero])

        # Z-score the voiced frames with the utterance (or provided) statistics.
        f0[index_nonzero] = (f0[index_nonzero] - f0_mean) / f0_var

        return f0
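A quick sanity check for this method, assuming the enclosing class is instantiated as `processor` (a hypothetical name): feed it a pure tone and confirm that the voiced frames come out roughly zero-mean after the per-utterance z-scoring.

import numpy as np

sr = 22050
t = np.arange(sr) / sr
tone = 0.5 * np.sin(2 * np.pi * 220.0 * t).astype(np.float32)  # 220 Hz test tone

f0 = processor.get_f0(tone, sampling_rate=sr)
voiced = f0 > -10.0  # unvoiced frames sit exactly at -10 after the shift
print(f0[voiced].mean())  # ~0 once z-scored with the utterance statistics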
Example 3
def extract_f0_func_audiofile(audio_file, gender='M'):
    floor_sp, ceil_sp = -80, 30
    mel_basis = mel(sr=16000, n_fft=1024, fmin=90, fmax=7600, n_mels=80).T  # librosa mel filterbank
    min_level = np.exp(-100 / 20 * np.log(10))  # -100 dB amplitude floor
    b, a = butter_highpass(30, 16000, order=5)  # 30 Hz high-pass to remove DC and rumble

    # Gender-dependent F0 search range in Hz
    if gender == 'M':
        lo, hi = 50, 250
    elif gender == 'F':
        lo, hi = 100, 600
    else:
        raise ValueError(gender)
    prng = RandomState(0)
    x, fs = sf.read(audio_file)
    if len(x.shape) >= 2:
        x = x[:, 0]  # keep the first channel of multi-channel audio
    if x.shape[0] % 256 == 0:
        x = np.concatenate((x, np.array([1e-06])), axis=0)  # avoid an exact frame boundary
    y = signal.filtfilt(b, a, x)
    wav = y * 0.95 + (prng.rand(y.shape[0]) - 0.5) * 1e-06  # scale down and add dither

    # Mel spectrogram in dB, mapped to roughly [0, 1]
    D = pySTFT(wav).T
    D_mel = np.dot(D, mel_basis)
    D_db = 20 * np.log10(np.maximum(min_level, D_mel)) - 16
    S = (D_db + 100) / 100

    # Log-F0 via RAPT; -1e10 marks unvoiced frames
    f0_rapt = sptk.rapt(wav.astype(np.float32) * 32768, fs, 256, min=lo, max=hi, otype=2)
    index_nonzero = (f0_rapt != -1e10)
    tmp = f0_rapt[index_nonzero]
    mean_f0, std_f0 = np.mean(tmp), np.std(tmp)

    f0_norm = speaker_normalization(f0_rapt, index_nonzero, mean_f0, std_f0)

    return S, f0_norm
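This example leans on two helpers that are not shown. Minimal sketches consistent with how they are used here (the originals may differ): a SciPy Butterworth high-pass design and a strided Hann-window magnitude STFT whose defaults match the 1024-point FFT and 256-sample hop above.

import numpy as np
from scipy import signal
from scipy.signal import get_window

def butter_highpass(cutoff, fs, order=5):
    # High-pass Butterworth filter with the cutoff normalized to Nyquist.
    normal_cutoff = cutoff / (0.5 * fs)
    return signal.butter(order, normal_cutoff, btype='high', analog=False)

def pySTFT(x, fft_length=1024, hop_length=256):
    # Magnitude STFT: reflect-pad, slice into overlapping frames, window, rfft.
    x = np.pad(x, int(fft_length // 2), mode='reflect')
    noverlap = fft_length - hop_length
    shape = x.shape[:-1] + ((x.shape[-1] - noverlap) // hop_length, fft_length)
    strides = x.strides[:-1] + (hop_length * x.strides[-1], x.strides[-1])
    frames = np.lib.stride_tricks.as_strided(x, shape=shape, strides=strides)
    window = get_window('hann', fft_length, fftbins=True)
    return np.abs(np.fft.rfft(window * frames, n=fft_length).T)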
Example 4
def get_f0(audio, sampling_rate, frame_length, hop_length, f0_min, f0_max,
           harm_thresh):
    f0 = sptk.rapt(audio * 32768,
                   sampling_rate,
                   hop_length,
                   min=f0_min,
                   max=f0_max,
                   otype=2)
    # otype=2 returns log-F0 with -1e10 on unvoiced frames; the clip maps
    # that sentinel to 0 while leaving the voiced log values untouched.
    f0 = np.clip(f0, 0, f0_max)
    return f0
Example 5
def get_f0_noisy(wav, duration=None):
    f0 = sptk.rapt(wav.astype(np.float32) * hparams.max_wav_value,
                   hparams.sampling_rate,
                   hparams.encoder_hidden,
                   min=hparams.f0_min,
                   max=hparams.f0_max,
                   otype=2)  # log-F0; unvoiced frames are -1e10
    if duration is not None:
        f0 = f0[:sum(duration)]
    # Back to linear Hz; exp(-1e10) underflows to 0, so unvoiced frames become 0.
    f0 = np.exp(f0)
    return f0
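Taking np.exp of the otype=2 output is equivalent to asking RAPT for linear F0 directly (otype=1), because exp(-1e10) underflows to the 0 that otype=1 uses on unvoiced frames. A quick check, on a placeholder signal:

import numpy as np
from pysptk import sptk

x = (np.random.randn(16000) * 1000).astype(np.float32)  # placeholder waveform
log_f0 = sptk.rapt(x, 16000, 256, min=60, max=400, otype=2)
lin_f0 = sptk.rapt(x, 16000, 256, min=60, max=400, otype=1)
assert np.allclose(np.exp(log_f0), lin_f0)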
Example 6
def _processing_data(hparams, full_path, spk_label, spk_emb, gender, npz_name,
                     pbar, i):
    # Gender-dependent F0 search range in Hz
    if gender == 'M':
        lo, hi = 50, 250
    elif gender == 'F':
        lo, hi = 100, 600
    else:
        raise ValueError(gender)

    # int(random.random()) always evaluates to 0; draw a proper random seed.
    prng = RandomState(random.randint(0, 2**32 - 1))
    x, fs = librosa.load(full_path, sr=hparams.sample_rate)
    assert fs == hparams.sample_rate
    if x.shape[0] % hparams.hop_size == 0:
        x = np.concatenate((x, np.array([1e-06])), axis=0)
    # b, a, mel_basis, and min_level are assumed module-level globals (sketched below).
    y = signal.filtfilt(b, a, x)
    wav = y * 0.96 + (prng.rand(y.shape[0]) - 0.5) * 1e-06

    # compute spectrogram
    D = pySTFT(wav).T
    D_mel = np.dot(D, mel_basis)
    D_db = 20 * np.log10(np.maximum(min_level, D_mel)) - hparams.ref_level_db
    S = (D_db + 100) / 100

    # extract f0 (otype=2: log-F0, -1e10 on unvoiced frames)
    f0_rapt = sptk.rapt(wav.astype(np.float32) * 32768,
                        fs,
                        hparams.hop_size,
                        min=lo,
                        max=hi,
                        otype=2)
    index_nonzero = (f0_rapt != -1e10)
    mean_f0, std_f0 = np.mean(f0_rapt[index_nonzero]), np.std(
        f0_rapt[index_nonzero])
    f0_norm = speaker_normalization(f0_rapt, index_nonzero, mean_f0, std_f0)

    assert len(S) == len(f0_rapt)

    data = {
        'mel': S.astype(np.float32),
        'f0': f0_norm.astype(np.float32),
        'spk_label': spk_label
    }
    if spk_emb is not None:
        data['spk_emb'] = spk_emb

    np.savez(npz_name, **data)
    pbar.update(i)
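The coefficients `b, a`, the `mel_basis`, and `min_level` are consumed above but never defined, so they must exist at module scope. A sketch of that setup, assuming the same constants as Example 3 (the real hparams values may differ):

import numpy as np
from librosa.filters import mel

mel_basis = mel(sr=16000, n_fft=1024, fmin=90, fmax=7600, n_mels=80).T
min_level = np.exp(-100 / 20 * np.log(10))  # -100 dB amplitude floor
b, a = butter_highpass(30, 16000, order=5)  # helper sketched after Example 3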
Example 7
def Pattern_Generate(path,
                     n_fft: int,
                     num_mels: int,
                     sample_rate: int,
                     hop_size: int,
                     win_size: int,
                     fmin: int,
                     fmax: int,
                     center: bool = False,
                     top_db=60):
    audio, _ = librosa.load(path, sr=sample_rate)
    audio = librosa.effects.trim(audio,
                                 top_db=top_db,
                                 frame_length=512,
                                 hop_length=256)[0]
    audio = librosa.util.normalize(audio) * 0.95
    audio = audio[:audio.shape[0] - (audio.shape[0] % hop_size)]  # trim to a whole number of hops
    spect = spectrogram(y=torch.from_numpy(audio).float().unsqueeze(0),
                        n_fft=n_fft,
                        hop_size=hop_size,
                        win_size=win_size,
                        center=center).squeeze(0).T.numpy()
    mel = mel_spectrogram(y=torch.from_numpy(audio).float().unsqueeze(0),
                          n_fft=n_fft,
                          num_mels=num_mels,
                          sampling_rate=sample_rate,
                          hop_size=hop_size,
                          win_size=win_size,
                          fmin=fmin,
                          fmax=fmax,
                          center=center).squeeze(0).T.numpy()

    # rapt defaults to linear F0 output with 0 on unvoiced frames, so this
    # log produces -inf there (see the masked variant below).
    log_f0 = np.log(rapt(
        x=audio * 32768,
        fs=sample_rate,
        hopsize=hop_size,
    ))

    if log_f0.shape[0] != mel.shape[0]:
        print(path, audio.shape[0], log_f0.shape[0], mel.shape[0])

    return audio, spect, mel, log_f0
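Because rapt defaults to linear F0 output (0 Hz on unvoiced frames), the np.log above emits -inf values along with a NumPy warning. A sketch of a masked variant that keeps a finite sentinel instead:

import numpy as np
from pysptk import rapt

def safe_log_f0(audio, sample_rate, hop_size, unvoiced_value=-1e10):
    # Hypothetical variant: log only the voiced frames.
    f0 = rapt(x=audio * 32768, fs=sample_rate, hopsize=hop_size)  # audio: float32 in [-1, 1]
    log_f0 = np.full_like(f0, unvoiced_value)
    voiced = f0 > 0
    log_f0[voiced] = np.log(f0[voiced])
    return log_f0

Equivalently, passing otype=2 asks RAPT for log-F0 directly, with -1e10 already marking unvoiced frames, as the other examples on this page do.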
Example 8
def extract_f0(wav, fs):
    # lo, hi, device, and the helpers below are assumed to be module-level globals.
    f0_rapt = sptk.rapt(wav.astype(np.float32) * 32768,
                        fs,
                        256,
                        min=lo,
                        max=hi,
                        otype=2)
    index_nonzero = (f0_rapt != -1e10)  # voiced frames
    mean_f0, std_f0 = np.mean(f0_rapt[index_nonzero]), np.std(
        f0_rapt[index_nonzero])
    f0_norm = speaker_normalization(f0_rapt, index_nonzero, mean_f0, std_f0)

    # Quantize the normalized contour into one-hot bins and add a batch axis.
    f0_quantized = quantize_f0_numpy(f0_norm)[0]
    f0_onehot = f0_quantized[np.newaxis, :, :]

    # Pad along the time axis to a fixed length of 192 frames.
    if f0_onehot.shape[1] <= 192:
        f0_onehot, _ = pad_seq_to_2(f0_onehot, 192)

    return torch.from_numpy(f0_onehot).to(device)
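quantize_f0_numpy is not shown either; a sketch consistent with its use here, assuming the normalized contour lies in [0, 1] with a negative sentinel on unvoiced frames and one extra one-hot bin reserved for unvoiced:

import numpy as np

def quantize_f0_numpy(x, num_bins=256):
    # Hypothetical helper: one-hot quantization of a [0, 1] contour; bin 0 = unvoiced.
    x = x.astype(float).copy()
    uv = x <= 0  # unvoiced sentinel (e.g. -1e10) or exact zeros
    x[uv] = 0.0
    bins = np.round(x * (num_bins - 1)) + 1  # voiced frames -> bins 1..num_bins
    bins[uv] = 0.0
    onehot = np.zeros((len(x), num_bins + 1), dtype=np.float32)
    onehot[np.arange(len(x)), bins.astype(np.int64)] = 1.0
    return onehot, bins.astype(np.int64)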
Example 9
def extract_f0_func(gender):
    floor_sp, ceil_sp = -80, 30
    mel_basis = mel(sr=16000, n_fft=1024, fmin=90, fmax=7600, n_mels=80).T  # librosa mel filterbank
    min_level = np.exp(-100 / 20 * np.log(10))  # -100 dB amplitude floor
    b, a = butter_highpass(30, 16000, order=5)  # 30 Hz high-pass

    # Set the directory you want to start from
    ROOT = r'E:\Dataset\VCTK\test_audio'
    rootDir = os.path.join(ROOT, 'audio')
    targetDir_f0 = os.path.join(ROOT, 'f0')
    targetDir = os.path.join(ROOT, 'mel-sp')

    pt = glob.glob1(rootDir, '*')

    cep_all = []
    dirName, subdirList, _ = next(os.walk(rootDir))
    print('Found directory: %s' % dirName)
    for subdir in sorted(pt):
        print(subdir)
        if not os.path.exists(os.path.join(targetDir, subdir)):
            os.makedirs(os.path.join(targetDir, subdir))
        if not os.path.exists(os.path.join(targetDir_f0, subdir)):
            os.makedirs(os.path.join(targetDir_f0, subdir))
        _, _, fileList = next(os.walk(os.path.join(dirName, subdir)))
        # Gender-dependent F0 search range in Hz
        if gender == 'M':
            lo, hi = 50, 250
        elif gender == 'F':
            lo, hi = 100, 600
        else:
            raise ValueError(gender)
        prng = RandomState(0)
        for fileName in sorted(fileList):
            print(subdir, fileName)
            x, fs = sf.read(os.path.join(dirName, subdir, fileName))
            if len(x.shape) >= 2:
                x = x[:, 0]  # keep the first channel
            if x.shape[0] % 256 == 0:
                x = np.concatenate((x, np.array([1e-06])), axis=0)
            y = signal.filtfilt(b, a, x)
            wav = y * 0.95 + (prng.rand(y.shape[0]) - 0.5) * 1e-06
            D = pySTFT(wav).T
            D_mel = np.dot(D, mel_basis)
            D_db = 20 * np.log10(np.maximum(min_level, D_mel)) - 16
            S = (D_db + 100) / 100

            f0_rapt = sptk.rapt(wav.astype(np.float32) * 32768,
                                fs,
                                256,
                                min=lo,
                                max=hi,
                                otype=2)
            index_nonzero = (f0_rapt != -1e10)
            tmp = f0_rapt[index_nonzero]
            mean_f0, std_f0 = np.mean(tmp), np.std(tmp)

            f0_norm = speaker_normalization(f0_rapt, index_nonzero, mean_f0,
                                            std_f0)

            if len(S) != len(f0_norm):
                pdb.set_trace()  # frame counts should match; drop into the debugger if not

            np.save(os.path.join(targetDir, subdir, fileName[:-4]),
                    S.astype(np.float32),
                    allow_pickle=False)

            np.save(os.path.join(targetDir_f0, subdir, fileName[:-4]),
                    f0_norm.astype(np.float32),
                    allow_pickle=False)

            print(S.shape)
            print(f0_norm.shape)
Example 10
            # assert fs == 16000
            if x.shape[0] % 256 == 0:
                x = np.concatenate((x, np.array([1e-06])), axis=0)
            y = signal.filtfilt(b, a, x)
            wav = y * 0.96 + (prng.rand(y.shape[0]) - 0.5) * 1e-06

            # compute spectrogram
            D = pySTFT(wav).T
            D_mel = np.dot(D, mel_basis)
            D_db = 20 * np.log10(np.maximum(min_level, D_mel)) - 16
            S = (D_db + 100) / 100

            # extract f0
            f0_rapt = sptk.rapt(wav.astype(np.float32) * 32768,
                                fs,
                                256,
                                min=lo,
                                max=hi,
                                otype=2)
            index_nonzero = (f0_rapt != -1e10)
            mean_f0, std_f0 = np.mean(f0_rapt[index_nonzero]), np.std(
                f0_rapt[index_nonzero])
            f0_norm = speaker_normalization(f0_rapt, index_nonzero, mean_f0,
                                            std_f0)

            assert len(S) == len(f0_rapt)

            np.save(os.path.join(targetDir, basename),
                    S.astype(np.float32),
                    allow_pickle=False)
            np.save(os.path.join(targetDir_f0, basename),
                    f0_norm.astype(np.float32),
                    allow_pickle=False)
Example 11
[fs, sig] = util.WavRead(args.i)
sig = sig.astype('float32')
if not args.isolate:
    x_axis = np.arange(0, len(sig) / fs, 1 / fs)
    plot.subplot(211)
    plot.plot(x_axis, sig)
    plot.grid()
    plot.title(args.i.split('/')[-1] + ' Signal')
    plot.xlabel('Time (s)')
    plot.ylabel('Amplitude')
    plot.subplot(212)

fmin = float(args.fmin)
fmax = float(args.fmax)
pitch = sptk.rapt(sig, fs, 250, min=fmin, max=fmax)  # hop of 250 samples; CLI-provided range
x_axis = np.linspace(0, len(sig) / fs, len(pitch))

if args.smooth:
    pitch = smooth_pitch(pitch)
    plot.title(args.i.split('/')[-1] + ' Pitch Contour (Smoothed)')
else:
    plot.title(args.i.split('/')[-1] + ' Pitch Contour')

plot.plot(x_axis, pitch)
plot.ylim([fmin, fmax])
plot.grid()

if args.mean:
    length = 0
    add = 0
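The snippet cuts off inside the args.mean branch; the length/add accumulators suggest a running average over the voiced frames, along these lines (a hypothetical continuation):

    for p in pitch:
        if p > 0:  # rapt's default output is 0 Hz on unvoiced frames
            add += p
            length += 1
    if length:
        print('Mean pitch: %.2f Hz' % (add / length))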
Example 12
def synthesize_with_reference(idx_info, name, noisy_input, audio_path, tg_path,
                              speaker_id, inspection):
    global model, vocoder, step
    start_time = time.perf_counter()

    # Prepare Reference Data
    if speaker_id is not None:
        spker_embed_path = os.path.join(
            hp.preprocessed_path, "spker_embed",
            "{}-spker_embed-{}.npy".format(hp.dataset, speaker_id))
        speaker_embed = torch.from_numpy(np.load(spker_embed_path)).to(device)
    else:
        try:
            # VCTK fileformat
            speaker_id = name.split("_")[0]
            spker_embed_path = os.path.join(
                hp.preprocessed_path, "spker_embed",
                "{}-spker_embed-{}.npy".format(hp.dataset, speaker_id))
            speaker_embed = torch.from_numpy(
                np.load(spker_embed_path)).to(device)
        except Exception:
            # General cases
            speaker_id = None
            speaker_embed = torch.from_numpy(
                embedding.predict_embedding(speaker_embedder, audio_path))

    # Outdir
    outdir = os.path.join(hp.test_path(),
                          "{}_by_{}_{}".format(name, speaker_id, step))
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    text = utils.get_transcript(audio_path.replace(".wav", ".txt"))
    if not os.path.isfile(tg_path):
        tg_path = "NO TextGrid"
        _, wav = read(audio_path)
        if noisy_input:
            f0 = sptk.rapt(wav.astype(np.float32) * hp.max_wav_value,
                           hp.sampling_rate,
                           hp.encoder_hidden,
                           min=hp.f0_min,
                           max=hp.f0_max,
                           otype=2)  # log-F0; unvoiced frames are -1e10
            f0 = np.exp(f0)  # back to Hz; unvoiced frames underflow to 0
        else:
            f0, _ = pw.dio(wav.astype(np.float64),
                           hp.sampling_rate,
                           frame_period=hp.hop_length / hp.sampling_rate *
                           1000)
        mel, energy, _ = Audio.tools.get_mel_from_wav(
            torch.FloatTensor(np.array(wav)))
        mel = mel.T.numpy().astype(np.float32)
        energy = energy.numpy().astype(np.float32)
        utils.plot_data([(mel.T, f0, energy)], ['Reference Spectrogram'],
                        filename=os.path.join(
                            outdir,
                            '{}_{}_{}.png'.format("Reference", name,
                                                  text[:100])))
    else:
        f0, energy, mel = get_processed_data_from_wav(audio_path, tg_path,
                                                      noisy_input)
        utils.plot_data([(mel.T, f0, energy)], ['Reference Spectrogram'],
                        filename=os.path.join(
                            outdir,
                            '{}_{}_{}.png'.format("Reference", name,
                                                  text[:100])))

    # Prepare Audio Inputs
    energy = (energy - hp.energy_min) / (hp.energy_max - hp.energy_min)
    f0_norm = utils.speaker_normalization(f0)
    mel, mel_len, energy, f0, f0_norm = preprocess_audio(
        mel, energy, f0, f0_norm)

    print("\n\n---------------- [{}/{}]: {} ----------------".format(
        idx_info[0] + 1, idx_info[1],
        audio_path.split('/')[-1]))
    print('Audio Path:', audio_path)
    print('TextGrid Path:', tg_path)
    print('Speaker ID:', speaker_id)

    # Synthesize
    success = 0
    for sentence in sentences:
        text = preprocess_text(sentence)
        synthesize(outdir, model, vocoder, text, sentence, speaker_embed,
                   speaker_id, inspection, mel, mel_len, f0, f0_norm, energy,
                   args.duration_control, args.pitch_control,
                   args.energy_control)
        success += 1
    print("Synthesized {} out of {} in {:.3f}s".format(
        success, len(sentences),
        time.perf_counter() - start_time))
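The TextGrid-less branch above switches between RAPT for noisy input and pyworld's DIO otherwise. A side-by-side sketch of the two extractors on the same file (path, hop, and range values are placeholders; DIO estimates are typically refined with StoneMask):

import numpy as np
import pyworld as pw
from pysptk import sptk
from scipy.io.wavfile import read

sr, wav = read('sample.wav')  # placeholder path; int16 PCM assumed
hop = 256

# RAPT: samples are already int16-scaled, so no extra * 32768; unvoiced frames -> 0 Hz
f0_rapt = np.exp(sptk.rapt(wav.astype(np.float32), sr, hop,
                           min=71, max=800, otype=2))

# DIO + StoneMask at the same frame period, expressed in milliseconds
f0_dio, t = pw.dio(wav.astype(np.float64), sr, frame_period=hop / sr * 1000)
f0_dio = pw.stonemask(wav.astype(np.float64), f0_dio, t, sr)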