Beispiel #1
0
def spec_to_waveform(spectrogram, order, fs, frame_period):
    alpha = pysptk.util.mcepalpha(fs)
    hop_length = int(fs * (frame_period * 0.001))
    mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    engine = Synthesizer(MLSADF(order=order, alpha=alpha), hopsize=hop_length)
    b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
    waveform = engine.synthesis(x, b)

    return waveform
Beispiel #2
0
def test_vc_from_path(model, path, data_mean, data_std, diffvc=True):
    model.eval()

    fs, x = wavfile.read(path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]

    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)

    # Apply model
    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)
    y_hat, y_hat_static = model(mc_scaled, R)
    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    mc_static_pred = P.inv_scale(mc_static_pred, data_mean[:static_dim],
                                 data_std[:static_dim])

    outputs = mc_static_pred.copy()

    if diffvc:
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(mc.astype(np.float64),
                                   alpha=alpha,
                                   fftlen=fftlen)
        waveform = pyworld.synthesize(f0, spectrogram, aperiodicity, fs,
                                      hp.frame_period)

    return waveform, inputs, outputs
    def pysptk_imfcc(self):
        from pysptk.synthesis import MLSADF, Synthesizer

        # Convert mel-cesptrum to MLSADF coefficients
        b = pysptk.mc2b(self.mc, self.alpha)

        synthesizer = Synthesizer(MLSADF(order=self.order, alpha=self.alpha),
                                  self.hop_length)

        x_synthesized = synthesizer.synthesis(self.source_excitation, b)

        librosa.display.waveplot(x_synthesized, sr=self.sr)
        a = 0
Beispiel #4
0
 def synthesize(self, pitch, mc, unnormalize=False):
     if unnormalize and self.mean != None:
         for tt in range(len(pitch)):
             for ii in range(self.num_params + 1):
                 mc[tt][ii] = mc[tt][ii] * self.stdev[ii] + self.mean[
                     ii]  #(mc[tt][ii]-self.mean[ii]) / self.stdev[ii]
     mc = np.asarray(mc, dtype=np.float64)
     pitch = np.asarray(pitch, dtype=np.float64)
     #print mc.shape
     #print pitch.shape
     b = sptk.mc2b(mc, self.alpha)
     synthesizer = Synthesizer(
         MLSADF(order=self.num_params, alpha=self.alpha),
         self.frame_len * self.sample_rate / 1000)
     source_excitation = sptk.excite(
         pitch, self.frame_len * self.sample_rate / 1000)
     x_synthesized = synthesizer.synthesis(source_excitation, b)
     return x_synthesized
def synthesis_diff(x,
                   diffmcep,
                   rmcep=None,
                   alpha=MCEP_ALPHA,
                   fs=FS,
                   shiftms=SHIFTMS):
    """filtering with a differential mel-cesptrum
        Parameters
        ----------
        x : array, shape (`samples`)
            array of waveform sequence
        diffmcep : array, shape (`T`, `dim`)
            array of differential mel-cepstrum sequence
        rmcep : array, shape (`T`, `dim`)
            array of reference mel-cepstrum sequence
            Default set to None
        alpha : float, optional
            Parameter of all-path transfer function
            Default set to 0.42
        Return
        ----------
        wav: array, shape (`samples`)
            Synethesized waveform
        """

    x = x.astype(np.float64)
    dim = diffmcep.shape[1] - 1
    shiftl = int(fs / 1000 * shiftms)

    if rmcep is not None:
        # power modification
        diffmcep = mod_power(rmcep + diffmcep, rmcep, alpha=alpha) - rmcep

    b = np.apply_along_axis(ps.mc2b, 1, diffmcep, alpha)
    assert np.isfinite(b).all()

    mlsa_fil = ps.synthesis.Synthesizer(MLSADF(dim, alpha=alpha), shiftl)
    wav = mlsa_fil.synthesis(x, b)

    return wav
Beispiel #6
0
def test_one_utt(src_path, tgt_path, disable_mlpg=False, diffvc=True):
    # GMM-based parameter generation is provided by the library in `baseline` module
    if disable_mlpg:
        # Force disable MLPG
        paramgen = MLPG(gmm, windows=[(0, 0, np.array([1.0]))], diff=diffvc)
    else:
        paramgen = MLPG(gmm, windows=windows, diff=diffvc)

    fs, x = wavfile.read(src_path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    pdb.set_trace()

    mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    if use_delta:
        mc = delta_features(mc, windows)
    mc = paramgen.transform(mc)
    if disable_mlpg and mc.shape[-1] != static_dim:
        mc = mc[:, :static_dim]
    assert mc.shape[-1] == static_dim
    mc = np.hstack((c0[:, None], mc))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        spectrogram = pysptk.mc2sp(mc.astype(np.float64),
                                   alpha=alpha,
                                   fftlen=fftlen)
        waveform = pyworld.synthesize(f0, spectrogram, aperiodicity, fs,
                                      frame_period)

    return waveform
Beispiel #7
0
def apply_mlsa_filter(wav, mcep):
    if mcep.fs > wav.fs:
        mcep = kwiiyatta.resample(mcep, wav.fs)
    elif mcep.fs < wav.fs:
        spec = kwiiyatta.Synthesizer.resample_spectrum_envelope(
            mcep.extract_spectrum(),
            mcep.fs,
            wav.fs
        )
        cutoff = mcep.fs*spec.shape[1]//wav.fs
        spec[:, cutoff:] = np.tile(np.atleast_2d(spec[:, cutoff-1]).T,
                                   spec.shape[-1]-cutoff)
        mcep = kwiiyatta.MelCepstrum(wav.fs, mcep.frame_period)
        mcep.extract(spec)
    # remove power coefficients
    mc = np.hstack((np.zeros((mcep.data.shape[0], 1)), mcep.data[:, 1:]))
    alpha = mcep.alpha()
    engine = Synthesizer(MLSADF(order=mcep.order, alpha=alpha),
                         hopsize=int(mcep.fs * (mcep.frame_period * 0.001)))
    b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
    waveform = engine.synthesis(wav.data, b)
    return kwiiyatta.Wavdata(wav.fs, waveform)
Beispiel #8
0
def test_one_utt(path_src, path_tgt, disable_mlpg=False, diffvc=True):
    if disable_mlpg:
        paramgen = MLPG(gmm, windows=[(0, 0, np.array([1.0]))], diff=diffvc)
    else:
        paramgen = MLPG(gmm, windows=windows, diff=diffvc)

    x, fs_ = sf.read(path_src)
    x = x.astype(np.float64)
    f0, time_axis = pyworld.dio(x, fs_, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, time_axis, fs_)
    spectrogram = pyworld.cheaptrick(x, f0, time_axis, fs_)
    aperiodicity = pyworld.d4c(x, f0, time_axis, fs_)

    mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    if use_delta:
        mc = delta_features(mc, windows)
    mc = paramgen.transform(mc)
    if disable_mlpg and mc.shape[-1] != static_dim:
        mc = mc[:, :static_dim]
    assert mc.shape[-1] == static_dim
    mc = np.hstack((c0[:, None], mc))
    if diffvc:
        mc[:, 0] = 0
        engine = Synthesizer(MLSADF(order=order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        spectrogram = pysptk.mc2sp(mc.astype(np.float64),
                                   alpha=alpha,
                                   fftlen=fftlen)
        waveform = pyworld.synthesize(f0, spectrogram, aperiodicity, fs_,
                                      frame_period)

    return waveform
Beispiel #9
0
 def __test(order, alpha, pd):
     __test_synthesis(MLSADF(order, alpha, pd=pd))
datalist = []
with open("conf/eval.list", "r") as f:
    for line in f:
        line = line.rstrip()
        datalist.append(line)

for i in range(0, len(datalist)):
    outfile = "result/wav/{}_diff.wav".format(datalist[i])
    with open("data/SF-TF/mgc/{}.mgc".format(datalist[i]), "rb") as f:
        conv_mgc = np.fromfile(f, dtype="<f8", sep="")
        conv_mgc = conv_mgc.reshape(len(conv_mgc) // dim, dim)

    with open("data/SF/mgc/{}.mgc".format(datalist[i]), "rb") as f:
        src_mgc = np.fromfile(f, dtype="<f8", sep="")
        src_mgc = src_mgc.reshape(len(src_mgc) // dim, dim)

    fs, data = wavfile.read("data/SF/wav/{}.wav".format(
        datalist[i]))  # 入力音声そのものをもってくる
    data = data.astype(np.float)

    diff_mgc = conv_mgc - src_mgc  # 差分のフィルタを用意する
    diff_mgc = np.zeros(shape=conv_mgc.shape)

    # 差分のフィルタを入力音声波形に適用する
    b = np.apply_along_axis(sptk.mc2b, 1, diff_mgc, alpha)
    synthesizer = Synthesizer(MLSADF(order=dim - 1, alpha=alpha), 80)
    owav = synthesizer.synthesis(data, b)

    owav = np.clip(owav, -32768, 32767)
    wavfile.write(outfile, fs, owav.astype(np.int16))
Beispiel #11
0
    pitch = pysptk.swipe(x.astype(np.float64),
                         fs=sr,
                         hopsize=hop_length,
                         min=60,
                         max=240,
                         otype="pitch")
    source_excitation = pysptk.excite(pitch, hop_length)

    # Order of mel-cepstrum

    mc = pysptk.mcep(frames, order, alpha)
    logH = pysptk.mgc2sp(mc, alpha, 0.0, frame_length).real
    print(mc.shape)
    #plt.plot(mc)
    #plotname="x_syn_coefs_" + str(order) + ".png"
    #plt.savefig(plotname)

    # Convert mel-cesptrum to MLSADF coefficients
    b = pysptk.mc2b(mc, alpha)

    synthesizer = Synthesizer(MLSADF(order=order, alpha=alpha), hop_length)

    x_synthesized = synthesizer.synthesis(source_excitation, b)

    filenam = "synthesized_sounds/" + "x_syn" + str(order + 1) + ".wav"
    #wavfile.write("x.wav", sr, x)
    wavfile.write(filenam, sr, x_synthesized)
    time_total = time.time() - start
    writestring = str(order) + "," + str(time_total) + "\n"
    f.write(writestring)
Beispiel #12
0
 def __test_invalid_pade(pd):
     MLSADF(20, pd=pd)
Beispiel #13
0
 def __test(order, alpha):
     __test_synthesis(MLSADF(order, alpha))
Beispiel #14
0
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    model.eval()

    hop_length = int(fs * (hp.frame_period * 0.001))
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]

    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)

    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    lengths = [len(mc_scaled)]

    # Add batch axis
    mc_scaled = mc_scaled.view(1, -1, mc_scaled.size(-1))

    # For MLPG
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)

    # Apply model
    if model.include_parameter_generation():
        # Case: models include parameter generation in itself
        # Mulistream features cannot be used in this case
        y_hat, y_hat_static = model(mc_scaled, R, lengths=lengths)
    else:
        # Case: generic models (can be sequence model)
        assert hp.has_dynamic_features is not None
        y_hat = model(mc_scaled, lengths=lengths)
        y_hat_static = multi_stream_mlpg(y_hat, R, hp.stream_sizes,
                                         hp.has_dynamic_features)

    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    mc_static_pred = P.inv_scale(mc_static_pred, data_mean[:static_dim],
                                 data_std[:static_dim])

    outputs = mc_static_pred.copy()

    if diffvc:
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(mc.astype(np.float64),
                                   alpha=alpha,
                                   fftlen=fftlen)
        waveform = pyworld.synthesize(f0, spectrogram, aperiodicity, fs,
                                      hp.frame_period)

    return waveform, inputs, outputs
Beispiel #15
0
# Parameter generation
paramgen = MLPG(gmm, windows=windows, diff=True)

# Waveform generation for test set
for idx, path in enumerate(source.test_paths):
    fs, x = wavfile.read(path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    # aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]

    mc = delta_features(mc, windows)
    since = time.time()
    mc = paramgen.transform(mc)
    print("{}, Elapsed time in conversion: {}s".format(idx,
                                                       time.time() - since))
    assert mc.shape[-1] == static_dim
    mc = np.hstack((c0[:, None], mc))

    mc[:, 0] = 0
    engine = Synthesizer(MLSADF(order=order, alpha=alpha), hopsize=80)
    b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
    waveform = engine.synthesis(x, b)
    if not exists('resultsVC'):
        os.makedirs('resultsVC')
    wavfile.write(
        "resultsVC/{}_{}.wav".format(splitext(basename(path))[0], 'mlpg'), fs,
        waveform.astype(np.int16))
 def build_synth(self):
     self.synthesizer = Synthesizer(
         MLSADF(order=self.order, alpha=self.alpha), self.hop_length)
Beispiel #17
0
# 音声の分析 (基本周波数、スペクトル包絡、非周期性指標)
_, sp, _ = pyworld.wav2world(x, fs)

# メルケプストラム係数の抽出 from WORLDのスペクトル包絡
mcep = pysptk.sp2mc(sp, order=ORDER, alpha=ALPHA)

# ピッチ抽出
pitch = pysptk.swipe(x,
                     fs=fs,
                     hopsize=HOP_LENGTH,
                     min=MIN_F0,
                     max=MAX_F0,
                     otype="pitch")

# 励振源信号(声帯音源)の生成
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# メルケプストラム係数からMLSAディジタルフィルタ係数に変換
mlsa_coef = pysptk.mc2b(mcep, ALPHA)

# MLSAフィルタの作成
synthesizer = Synthesizer(MLSADF(order=ORDER, alpha=ALPHA), HOP_LENGTH)

# 励振源信号でMLSAフィルタを駆動して音声を合成
y = synthesizer.synthesis(source_excitation, mlsa_coef)

# 音声の書き込み
y = y.astype(np.int16)
wavfile.write(OUT_WAVE_FILE, fs, y)
                     hopsize=HOP_LENGTH,
                     min=MIN_F0,
                     max=MAX_F0,
                     otype="pitch")

# 励振源信号(声帯音源)の生成
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# メルケプストラム分析(=スペクトル包絡の抽出)
mc = pysptk.mcep(frames, ORDER, ALPHA)

# メルケプストラム係数からMLSAディジタルフィルタ係数に変換
mlsa_coef = pysptk.mc2b(mc, ALPHA)

# MLSAフィルタの作成
synthesizer = Synthesizer(MLSADF(order=ORDER, alpha=ALPHA), HOP_LENGTH)

# #### 以降、合成フィルタのパラメタなどを変えて色々な音声を合成

# ### ピッチシフト (音を高くする) ###
OUT_WAVE_FILE = "pitchshift_high.wav"
PITCH_SHIFT = 0.5  # 音を高くする場合は 1より小さい倍率
excitation_pitchhigh = pysptk.excite(pitch * PITCH_SHIFT, HOP_LENGTH)
y = synthesizer.synthesis(excitation_pitchhigh, mlsa_coef)  # 音声合成
y = y.astype(np.int16)
wavfile.write(OUT_WAVE_FILE, fs, y)

# ### ピッチシフト (音を低くする) ###
OUT_WAVE_FILE = "pitchshift_low.wav"
PITCH_SHIFT = 1.5  # 音を低くする場合は 1より大きい倍率
excitation_pitchlow = pysptk.excite(pitch * PITCH_SHIFT, HOP_LENGTH)