Example #1
def spec_to_waveform(spectrogram, x, order, fs, frame_period):
    # x: source/excitation waveform to be filtered with the MLSA filter
    alpha = pysptk.util.mcepalpha(fs)
    hop_length = int(fs * (frame_period * 0.001))
    mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    engine = Synthesizer(MLSADF(order=order, alpha=alpha), hopsize=hop_length)
    b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
    waveform = engine.synthesis(x, b)

    return waveform
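A minimal usage sketch for the function above, with the WORLD analysis step included; the file path, order, and frame period are placeholders, and `x` is the source waveform the MLSA filter is applied to:

import numpy as np
import pyworld
from scipy.io import wavfile

fs, x = wavfile.read("input.wav")  # placeholder path
x = x.astype(np.float64)
# WORLD analysis to obtain the spectral envelope
f0, timeaxis = pyworld.dio(x, fs, frame_period=5)
f0 = pyworld.stonemask(x, f0, timeaxis, fs)
spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
waveform = spec_to_waveform(spectrogram, x, order=24, fs=fs, frame_period=5)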
Example #2
def test_vc_from_path(model, path, data_mean, data_std, diffvc=True):
    model.eval()

    fs, x = wavfile.read(path)
    x = x.astype(np.float64)
    hop_length = int(fs * (hp.frame_period * 0.001))
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]

    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)

    # Apply model
    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)
    y_hat, y_hat_static = model(mc_scaled, R)
    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    mc_static_pred = P.inv_scale(mc_static_pred, data_mean[:static_dim],
                                 data_std[:static_dim])

    outputs = mc_static_pred.copy()

    if diffvc:
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(mc.astype(np.float64),
                                   alpha=alpha,
                                   fftlen=fftlen)
        waveform = pyworld.synthesize(f0, spectrogram, aperiodicity, fs,
                                      hp.frame_period)

    return waveform, inputs, outputs
Example #3
    def pysptk_imfcc(self):
        from pysptk.synthesis import MLSADF, Synthesizer

        # Convert mel-cepstrum to MLSADF coefficients
        b = pysptk.mc2b(self.mc, self.alpha)

        synthesizer = Synthesizer(MLSADF(order=self.order, alpha=self.alpha),
                                  self.hop_length)

        x_synthesized = synthesizer.synthesis(self.source_excitation, b)

        librosa.display.waveplot(x_synthesized, sr=self.sr)
Example #4
    def __test_synthesis(filt):
        # dummy source excitation
        source = __dummy_source()

        hopsize = 80

        # dummy filter coef.
        windowed = __dummy_windowed_frames(
            source, frame_len=512, hopsize=hopsize)
        b = np.apply_along_axis(
            pysptk.mcep, 1, windowed, filt.order, 0.0)

        # synthesis
        synthesizer = Synthesizer(filt, hopsize)
        y = synthesizer.synthesis(source, b)
        assert np.all(np.isfinite(y))
Example #6
    def synthesize(self, pitch, mc, unnormalize=False):
        if unnormalize and self.mean is not None:
            # undo mean/variance normalization
            for tt in range(len(pitch)):
                for ii in range(self.num_params + 1):
                    mc[tt][ii] = mc[tt][ii] * self.stdev[ii] + self.mean[ii]
        mc = np.asarray(mc, dtype=np.float64)
        pitch = np.asarray(pitch, dtype=np.float64)
        hopsize = int(self.frame_len * self.sample_rate / 1000)
        b = sptk.mc2b(mc, self.alpha)
        synthesizer = Synthesizer(
            MLSADF(order=self.num_params, alpha=self.alpha), hopsize)
        source_excitation = sptk.excite(pitch, hopsize)
        x_synthesized = synthesizer.synthesis(source_excitation, b)
        return x_synthesized
Example #7
    def __test_synthesis(filt):
        # dummy source excitation
        source = __dummy_source()

        hopsize = 80

        # dummy filter coef.
        windowed = __dummy_windowed_frames(source,
                                           frame_len=512,
                                           hopsize=hopsize)
        lpc = np.apply_along_axis(pysptk.lpc, 1, windowed, filt.order)
        # make sure lsp has loggain
        lsp = np.apply_along_axis(pysptk.lpc2lsp, 1, lpc)
        lsp[:, 0] = np.log(lsp[:, 0])

        # synthesis
        synthesizer = Synthesizer(filt, hopsize)
        y = synthesizer.synthesis(source, lsp)
        assert np.all(np.isfinite(y))
Example #9
    def __test_synthesis(filt):
        # dummy source excitation
        source = __dummy_source()

        hopsize = 80

        # dummy filter coef.
        windowed = __dummy_windowed_frames(source,
                                           frame_len=512,
                                           hopsize=hopsize)
        lpc = pysptk.lpc(windowed, filt.order)
        par = pysptk.lpc2par(lpc)

        # make sure par has loggain
        par[:, 0] = np.log(par[:, 0])

        # synthesis
        synthesizer = Synthesizer(filt, hopsize)
        y = synthesizer.synthesis(source, par)
        assert np.all(np.isfinite(y))
Example #10
    def __test_synthesis_levdur(filt):
        # dummy source excitation
        source = __dummy_source()

        hopsize = 80

        # dummy filter coef.
        windowed = __dummy_windowed_frames(source,
                                           frame_len=512,
                                           hopsize=hopsize)
        c = pysptk.mcep(windowed, filt.order)
        lpc = pysptk.levdur(pysptk.c2acr(c))

        # make sure lpc has loggain
        lpc[:, 0] = np.log(lpc[:, 0])

        # synthesis
        synthesizer = Synthesizer(filt, hopsize)
        y = synthesizer.synthesis(source, lpc)
        assert np.all(np.isfinite(y))
Example #11
def test_one_utt(src_path, tgt_path, disable_mlpg=False, diffvc=True):
    # GMM-based parameter generation is provided by the library's `baseline` module
    if disable_mlpg:
        # Force disable MLPG
        paramgen = MLPG(gmm, windows=[(0, 0, np.array([1.0]))], diff=diffvc)
    else:
        paramgen = MLPG(gmm, windows=windows, diff=diffvc)

    fs, x = wavfile.read(src_path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    if use_delta:
        mc = delta_features(mc, windows)
    mc = paramgen.transform(mc)
    if disable_mlpg and mc.shape[-1] != static_dim:
        mc = mc[:, :static_dim]
    assert mc.shape[-1] == static_dim
    mc = np.hstack((c0[:, None], mc))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        spectrogram = pysptk.mc2sp(mc.astype(np.float64),
                                   alpha=alpha,
                                   fftlen=fftlen)
        waveform = pyworld.synthesize(f0, spectrogram, aperiodicity, fs,
                                      frame_period)

    return waveform
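The `windows` used with `delta_features` and MLPG above follow the `(left, right, coefficients)` window convention visible in the static-only window at the top of this example; the snippet doesn't define the full set, but a common static/delta/delta-delta definition looks like this (an assumed definition, shown for reference):

windows = [
    (0, 0, np.array([1.0])),             # static
    (1, 1, np.array([-0.5, 0.0, 0.5])),  # delta
    (1, 1, np.array([1.0, -2.0, 1.0])),  # delta-delta
]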
Example #12
def apply_mlsa_filter(wav, mcep):
    if mcep.fs > wav.fs:
        mcep = kwiiyatta.resample(mcep, wav.fs)
    elif mcep.fs < wav.fs:
        spec = kwiiyatta.Synthesizer.resample_spectrum_envelope(
            mcep.extract_spectrum(),
            mcep.fs,
            wav.fs
        )
        cutoff = mcep.fs*spec.shape[1]//wav.fs
        spec[:, cutoff:] = np.tile(np.atleast_2d(spec[:, cutoff-1]).T,
                                   spec.shape[-1]-cutoff)
        mcep = kwiiyatta.MelCepstrum(wav.fs, mcep.frame_period)
        mcep.extract(spec)
    # remove power coefficients
    mc = np.hstack((np.zeros((mcep.data.shape[0], 1)), mcep.data[:, 1:]))
    alpha = mcep.alpha()
    engine = Synthesizer(MLSADF(order=mcep.order, alpha=alpha),
                         hopsize=int(mcep.fs * (mcep.frame_period * 0.001)))
    b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
    waveform = engine.synthesis(wav.data, b)
    return kwiiyatta.Wavdata(wav.fs, waveform)
Example #13
def test_one_utt(path_src, path_tgt, disable_mlpg=False, diffvc=True):
    if disable_mlpg:
        paramgen = MLPG(gmm, windows=[(0, 0, np.array([1.0]))], diff=diffvc)
    else:
        paramgen = MLPG(gmm, windows=windows, diff=diffvc)

    x, fs_ = sf.read(path_src)
    x = x.astype(np.float64)
    f0, time_axis = pyworld.dio(x, fs_, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, time_axis, fs_)
    spectrogram = pyworld.cheaptrick(x, f0, time_axis, fs_)
    aperiodicity = pyworld.d4c(x, f0, time_axis, fs_)

    mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    if use_delta:
        mc = delta_features(mc, windows)
    mc = paramgen.transform(mc)
    if disable_mlpg and mc.shape[-1] != static_dim:
        mc = mc[:, :static_dim]
    assert mc.shape[-1] == static_dim
    mc = np.hstack((c0[:, None], mc))
    if diffvc:
        mc[:, 0] = 0
        engine = Synthesizer(MLSADF(order=order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        spectrogram = pysptk.mc2sp(mc.astype(np.float64),
                                   alpha=alpha,
                                   fftlen=fftlen)
        waveform = pyworld.synthesize(f0, spectrogram, aperiodicity, fs_,
                                      frame_period)

    return waveform
Example #14
class MCEP(object):
    def __init__(self, need_synth):
        self.frame_length = config.data.fftl
        self.hop_length = config.data.hop_length
        self.sr = config.data.sr
        self.order = config.data.mcep_order
        self.alpha = config.data.mcep_alpha
        if need_synth:
            self.build_synth()
        else:
            self.synthesizer = None

    def build_synth(self):
        self.synthesizer = Synthesizer(
            MLSADF(order=self.order, alpha=self.alpha), self.hop_length)

    def get_MCEP(self, utterance):
        utterance = librosa.util.normalize(utterance)
        utterance = utterance + np.random.normal(
            loc=0, scale=0.0000001, size=utterance.shape[0])
        utterance = librosa.util.normalize(utterance)
        utterance = utterance.astype(np.float64)  # necessary for synthesizer
        frames = librosa.util.frame(utterance,
                                    frame_length=self.frame_length,
                                    hop_length=self.hop_length).astype(
                                        np.float64).T
        # Windowing
        frames *= pysptk.blackman(self.frame_length)
        assert frames.shape[1] == self.frame_length
        # Pitch
        pitch = pysptk.swipe(utterance.astype(np.float64),
                             fs=self.sr,
                             hopsize=self.hop_length,
                             min=60,
                             max=240,
                             otype="pitch")
        mcep = pysptk.mcep(frames, self.order, self.alpha)
        return mcep, pitch

    def synthesize_from_MCEP(self, mcep, pitch):
        mcep = mcep.copy(order='C')  # fixes "ndarray not C-contiguous" error
        b = pysptk.mc2b(mcep, self.alpha)
        excitation = pysptk.excite(pitch.astype(np.float64), self.hop_length)
        x = self.synthesizer.synthesis(excitation.astype(np.float64),
                                       b.astype(np.float64))
        return x
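A hedged round-trip sketch for the class above (analysis, then resynthesis); the audio path is a placeholder, and the `config` module supplying `data.fftl`, `data.hop_length`, `data.sr`, `data.mcep_order`, and `data.mcep_alpha` is assumed to be importable:

import librosa

codec = MCEP(need_synth=True)
utterance, _ = librosa.load("input.wav", sr=codec.sr)  # placeholder path
mcep, pitch = codec.get_MCEP(utterance)      # analysis: mel-cepstrum + pitch
y = codec.synthesize_from_MCEP(mcep, pitch)  # resynthesis via the MLSA filter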
Example #15
    def __test_synthesis(filt):
        # dummy source excitation
        source = __dummy_source()

        hopsize = 80

        # dummy filter coef.
        windowed = __dummy_windowed_frames(source,
                                           frame_len=512,
                                           hopsize=hopsize)
        mc = pysptk.mcep(windowed, filt.order, filt.alpha)
        b = pysptk.mc2b(mc, filt.alpha)

        # synthesis
        synthesizer = Synthesizer(filt, hopsize)
        y = synthesizer.synthesis(source, b)
        assert np.all(np.isfinite(y))

        # transpose
        synthesizer = Synthesizer(filt, hopsize, transpose=True)
        y = synthesizer.synthesis(source, b)
        assert np.all(np.isfinite(y))
Example #16
def pitch_shift_on_lpc_residual(
    wav,
    sr,
    shift_in_cent,
    frame_length=4096,
    hop_length=240,
    mgc_order=59,
):
    assert wav.dtype == np.int16
    frames = (librosa.util.frame(wav,
                                 frame_length=frame_length,
                                 hop_length=hop_length).astype(np.float64).T)
    frames *= pysptk.blackman(frame_length)
    alpha = pysptk.util.mcepalpha(sr)

    mgc = pysptk.mcep(frames, mgc_order, alpha, eps=1e-5, etype=1)
    c = pysptk.freqt(mgc, mgc_order, -alpha)

    lpc = pysptk.levdur(pysptk.c2acr(c, mgc_order, frame_length))
    # remove gain
    lpc[:, 0] = 0

    # Compute LPC residual
    synth = Synthesizer(AllZeroDF(mgc_order), hop_length)
    wav_lpc = synth.synthesis(wav.astype(np.float64), -lpc)
    residual = wav - wav_lpc

    # Pitch-shift on LPC residual
    residual_shifted = librosa.effects.pitch_shift(residual,
                                                   sr=sr,
                                                   n_steps=shift_in_cent,
                                                   bins_per_octave=1200)

    # Filtering by LPC
    synth = Synthesizer(AllPoleDF(mgc_order), hop_length)
    wav_shifted = synth.synthesis(residual_shifted, lpc)

    return wav_shifted.astype(np.int16)
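A short usage sketch for the pitch shifter above; the file name is a placeholder, and `shift_in_cent=100` corresponds to one semitone up since the function uses `bins_per_octave=1200`:

from scipy.io import wavfile

sr, wav = wavfile.read("input.wav")  # 16-bit PCM assumed (the function asserts int16)
shifted = pitch_shift_on_lpc_residual(wav, sr, shift_in_cent=100)
wavfile.write("shifted.wav", sr, shifted)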
Example #17
    def __test_synthesis(filt):
        # dummy source excitation
        source = __dummy_source()

        hopsize = 80

        # dummy filter coef.
        windowed = __dummy_windowed_frames(source,
                                           frame_len=512,
                                           hopsize=hopsize)
        lpc = pysptk.lpc(windowed, filt.order)
        lpc[:, 0] = 0
        b = -lpc

        # synthesis
        synthesizer = Synthesizer(filt, hopsize)
        y = synthesizer.synthesis(source, b)
        assert np.all(np.isfinite(y))

        # transpose
        synthesizer = Synthesizer(filt, hopsize, transpose=True)
        y = synthesizer.synthesis(source, b)
        assert np.all(np.isfinite(y))
Example #18
datalist = []
with open("conf/eval.list", "r") as f:
    for line in f:
        line = line.rstrip()
        datalist.append(line)

for i in range(0, len(datalist)):
    outfile = "result/wav/{}_diff.wav".format(datalist[i])
    with open("data/SF-TF/mgc/{}.mgc".format(datalist[i]), "rb") as f:
        conv_mgc = np.fromfile(f, dtype="<f8", sep="")
        conv_mgc = conv_mgc.reshape(len(conv_mgc) // dim, dim)

    with open("data/SF/mgc/{}.mgc".format(datalist[i]), "rb") as f:
        src_mgc = np.fromfile(f, dtype="<f8", sep="")
        src_mgc = src_mgc.reshape(len(src_mgc) // dim, dim)

    fs, data = wavfile.read("data/SF/wav/{}.wav".format(
        datalist[i]))  # load the source waveform itself
    data = data.astype(np.float64)

    diff_mgc = conv_mgc - src_mgc  # prepare the difference (residual) filter

    # apply the difference filter to the input waveform
    b = np.apply_along_axis(sptk.mc2b, 1, diff_mgc, alpha)
    synthesizer = Synthesizer(MLSADF(order=dim - 1, alpha=alpha), 80)
    owav = synthesizer.synthesis(data, b)

    owav = np.clip(owav, -32768, 32767)
    wavfile.write(outfile, fs, owav.astype(np.int16))
Example #19
    pitch = pysptk.swipe(x.astype(np.float64),
                         fs=sr,
                         hopsize=hop_length,
                         min=60,
                         max=240,
                         otype="pitch")
    source_excitation = pysptk.excite(pitch, hop_length)

    # Order of mel-cepstrum

    mc = pysptk.mcep(frames, order, alpha)
    logH = pysptk.mgc2sp(mc, alpha, 0.0, frame_length).real  # log envelope (for inspection; unused in synthesis)

    # Convert mel-cepstrum to MLSADF coefficients
    b = pysptk.mc2b(mc, alpha)

    synthesizer = Synthesizer(MLSADF(order=order, alpha=alpha), hop_length)

    x_synthesized = synthesizer.synthesis(source_excitation, b)

    filename = "synthesized_sounds/" + "x_syn" + str(order + 1) + ".wav"
    wavfile.write(filename, sr, x_synthesized)
    time_total = time.time() - start
    writestring = str(order) + "," + str(time_total) + "\n"
    f.write(writestring)
Example #20
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    model.eval()

    hop_length = int(fs * (hp.frame_period * 0.001))
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]

    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)

    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    lengths = [len(mc_scaled)]

    # Add batch axis
    mc_scaled = mc_scaled.view(1, -1, mc_scaled.size(-1))

    # For MLPG
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)

    # Apply model
    if model.include_parameter_generation():
        # Case: the model includes parameter generation in itself
        # Multistream features cannot be used in this case
        y_hat, y_hat_static = model(mc_scaled, R, lengths=lengths)
    else:
        # Case: generic models (can be a sequence model)
        assert hp.has_dynamic_features is not None
        y_hat = model(mc_scaled, lengths=lengths)
        y_hat_static = multi_stream_mlpg(
            y_hat, R, hp.stream_sizes, hp.has_dynamic_features)

    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    mc_static_pred = P.inv_scale(
        mc_static_pred, data_mean[:static_dim], data_std[:static_dim])

    outputs = mc_static_pred.copy()

    if diffvc:
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs
Example #21
pitch = pysptk.swipe(x,
                     fs=fs,
                     hopsize=HOP_LENGTH,
                     min=MIN_F0,
                     max=MAX_F0,
                     otype="pitch")

# Generate the excitation (glottal source) signal
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# Mel-cepstral analysis (= spectral envelope extraction)
mc = pysptk.mcep(frames, ORDER, ALPHA)

# Convert mel-cepstral coefficients to MLSA digital filter coefficients
mlsa_coef = pysptk.mc2b(mc, ALPHA)

# Create the MLSA filter
synthesizer = Synthesizer(MLSADF(order=ORDER, alpha=ALPHA), HOP_LENGTH)

# #### From here on, synthesize various sounds by varying the synthesis filter parameters

# ### Pitch shift (raise the pitch) ###
OUT_WAVE_FILE = "pitchshift_high.wav"
PITCH_SHIFT = 0.5  # a factor smaller than 1 raises the pitch
excitation_pitchhigh = pysptk.excite(pitch * PITCH_SHIFT, HOP_LENGTH)
y = synthesizer.synthesis(excitation_pitchhigh, mlsa_coef)  # synthesize
y = y.astype(np.int16)
wavfile.write(OUT_WAVE_FILE, fs, y)

# ### Pitch shift (lower the pitch) ###
OUT_WAVE_FILE = "pitchshift_low.wav"
PITCH_SHIFT = 1.5  # a factor greater than 1 lowers the pitch
excitation_pitchlow = pysptk.excite(pitch * PITCH_SHIFT, HOP_LENGTH)
Example #22
frames = librosa.util.frame(x,
                            frame_length=FRAME_LENGTH,
                            hop_length=HOP_LENGTH).astype(np.float64).T
frames *= pysptk.blackman(FRAME_LENGTH)  # windowing (Blackman window)

# Pitch extraction
pitch = pysptk.swipe(x,
                     fs=fs,
                     hopsize=HOP_LENGTH,
                     min=MIN_F0,
                     max=MAX_F0,
                     otype="pitch")

# Generate the excitation (glottal source) signal
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# Extract linear predictive coding (LPC) coefficients via linear prediction analysis
lpc = pysptk.lpc(frames, ORDER)
lpc[:, 0] = np.log(lpc[:, 0])

# Convert LPC coefficients to PARCOR coefficients
parcor = pysptk.lpc2par(lpc)

# Create the all-pole lattice filter
synthesizer = Synthesizer(AllPoleLatticeDF(order=ORDER), HOP_LENGTH)

# Drive the filter with the excitation signal to synthesize speech
y = synthesizer.synthesis(source_excitation, parcor)

# Write the audio
y = y.astype(np.int16)
wavfile.write(OUT_WAVE_FILE, fs, y)
Example #23
# Parameter generation
paramgen = MLPG(gmm, windows=windows, diff=True)

# Waveform generation for test set
for idx, path in enumerate(source.test_paths):
    fs, x = wavfile.read(path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    # aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]

    mc = delta_features(mc, windows)
    since = time.time()
    mc = paramgen.transform(mc)
    print("{}, Elapsed time in conversion: {}s".format(idx,
                                                       time.time() - since))
    assert mc.shape[-1] == static_dim
    mc = np.hstack((c0[:, None], mc))

    mc[:, 0] = 0
    engine = Synthesizer(MLSADF(order=order, alpha=alpha), hopsize=80)
    b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
    waveform = engine.synthesis(x, b)
    if not exists('resultsVC'):
        os.makedirs('resultsVC')
    wavfile.write(
        "resultsVC/{}_{}.wav".format(splitext(basename(path))[0], 'mlpg'), fs,
        waveform.astype(np.int16))
Example #24
frames = librosa.util.frame(x,
                            frame_length=FRAME_LENGTH,
                            hop_length=HOP_LENGTH).astype(np.float64).T
frames *= pysptk.blackman(FRAME_LENGTH)  # windowing (Blackman window)

# Pitch extraction
pitch = pysptk.swipe(x,
                     fs=fs,
                     hopsize=HOP_LENGTH,
                     min=MIN_F0,
                     max=MAX_F0,
                     otype="pitch")

# Generate the excitation (glottal source) signal
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# Extract linear predictive coding (LPC) coefficients via linear prediction analysis
lpc = pysptk.lpc(frames, ORDER)
lpc[:, 0] = np.log(lpc[:, 0])

# Convert LPC coefficients to line spectral pairs (LSP)
lsp = pysptk.lpc2lsp(lpc, otype=0, fs=fs)

# Create the LSP synthesis filter
synthesizer = Synthesizer(LSPDF(order=ORDER), HOP_LENGTH)

# Drive the filter with the excitation signal to synthesize speech
y = synthesizer.synthesis(source_excitation, lsp)

# Write the audio
y = y.astype(np.int16)
wavfile.write(OUT_WAVE_FILE, fs, y)
Example #27
# Frame the audio and apply windowing
frames = librosa.util.frame(x,
                            frame_length=FRAME_LENGTH,
                            hop_length=HOP_LENGTH).astype(np.float64).T
frames *= pysptk.blackman(FRAME_LENGTH)  # windowing (Blackman window)

# Pitch extraction
pitch = pysptk.swipe(x,
                     fs=fs,
                     hopsize=HOP_LENGTH,
                     min=MIN_F0,
                     max=MAX_F0,
                     otype="pitch")

# Generate the excitation (glottal source) signal
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# Extract linear prediction coefficients via linear prediction analysis
lpc = pysptk.lpc(frames, ORDER)
lpc[:, 0] = np.log(lpc[:, 0])

# Create the all-pole filter
synthesizer = Synthesizer(AllPoleDF(order=ORDER), HOP_LENGTH)

# Drive the filter with the excitation signal to synthesize speech
y = synthesizer.synthesis(source_excitation, lpc)

# Write the audio
y = y.astype(np.int16)
wavfile.write(OUT_WAVE_FILE, fs, y)
Example #28
# Analyze the speech with WORLD (F0, spectral envelope, aperiodicity)
_, sp, _ = pyworld.wav2world(x, fs)

# Extract mel-cepstral coefficients from WORLD's spectral envelope
mcep = pysptk.sp2mc(sp, order=ORDER, alpha=ALPHA)

# Pitch extraction
pitch = pysptk.swipe(x,
                     fs=fs,
                     hopsize=HOP_LENGTH,
                     min=MIN_F0,
                     max=MAX_F0,
                     otype="pitch")

# Generate the excitation (glottal source) signal
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# Convert mel-cepstral coefficients to MLSA digital filter coefficients
mlsa_coef = pysptk.mc2b(mcep, ALPHA)

# Create the MLSA filter
synthesizer = Synthesizer(MLSADF(order=ORDER, alpha=ALPHA), HOP_LENGTH)

# Drive the MLSA filter with the excitation signal to synthesize speech
y = synthesizer.synthesis(source_excitation, mlsa_coef)

# Write the audio
y = y.astype(np.int16)
wavfile.write(OUT_WAVE_FILE, fs, y)
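For comparison, a hedged sketch of resynthesis with WORLD's own vocoder from the same analysis; it assumes the F0 and aperiodicity returned by pyworld.wav2world are kept rather than discarded as above, and the output file name is a placeholder:

f0, sp, ap = pyworld.wav2world(x, fs)
y_world = pyworld.synthesize(f0, sp, ap, fs)  # WORLD vocoder resynthesis
wavfile.write("resynth_world.wav", fs, y_world.astype(np.int16))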