def synthesize_from_MCEP(self, mcep, pitch):
    mcep = mcep.copy(order='C')  # avoid the "ndarray is not C-contiguous" error
    b = pysptk.mc2b(mcep, self.alpha)
    excitation = pysptk.excite(pitch.astype(np.float64), self.hop_length)
    x = self.synthesizer.synthesis(excitation.astype(np.float64),
                                   b.astype(np.float64))
    return x
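This method relies on `self.alpha`, `self.hop_length`, and `self.synthesizer` being set up elsewhere in the class. A minimal setup sketch, assuming an MLSA synthesis filter as in the other examples in this listing (values illustrative):

import pysptk
from pysptk.synthesis import MLSADF, Synthesizer

# Illustrative setup for the attributes used above (typically done in __init__):
order, alpha, hop_length = 25, 0.41, 80
synthesizer = Synthesizer(MLSADF(order=order, alpha=alpha), hop_length)
# self.alpha, self.hop_length, self.synthesizer = alpha, hop_length, synthesizer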
Example #2
    def pysptk_mfcc(self):
        self.frame_length = 1024
        self.hop_length = 80
        self.pitch = pysptk.swipe(self.audio.astype(np.float64),
                                  fs=self.sr,
                                  hopsize=self.hop_length,
                                  min=60,
                                  max=240,
                                  otype="pitch")
        self.source_excitation = pysptk.excite(self.pitch, self.hop_length)

        # Note that almost all pysptk functions assume the input array is C-contiguous with np.float64 elements
        frames = librosa.util.frame(self.audio,
                                    frame_length=self.frame_length,
                                    hop_length=self.hop_length).astype(
                                        np.float64).T

        # Windowing
        frames *= pysptk.blackman(self.frame_length)

        assert frames.shape[1] == self.frame_length

        # Order of mel-cepstrum and frequency-warping parameter
        self.order = 25
        self.alpha = 0.41

        self.mc = pysptk.mcep(frames, self.order, self.alpha)
        logH = pysptk.mgc2sp(self.mc, self.alpha, 0.0, self.frame_length).real
        librosa.display.specshow(logH.T,
                                 sr=self.sr,
                                 hop_length=self.hop_length,
                                 x_axis="time",
                                 y_axis="linear")
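`librosa.display.specshow` only draws onto the current matplotlib axes; rendering the figure still requires matplotlib. A minimal follow-up sketch:

import matplotlib.pyplot as plt

# Render the spectral envelope drawn by specshow above.
plt.title("Spectral envelope from mel-cepstrum")
plt.tight_layout()
plt.show()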
Example #3
def mixed_excitation(f0, voicing_str, hopsize):
    exc_voiced = pysptk.excite(f0.astype(np.float64), hopsize=hopsize, noise=False)
    exc_unvoiced = 2*np.random.rand(len(exc_voiced)) - 1

    exc = np.zeros(len(exc_voiced))

    for i in range(5):
        h = h_filters[i]  # h_filters: five band FIR filters defined outside this snippet (see sketch below)
        x_v = lfilter(h, 1, exc_voiced)
        x_uv = lfilter(h, 1, exc_unvoiced)

        gain_v = np.zeros(len(exc_voiced))
        gain_uv = np.zeros(len(exc_voiced))

        str_v = voicing_str[:, i]
        for k in range(len(str_v)):
            if f0[k] > 0:
                gain_v[k*hopsize:(k+1)*hopsize] = str_v[k]
                gain_uv[k*hopsize:(k+1)*hopsize] = 1.0 - str_v[k]
            else:
                gain_v[k*hopsize:(k+1)*hopsize] = 0.0
                gain_uv[k*hopsize:(k+1)*hopsize] = 1.0

        exc += (gain_v * x_v + gain_uv * x_uv)

    return exc
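`h_filters` is a free variable here: the five FIR bandpass filters of the classic mixed-excitation model. One plausible way to build them with scipy (band edges and tap count are illustrative, not from the original):

import numpy as np
from scipy.signal import firwin

# Five FIR bandpass filters covering roughly 0-8 kHz (edges illustrative).
fs = 16000
bands = [[100, 1000], [1000, 2000], [2000, 4000], [4000, 6000], [6000, 7900]]
h_filters = [firwin(65, band, fs=fs, pass_zero=False) for band in bands]

# Hypothetical call, with f0 and voicing_str coming from an analysis step:
# exc = mixed_excitation(f0, voicing_str, hopsize=80)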
Example #4
def source_excitation_generation(np_data, rate):
    pitch = ps.swipe(np_data.astype(np.float64),
                     fs=rate,
                     hopsize=HOP_LENGTH,
                     min=60,
                     max=240,
                     otype="pitch")
    source_excitation = ps.excite(pitch, HOP_LENGTH)
    return source_excitation
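A minimal call sketch, assuming `HOP_LENGTH` is defined and `ps` is the alias for `pysptk` (the file name is hypothetical):

import librosa
import pysptk as ps

HOP_LENGTH = 80  # frame shift in samples

# Load a mono wav at its native rate and generate its excitation.
np_data, rate = librosa.load("speech.wav", sr=None)
excitation = source_excitation_generation(np_data, rate)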
Example #5
def mgc_decoder_pulsenoise(pitch, mvf, mgc_coeff, resid_codebook_pca,
                           basefilename):

    # frame-wise pitch period in samples (0 marks unvoiced frames)
    T0 = np.zeros(np.min([len(pitch), len(mvf)]))
    mvf_mean = np.mean(mvf)

    for i in range(len(T0)):
        if mvf[i] < 0.4 * mvf_mean:
            T0[i] = 0
        elif pitch[i] > 0:
            T0[i] = Fs / pitch[i]

    # create source excitation using SPTK
    source = pysptk.excite(T0, frshft)

    # scale for SPTK
    scaled_source = np.float32(source / np.max(np.abs(source)))
    io_wav.write(gen_path + basefilename + '_source_pulsenoise_float32.wav',
                 Fs, scaled_source)

    command = 'sox ' + gen_path + basefilename + '_source_pulsenoise_float32.wav' + ' -t raw -r ' + str(Fs) + ' - ' + ' | ' + \
              'sptk mglsadf -P 5 -m ' + str(order) + ' -p ' + str(frshft) + \
              ' -a ' + str(alpha) + ' -c ' + str(stage) + ' ' + gen_path + basefilename + '.mgc' + ' | ' + \
              'sptk x2x +fs -o | sox -c 1 -b 16 -e signed-integer -t raw -r ' + str(Fs) + ' - -t wav -r ' + str(Fs) + ' ' + gen_path + basefilename + '_synthesized_pulsenoise_0.wav'
    run(command, shell=True)

    command = "sox -G " + gen_path + basefilename + '_synthesized_pulsenoise_0.wav' + ' ' + \
        gen_path + basefilename + '_synthesized_pulsenoise.wav'
    run(command, shell=True)

    return [0]
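The decoder reads `Fs`, `frshft`, `order`, `alpha`, `stage`, and `gen_path` from module scope; none of them appear in this snippet. A plausible configuration, borrowing the defaults that `mgc_decoder_residual` in Example #11 declares explicitly (`gen_path` is illustrative):

# Assumed module-level configuration for the decoder above.
Fs = 22050         # sampling frequency in Hz
frshft = 200       # frame shift in samples
order = 24         # MGC analysis order
alpha = 0.42       # frequency-warping parameter
stage = 3          # MGLSA stage (gamma = -1/stage)
gen_path = 'gen/'  # output directory for generated wav files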
Example #6
    frames = librosa.util.frame(x,
                                frame_length=frame_length,
                                hop_length=hop_length).astype(np.float64).T

    # Windowing
    frames *= pysptk.blackman(frame_length)

    assert frames.shape[1] == frame_length

    pitch = pysptk.swipe(x.astype(np.float64),
                         fs=sr,
                         hopsize=hop_length,
                         min=60,
                         max=240,
                         otype="pitch")
    source_excitation = pysptk.excite(pitch, hop_length)

    # Mel-cepstral analysis (order and alpha are defined outside this snippet)
    mc = pysptk.mcep(frames, order, alpha)
    logH = pysptk.mgc2sp(mc, alpha, 0.0, frame_length).real
    print(mc.shape)

    # Convert mel-cepstrum to MLSA digital filter (MLSADF) coefficients
    b = pysptk.mc2b(mc, alpha)

    synthesizer = Synthesizer(MLSADF(order=order, alpha=alpha), hop_length)
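The snippet stops right after the synthesizer is built; the natural next step, matching synthesize_from_MCEP at the top of this listing, is to drive the MLSA filter with the excitation (a sketch):

# Drive the MLSA filter with the pulse excitation to reconstruct a waveform.
y = synthesizer.synthesis(source_excitation, b)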
Example #7
def mgc_decoder_residual_with_envelope(pitch, mvf, mgc_coeff,
                                       resid_codebook_pca, basefilename,
                                       envelope_type):

    # create voiced source excitation using SPTK
    source_voiced = pysptk.excite(Fs / pitch, frshft)

    # create unvoiced source excitation using SPTK
    pitch_unvoiced = np.zeros(len(pitch))
    source_unvoiced = pysptk.excite(pitch_unvoiced, frshft)

    source = np.zeros(source_voiced.shape)

    # generate excitation frame by frame pitch synchronously
    for i in range(len(source)):
        if source_voiced[i] > 2:  # impulse location in the original pulse excitation
            mvf_index = int(i / frshft)
            mvf_curr = mvf[mvf_index]

            if mvf_curr > 7500:
                mvf_curr = 7500

            # voiced component from binary codebook
            voiced_frame_lpf = resid_codebook_pca[int((Fs / 2 - 0.95 * mvf_curr) / 100)]

            # unvoiced component by highpass filtering white noise
            if i + frlen < len(source_unvoiced):
                unvoiced_frame = source_unvoiced[i:i + len(voiced_frame_lpf)].copy()
            else:
                unvoiced_frame = source_unvoiced[i - len(voiced_frame_lpf):i].copy()

            unvoiced_frame_hpf = highpass_filter(unvoiced_frame,
                                                 mvf_curr * 1.05, Fs,
                                                 hpf_order)
            unvoiced_frame_hpf *= np.hanning(len(unvoiced_frame_hpf))

            # unvoiced component multiplied with time envelope
            unvoiced_frame_with_envelope = unvoiced_frame.copy() * apply_envelope(
                resid_codebook_pca[0], envelope_type)
            unvoiced_frame_with_envelope_hpf = highpass_filter(
                unvoiced_frame_with_envelope, mvf_curr * 1.05, Fs, hpf_order)
            unvoiced_frame_with_envelope_hpf *= np.hanning(
                len(unvoiced_frame_with_envelope_hpf))

            energy = np.linalg.norm(unvoiced_frame_with_envelope_hpf)
            unvoiced_frame_with_envelope_hpf /= energy

            # scale time envelope modulated noise by mvf
            unvoiced_frame_with_envelope_hpf *= (mvf_curr / 8000 * 2)

            # add voiced and unvoiced components at the pitch-synchronous location
            offset = i - round(len(voiced_frame_lpf) / 2)
            j_start = np.max((round(len(voiced_frame_lpf) / 2) - i, 0))
            j_end = np.min((len(voiced_frame_lpf), len(source) - offset))
            for j in range(j_start, j_end):
                source[offset + j] += voiced_frame_lpf[j]
                source[offset + j] += unvoiced_frame_hpf[j] * noise_scaling
                source[offset + j] += unvoiced_frame_with_envelope_hpf[j]

    # scale for SPTK
    scaled_source = np.float32(source / np.max(np.abs(source)))
    # scaled_source = np.float32(source)
    io_wav.write(
        gen_path + basefilename + '_source_' + envelope_type + '_float32.wav',
        Fs, scaled_source)

    command = 'sox ' + gen_path + basefilename + '_source_' + envelope_type + '_float32.wav' + ' -t raw -r ' + str(Fs) + ' - ' + ' | ' + \
              'sptk mglsadf -P 5 -m ' + str(order) + ' -p ' + str(frshft) + \
              ' -a ' + str(alpha) + ' -c ' + str(stage) + ' ' + gen_path + basefilename + '.mgc' + ' | ' + \
              'sptk x2x +fs -o | sox -c 1 -b 16 -e signed-integer -t raw -r ' + str(Fs) + ' - -t wav -r ' + str(Fs) + ' ' + gen_path + basefilename + '_synthesized_with_' + envelope_type + '_0.wav'
    run(command, shell=True)

    command = "sox -G " + gen_path + basefilename + '_synthesized_with_' + envelope_type + '_0.wav' + ' ' + \
        gen_path + basefilename + '_synthesized_with_' + envelope_type + '.wav ' + 'gain -n 0'
    run(command, shell=True)

    return [0]
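`highpass_filter`, `apply_envelope`, `noise_scaling`, and `hpf_order` are project helpers and globals not shown in this listing. As a rough stand-in for the filtering step only, `highpass_filter` could be a Butterworth design (an assumption; the project's actual helper may differ):

from scipy.signal import butter, lfilter

def highpass_filter(signal, cutoff_hz, fs, filt_order):
    # Butterworth high-pass with the cutoff normalized to Nyquist.
    b, a = butter(filt_order, cutoff_hz / (fs / 2.0), btype='highpass')
    return lfilter(b, a, signal)

The `lowpass_filter` used in Example #8 would be the same sketch with btype='lowpass'.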
Example #8
def mgc_filter_residual(pitch, mvf, mgc_coeff, resid_codebook_pca,
                        basefilename):

    in_wav = gen_path + basefilename + '.wav'
    in_raw = gen_path + basefilename + '.raw'
    in_mgcep = gen_path + basefilename + '.mgc'
    in_resid = gen_path + basefilename + '_residual_original.wav'
    out_resid = gen_path + basefilename + '_residual_filtered.wav'

    # wav -> raw
    command = 'sox -c 1 -e signed-integer -b 16 -t wav ' + in_wav + \
        ' -c 1 -e signed-integer -b 16 -t raw -r ' + str(Fs) + ' ' + in_raw
    print('wav -> raw, ' + in_wav)
    call(command, shell=True)

    # raw, mgcep -> residual
    command = 'sptk x2x +sf ' + in_raw + ' | ' + \
        'sptk mglsadf -k -v -a ' + str(alpha) + ' -c 3 -m ' + str(order) + ' -p ' + \
        str(frshft) + ' ' + in_mgcep + ' | ' + \
        'sptk x2x +fs | sox -c 1 -e signed-integer -b 16 -t raw -r ' + str(Fs) + ' - ' + \
        '-c 1 -e signed-integer -b 16 -t wav -r ' + str(Fs) + ' ' + in_resid
    print('raw, mgcep -> resid.wav, ' + in_wav)
    call(command, shell=True)

    (Fs_, x_residual) = io_wav.read(in_resid)

    plt.plot(x_residual[0:Fs], 'r')
    plt.show()

    # create voiced source excitation using SPTK
    source_voiced = pysptk.excite(Fs / pitch, frshft)

    source_upper = np.zeros(source_voiced.shape)
    source_lower = np.zeros(source_voiced.shape)

    # generate excitation frame by frame pitch synchronously
    for i in range(len(source_upper)):
        if source_voiced[i] > 2:  # impulse location in the original pulse excitation
            mvf_index = int(i / frshft)
            mvf_curr = mvf[mvf_index]
            T0_curr = int(Fs / pitch[mvf_index])

            if i > T0_curr and i + 2 * T0_curr < len(source_upper):
                residual_frame = x_residual[i - T0_curr:i + T0_curr]
                residual_frame_upper = highpass_filter(residual_frame,
                                                       mvf_curr * 1.05, Fs,
                                                       hpf_order)
                residual_frame_upper *= np.hanning(len(residual_frame_upper))
                source_upper[i - T0_curr:i + T0_curr] += residual_frame_upper

                residual_frame_lower = lowpass_filter(residual_frame,
                                                      mvf_curr * 0.95, Fs,
                                                      lpf_order)
                residual_frame_lower *= np.hanning(len(residual_frame_lower))
                source_lower[i - T0_curr:i + T0_curr] += residual_frame_lower

    # upper frequency band
    scaled_source = np.float32(source_upper / np.max(np.abs(source_upper)))
    io_wav.write(gen_path + basefilename + '_residual_upper_float32.wav', Fs,
                 scaled_source)

    command = 'sox ' + gen_path + basefilename + '_residual_upper_float32.wav' + ' -t raw -r ' + str(Fs) + ' - ' + ' | ' + \
              'sptk mglsadf -P 5 -m ' + str(order) + ' -p ' + str(frshft) + \
              ' -a ' + str(alpha) + ' -c ' + str(stage) + ' ' + gen_path + basefilename + '.mgc' + ' | ' + \
              'sptk x2x +fs -o | sox -c 1 -b 16 -e signed-integer -t raw -r ' + str(Fs) + ' - -t wav -r ' + str(Fs) + ' ' + gen_path + basefilename + '_synthesized_based_on_residual_0.wav'
    run(command, shell=True)

    command = "sox -G " + gen_path + basefilename + '_synthesized_based_on_residual_0.wav' + ' ' + \
        gen_path + basefilename + '_synthesized_based_on_residual_upper.wav'
    run(command, shell=True)

    # lower frequency band
    scaled_source = np.float32(source_lower / np.max(np.abs(source_lower)))
    io_wav.write(gen_path + basefilename + '_residual_lower_float32.wav', Fs,
                 scaled_source)

    command = 'sox ' + gen_path + basefilename + '_residual_lower_float32.wav' + ' -t raw -r ' + str(Fs) + ' - ' + ' | ' + \
              'sptk mglsadf -P 5 -m ' + str(order) + ' -p ' + str(frshft) + \
              ' -a ' + str(alpha) + ' -c ' + str(stage) + ' ' + gen_path + basefilename + '.mgc' + ' | ' + \
              'sptk x2x +fs -o | sox -c 1 -b 16 -e signed-integer -t raw -r ' + str(Fs) + ' - -t wav -r ' + str(Fs) + ' ' + gen_path + basefilename + '_synthesized_based_on_residual_0.wav'
    run(command, shell=True)

    command = "sox -G " + gen_path + basefilename + '_synthesized_based_on_residual_0.wav' + ' ' + \
        gen_path + basefilename + '_synthesized_based_on_residual_lower.wav'
    run(command, shell=True)

    # upper and lower frequency band added together
    source = source_lower + source_upper
    scaled_source = np.float32(source / np.max(np.abs(source)))
    io_wav.write(gen_path + basefilename + '_residual_float32.wav', Fs,
                 scaled_source)

    command = 'sox ' + gen_path + basefilename + '_residual_float32.wav' + ' -t raw -r ' + str(Fs) + ' - ' + ' | ' + \
              'sptk mglsadf -P 5 -m ' + str(order) + ' -p ' + str(frshft) + \
              ' -a ' + str(alpha) + ' -c ' + str(stage) + ' ' + gen_path + basefilename + '.mgc' + ' | ' + \
              'sptk x2x +fs -o | sox -c 1 -b 16 -e signed-integer -t raw -r ' + str(Fs) + ' - -t wav -r ' + str(Fs) + ' ' + gen_path + basefilename + '_synthesized_based_on_residual_0.wav'
    run(command, shell=True)

    command = "sox -G " + gen_path + basefilename + '_synthesized_based_on_residual_0.wav' + ' ' + \
        gen_path + basefilename + '_synthesized_based_on_residual.wav'
    run(command, shell=True)

    return [0]
Example #9
# Framing and windowing the audio
frames = librosa.util.frame(x,
                            frame_length=FRAME_LENGTH,
                            hop_length=HOP_LENGTH).astype(np.float64).T
frames *= pysptk.blackman(FRAME_LENGTH)  # windowing (Blackman window)

# Pitch extraction
pitch = pysptk.swipe(x,
                     fs=fs,
                     hopsize=HOP_LENGTH,
                     min=MIN_F0,
                     max=MAX_F0,
                     otype="pitch")

# Generate the excitation signal (glottal source)
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# Extract linear predictive coding (LPC) coefficients by linear prediction analysis
lpc = pysptk.lpc(frames, ORDER)
lpc[:, 0] = np.log(lpc[:, 0])  # the synthesis filter expects the gain term in the log domain

# Convert the LPC coefficients to PARCOR coefficients
parcor = pysptk.lpc2par(lpc)

# Create the all-pole lattice filter
synthesizer = Synthesizer(AllPoleLatticeDF(order=ORDER), HOP_LENGTH)

# Drive the filter with the excitation signal to synthesize speech
y = synthesizer.synthesis(source_excitation, parcor)

# Write the audio
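The write-out code itself did not survive in this listing. A minimal sketch using scipy.io.wavfile (an assumption, since the original writer is not shown; the file name is hypothetical):

import numpy as np
from scipy.io import wavfile

# Peak-normalize to the int16 range and write the synthesized waveform.
y_int16 = (y / np.max(np.abs(y)) * 32767).astype(np.int16)
wavfile.write("synthesized_lpc.wav", fs, y_int16)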
Example #10

# Framing and windowing the audio
frames = librosa.util.frame(x,
                            frame_length=FRAME_LENGTH,
                            hop_length=HOP_LENGTH).astype(np.float64).T
frames *= pysptk.blackman(FRAME_LENGTH)  # windowing (Blackman window)

# Pitch extraction
pitch = pysptk.swipe(x,
                     fs=fs,
                     hopsize=HOP_LENGTH,
                     min=MIN_F0,
                     max=MAX_F0,
                     otype="pitch")

# Generate the excitation signal (glottal source)
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# Mel-cepstral analysis (i.e., spectral envelope extraction)
mc = pysptk.mcep(frames, ORDER, ALPHA)

# Convert the mel-cepstral coefficients to MLSA digital filter coefficients
mlsa_coef = pysptk.mc2b(mc, ALPHA)

# Create the MLSA filter
synthesizer = Synthesizer(MLSADF(order=ORDER, alpha=ALPHA), HOP_LENGTH)

# #### From here on, various voices are synthesized by changing the synthesis filter parameters

# ### Pitch shift (raise the pitch) ###
OUT_WAVE_FILE = "pitchshift_high.wav"
PITCH_SHIFT = 0.5  # a factor smaller than 1 raises the pitch
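The listing cuts off before the shift is applied. Because otype="pitch" makes swipe return the pitch period (fs/f0) per frame, raising the pitch means shrinking that period before regenerating the excitation; a sketch of the presumable continuation:

# Shrink the pitch period (fs/f0) to raise f0, then rebuild the excitation.
pitch_shifted = pitch * PITCH_SHIFT
excitation_shifted = pysptk.excite(pitch_shifted, HOP_LENGTH)

# Re-synthesize with the unchanged spectral envelope (MLSA coefficients).
y_high = synthesizer.synthesis(excitation_shifted, mlsa_coef)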
Example #11
def mgc_decoder_residual(mgc_lsp_coeff,
                         log_f0cont,
                         log_mvf,
                         basefilename_out,
                         resid_codebook_pca,
                         Fs_codebook=16000,
                         Fs=22050,
                         frlen=512,
                         frshft=200,
                         order=24,
                         alpha=0.42,
                         stage=3,
                         hpf_order=11,
                         noise_scaling=0.04):

    pitch = np.float64(np.exp(log_f0cont))
    mvf = np.exp(log_mvf)

    # create voiced source excitation using SPTK
    source_voiced = pysptk.excite(Fs / pitch, frshft)

    # create unvoiced source excitation using SPTK
    pitch_unvoiced = np.zeros(len(pitch))
    source_unvoiced = pysptk.excite(pitch_unvoiced, frshft)

    source = np.zeros(source_voiced.shape)

    # generate excitation frame by frame pitch synchronously

    # voiced component
    for i in range(len(source)):
        if source_voiced[i] > 2:  # impulse location in the original pulse excitation
            mvf_index = int(i / frshft)
            mvf_curr = mvf[mvf_index]

            if mvf_curr > Fs_codebook / 2:
                mvf_curr = Fs_codebook / 2

            # voiced component from residual codebook
            voiced_frame_lpf = resid_codebook_pca[int((Fs_codebook / 2 - mvf_curr) / 100)]

            # add the voiced component at the pitch-synchronous location
            offset = i - round(len(voiced_frame_lpf) / 2)
            j_start = np.max((round(len(voiced_frame_lpf) / 2) - i, 0))
            j_end = np.min((len(voiced_frame_lpf), len(source) - offset))
            for j in range(j_start, j_end):
                source[offset + j] += voiced_frame_lpf[j]

    # unvoiced component
    for i in range(len(mvf)):
        unvoiced_frame = source_unvoiced[i * frshft:(i + 2) * frshft].copy()
        mvf_curr = mvf[i]
        unvoiced_frame_hpf = highpass_filter(unvoiced_frame, mvf_curr * 1.2,
                                             Fs, hpf_order)
        unvoiced_frame_hpf *= np.hanning(len(unvoiced_frame_hpf))

        source[i * frshft:(i + 2) * frshft] += unvoiced_frame_hpf * noise_scaling

    # scale for SPTK
    scaled_source = np.float32(source / np.max(np.abs(source)))
    io_wav.write(basefilename_out + '_source_float32.wav', Fs, scaled_source)

    # write files for SPTK
    mgc_lsp_coeff.astype('float32').tofile(basefilename_out + '.mgclsp')

    # MGC-LSPs -> MGC coefficients
    command = 'lspcheck -m ' + str(order) + ' -s ' + str(Fs / 1000) + ' -c -r 0.1 -g -G 1.0E-10 ' + basefilename_out + '.mgclsp' + ' | ' + \
              'lsp2lpc -m '  + str(order) + ' -s ' + str(Fs / 1000) + ' | ' + \
              'mgc2mgc -m '  + str(order) + ' -a ' + str(alpha) + ' -c ' + str(stage) + ' -n -u ' + \
                      '-M '  + str(order) + ' -A ' + str(alpha) + ' -C ' + str(stage) + ' > ' + basefilename_out + '.mgc'
    run(command, shell=True)

    command = 'sox ' + basefilename_out + '_source_float32.wav' + ' -t raw -r ' + str(Fs) + ' - ' + ' | ' + \
              'mglsadf -P 5 -m ' + str(order) + ' -p ' + str(frshft) + \
              ' -a ' + str(alpha) + ' -c ' + str(stage) + ' ' + basefilename_out + '.mgc' + ' | ' + \
              'x2x +fs -o | sox -c 1 -b 16 -e signed-integer -t raw -r ' + str(Fs) + ' - -t wav -r ' + str(Fs) + ' ' + basefilename_out + '_0.wav'
    run(command, shell=True)

    # normalize gain
    command = "sox --norm=-3 " + basefilename_out + '_0.wav' + ' ' + \
        basefilename_out + '.wav'
    run(command, shell=True)

    # remove temp files
    os.remove(basefilename_out + '_0.wav')
    os.remove(basefilename_out + '.mgc')
    os.remove(basefilename_out + '.mgclsp')
    os.remove(basefilename_out + '_source_float32.wav')

    # read file for output
    (Fs_out, x_synthesized) = io_wav.read(basefilename_out + '.wav')

    return x_synthesized
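A call sketch, assuming `mgc_lsp_coeff`, `log_f0cont`, `log_mvf`, and `resid_codebook_pca` come from the matching analysis or prediction stage (all inputs hypothetical here):

# Hypothetical decoder call; inputs come from an analysis stage.
x_syn = mgc_decoder_residual(mgc_lsp_coeff, log_f0cont, log_mvf,
                             'gen/sample01', resid_codebook_pca)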