Esempio n. 1
0
    def __init__(
            self,
            acoustic_feature_param: AcousticFeatureParam,
            out_sampling_rate: int,
            buffer_size: int,
            number_of_pointers: int,
    ):
        """Initialise the base class and a native WORLD synthesizer.

        Args:
            acoustic_feature_param: Forwarded to the base class; its
                ``frame_period`` is passed to the native initialiser.
            out_sampling_rate: Forwarded to the base class and used to
                pick the CheapTrick FFT size.
            buffer_size: Stored on the instance and passed to the native
                initialiser.
            number_of_pointers: Passed to the native initialiser.
        """
        # Imported lazily so the native bindings are only needed here.
        from world4py.native import structures, apidefinitions

        super().__init__(
            acoustic_feature_param=acoustic_feature_param,
            out_sampling_rate=out_sampling_rate,
        )
        self.buffer_size = buffer_size

        synthesizer = structures.WorldSynthesizer()
        self._synthesizer = synthesizer
        apidefinitions._InitializeSynthesizer(
            self.out_sampling_rate,
            self.acoustic_feature_param.frame_period,
            pyworld.get_cheaptrick_fft_size(out_sampling_rate),
            buffer_size,
            number_of_pointers,
            synthesizer,
        )

        # Starts empty; the original note says it exists to hold memory.
        self._before_buffer = []
Esempio n. 2
0
def gen_waveform(y_predicted, Y_mean, Y_std, post_filter=False, coef=1.4,
                 fs=16000, mge_training=True):
    """Synthesise a peak-normalised waveform from predicted acoustic features.

    Args:
        y_predicted, Y_mean, Y_std, mge_training: Forwarded unchanged to
            ``gen_parameters``, which returns the mgc/lf0/vuv/bap streams.
        post_filter: When true, ``merlin_post_filter`` is applied to mgc.
        coef: Coefficient forwarded to ``merlin_post_filter``.
        fs: Sampling rate used for alpha, FFT size and synthesis.

    Returns:
        Tuple ``(waveform, mgc, lf0, vuv, bap)``. The waveform is scaled so
        its absolute peak is 32767; an all-zero or empty waveform is returned
        unscaled instead of being divided by zero.
    """
    alpha = pysptk.util.mcepalpha(fs)
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    frame_period = hp_acoustic.frame_period

    # Generate parameters and split streams
    mgc, lf0, vuv, bap = gen_parameters(y_predicted, Y_mean, Y_std, mge_training)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha, coef=coef)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), fs, fftlen)

    # Frames with vuv < 0.5 get f0 = 0; the remaining log-f0 values are
    # exponentiated in place.
    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    nonzero = np.nonzero(f0)
    f0[nonzero] = np.exp(f0[nonzero])

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            fs, frame_period)

    # Scale the peak to the int16 maximum. A silent (or empty) signal has no
    # peak to normalise by, so it is left untouched rather than turned into NaN.
    peak = np.max(np.abs(generated_waveform)) if generated_waveform.size else 0.0
    if peak > 0:
        generated_waveform = generated_waveform / peak * 32767

    # Return the features as well so natural/generated can be compared later.
    return generated_waveform, mgc, lf0, vuv, bap
Esempio n. 3
0
def gen_waveform(y_predicted, Y_mean, Y_std, post_filter=False, fs=16000):
    """Synthesise a waveform from predicted features, clipped to int16 range.

    Args:
        y_predicted, Y_mean, Y_std: Forwarded unchanged to ``gen_parameters``,
            which returns the mgc/lf0/vuv/bap streams.
        post_filter: When true, ``merlin_post_filter`` is applied to mgc.
        fs: Sampling rate used for alpha, FFT size and synthesis.

    Returns:
        Tuple ``(waveform, mgc, lf0, vuv, bap)``; the waveform is clipped to
        ``[-32768, 32767]``.
    """
    alpha = pysptk.util.mcepalpha(fs)
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    frame_period = hp_acoustic.frame_period

    # Generate parameters and split streams
    mgc, lf0, vuv, bap = gen_parameters(y_predicted, Y_mean, Y_std)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), fs,
                                               fftlen)

    # Frames with vuv < 0.5 get f0 = 0; the remaining log-f0 values are
    # exponentiated in place.
    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    nonzero = np.nonzero(f0)
    f0[nonzero] = np.exp(f0[nonzero])

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            fs, frame_period)
    # The largest int16 value is 32767; an upper bound of 32768 would wrap
    # around if the result is later cast to int16.
    generated_waveform = np.clip(generated_waveform, -32768, 32767)

    # Return the features as well so natural/generated can be compared later.
    return generated_waveform, mgc, lf0, vuv, bap
Esempio n. 4
0
def world_spectrogram(wav, sr=_sr, dim_num=32, **kwargs):
    """Convert speech to WORLD vocoder features.

    Args:
        wav: Waveform; it is cast to ``np.double`` before analysis.
        sr: Sampling rate.
        dim_num: Number of dimensions for the coded spectral envelope.
        **kwargs: Optional overrides: ``frame_period``, ``f0_floor``,
            ``f0_ceil``, ``fft_size``, ``ap_threshold`` and ``f0_extractor``
            (``"dio"``, ``"harvest"`` or a callable with the same call
            signature as used below).

    Returns:
        Tuple ``(f0, sp_enc, ap_enc)``: F0 contour, coded spectral envelope
        and coded aperiodicity.
    """
    # Collect the analysis parameters.
    frame_period = kwargs.get("frame_period", pw.default_frame_period)
    f0_floor = kwargs.get("f0_floor", pw.default_f0_floor)
    f0_ceil = kwargs.get("f0_ceil", pw.default_f0_ceil)
    fft_size = kwargs.get("fft_size", pw.get_cheaptrick_fft_size(sr, f0_floor))
    ap_threshold = kwargs.get("ap_threshold", 0.85)
    f0_extractor = kwargs.get("f0_extractor", "dio")
    x = wav.astype(np.double)
    if f0_extractor == "dio":
        # Estimate F0 with DIO. frame_period is forwarded so that a
        # caller-supplied value is honoured here just like in the other
        # branches (previously DIO silently used its own default).
        f0, t = pw.dio(x, sr, f0_floor=f0_floor, f0_ceil=f0_ceil, frame_period=frame_period)
    elif f0_extractor == "harvest":
        f0, t = pw.harvest(x, sr, f0_floor=f0_floor, f0_ceil=f0_ceil, frame_period=frame_period)
    else:
        f0, t = f0_extractor(x, sr, f0_floor=f0_floor, f0_ceil=f0_ceil, frame_period=frame_period)

    # Estimate the spectral envelope with CheapTrick.
    sp = pw.cheaptrick(x, f0, t, sr, f0_floor=f0_floor, fft_size=fft_size)
    # Reduce the spectral envelope dimensionality.
    sp_enc = pw.code_spectral_envelope(sp, sr, number_of_dimensions=dim_num)

    # Estimate the aperiodicity with D4C.
    ap = pw.d4c(x, f0, t, sr, threshold=ap_threshold, fft_size=fft_size)
    # Reduce the aperiodicity dimensionality.
    ap_enc = pw.code_aperiodicity(ap, sr)
    return f0, sp_enc, ap_enc
Esempio n. 5
0
    def convert(self, in_feature: AcousticFeature):
        """Run the model on ``in_feature`` and rebuild a spectrogram from it.

        The encoded input is padded along axis 1 so its length is a multiple
        of 128, pushed through the model in inference mode, and the padding
        is sliced off again. Aperiodicity and the voiced flags are copied
        from ``in_feature``; f0 is zeroed where the voiced flag is false.

        Returns:
            The decoded feature with ``sp`` recomputed from ``mc`` and float
            fields cast to ``numpy.float64``.
        """
        encoded = self._encode_feature(in_feature)

        # pad is always in 1..128, so the ``[:, :-pad]`` slice below is valid.
        pad = 128 - encoded.shape[1] % 128
        encoded = numpy.pad(encoded, [(0, 0), (0, pad)], mode='minimum')

        to_batch = partial(chainer.dataset.convert.concat_examples, device=self.gpu, padding=0)
        batch = to_batch([encoded])

        with chainer.using_config('train', False):
            predicted = self.model(batch).data[0]

        if self.gpu is not None:
            predicted = chainer.cuda.to_cpu(predicted)
        predicted = predicted[:, :-pad]

        out = self._decode_feature(predicted)
        out.ap = in_feature.ap
        out.voiced = in_feature.voiced
        out.f0[~out.voiced] = 0

        fft_size = pyworld.get_cheaptrick_fft_size(self.out_sampling_rate)
        out.sp = pysptk.mc2sp(
            out.mc,
            alpha=self._param.alpha,
            fftlen=fft_size,
        )

        return out.astype_only_float(numpy.float64)
Esempio n. 6
0
def world_decode_spectral_envelop(coded_sp, fs):
    """Decode ``coded_sp`` with pyworld, using the CheapTrick FFT size for ``fs``."""
    fft_size = pyworld.get_cheaptrick_fft_size(fs)
    return pyworld.decode_spectral_envelope(coded_sp, fs, fft_size)
Esempio n. 7
0
def gen_waveform(y_predicted,
                 Y_mean,
                 Y_std,
                 post_filter=False,
                 coef=1.4,
                 fs=16000,
                 mge_training=False):
    """Synthesise a waveform from predicted acoustic features.

    Args:
        y_predicted, Y_mean, Y_std, mge_training: Forwarded unchanged to
            ``gen_parameters``, which returns the mgc/lf0/vuv/bap streams.
        post_filter: When true, ``merlin_post_filter`` is applied to mgc.
        coef: Coefficient forwarded to ``merlin_post_filter``.
        fs: Sampling rate used for alpha, FFT size and synthesis.

    Returns:
        The waveform exactly as returned by ``pyworld.synthesize``; no
        scaling or clipping is applied and the streams are not returned.
    """
    alpha = pysptk.util.mcepalpha(fs)
    fft_size = pyworld.get_cheaptrick_fft_size(fs)
    frame_period = audio_world_config.frame_period

    # Generate parameters and split them into streams.
    mgc, lf0, vuv, bap = gen_parameters(y_predicted, Y_mean, Y_std,
                                        mge_training)
    if post_filter:
        mgc = merlin_post_filter(mgc, alpha, coef=coef)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fft_size, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(
        bap.astype(np.float64), fs, fft_size)

    # Frames with vuv < 0.5 get f0 = 0; the rest is exponentiated in place.
    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    voiced_idx = np.nonzero(f0)
    f0[voiced_idx] = np.exp(f0[voiced_idx])

    return pyworld.synthesize(
        f0.flatten().astype(np.float64),
        spectrogram.astype(np.float64),
        aperiodicity.astype(np.float64),
        fs,
        frame_period,
    )
Esempio n. 8
0
def world2wav(feature, frame_period):
    """Synthesise a waveform from a stacked WORLD feature matrix.

    The columns of ``feature`` are laid out as mgc, lf0, vuv, bap with the
    widths taken from ``hp`` (``num_mgc``, ``num_lf0``, ``num_vuv``,
    ``num_bap``).

    Args:
        feature: 2-D array indexed as ``feature[:, start:stop]``. It is not
            modified by this function.
        frame_period: Frame period forwarded to ``pyworld.synthesize``.

    Returns:
        The waveform returned by ``pyworld.synthesize``.
    """
    hparams = hp
    mgc_idx = 0
    lf0_idx = mgc_idx + hparams.num_mgc
    vuv_idx = lf0_idx + hparams.num_lf0
    bap_idx = vuv_idx + hparams.num_vuv

    mgc = feature[:, mgc_idx:mgc_idx + hparams.num_mgc]
    lf0 = feature[:, lf0_idx:lf0_idx + hparams.num_lf0]
    vuv = feature[:, vuv_idx:vuv_idx + hparams.num_vuv]
    # Copy: bap is edited below, and a plain slice would be a view that
    # writes straight through into the caller's ``feature`` array.
    bap = feature[:, bap_idx:bap_idx + hparams.num_bap].copy()

    fs = hparams.sample_rate
    alpha = pysptk.util.mcepalpha(fs)
    fftlen = pyworld.get_cheaptrick_fft_size(fs)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)

    # Zero the band aperiodicity of frames with vuv < 0.5.
    indexes = (vuv < 0.5).flatten()
    bap[indexes] = np.zeros(hparams.num_bap)

    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), fs,
                                               fftlen)

    # Frames with vuv < 0.5 get f0 = 0; the remaining log-f0 values are
    # exponentiated in place.
    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    nonzero = np.nonzero(f0)
    f0[nonzero] = np.exp(f0[nonzero])

    return pyworld.synthesize(f0.flatten().astype(np.float64),
                              spectrogram.astype(np.float64),
                              aperiodicity.astype(np.float64), fs,
                              frame_period)
Esempio n. 9
0
 def mgc_to_sp(mgc, synth_fs):
     """Convert mel-generalised cepstrum to a spectrum via ``pysptk.mgc2sp``.

     The input is made contiguous float64 first; gamma is fixed to 0.0 and
     alpha comes from ``WorldFeatLabelGen.mgc_alpha``.
     """
     fft_size = pyworld.get_cheaptrick_fft_size(synth_fs)
     contiguous_mgc = np.ascontiguousarray(mgc, dtype=np.float64)
     return pysptk.mgc2sp(
         contiguous_mgc,
         alpha=WorldFeatLabelGen.mgc_alpha,
         gamma=0.0,
         fftlen=fft_size,
     )
def generate_file(path):
    """Build low/high spectrograms for one wave file and save them as ``.npy``.

    The output goes to ``arguments.output_directory`` under the input's stem.
    Nothing is done when that file already exists and overwriting is not
    enabled. All settings are read from the module-level ``arguments``.
    """
    out = Path(arguments.output_directory, path.stem + '.npy')
    skip = out.exists() and not arguments.enable_overwrite
    if skip:
        return

    # Load the wave (with padding).
    load_wave = WaveFileLoadProcess(
        sample_rate=arguments.sample_rate,
        top_db=arguments.top_db,
        pad_second=arguments.pad_second,
    )
    wave = load_wave(path, test=True)

    # Extract the acoustic feature as float32.
    extract_feature = AcousticFeatureProcess(
        frame_period=arguments.frame_period,
        order=arguments.order,
        alpha=arguments.alpha,
        f0_estimating_method=arguments.f0_estimating_method,
    )
    feature = extract_feature(wave, test=True).astype_only_float(numpy.float32)
    high_spectrogram = feature.spectrogram

    # The "low" spectrogram is rebuilt from the feature's mfcc field.
    fft_size = pyworld.get_cheaptrick_fft_size(arguments.sample_rate)
    low_spectrogram = pysptk.mc2sp(
        feature.mfcc,
        alpha=arguments.alpha,
        fftlen=fft_size,
    )

    # Save both under the keys 'low' and 'high'.
    payload = {
        'low': low_spectrogram,
        'high': high_spectrogram,
    }
    numpy.save(out.absolute(), payload)
def world_decode_spectral_env(spectral_env_mel, settings):
    """Decode a spectral envelope from its DCT-transformed coded form.

    ``settings`` must provide ``'coded_dim'`` (used to scale the DCT output)
    and ``'sample_rate'``.
    """
    scale = np.sqrt(settings['coded_dim'] * 2)
    mfcc = dct(spectral_env_mel) / scale
    sample_rate = settings['sample_rate']
    fft_size = pyworld.get_cheaptrick_fft_size(sample_rate)
    return pyworld.decode_spectral_envelope(mfcc, settings['sample_rate'], fft_size)
def world_decode_spectral_envelop(coded_sp, fs):
    """Return the spectral envelope decoded from ``coded_sp`` at rate ``fs``.

    The FFT size is the CheapTrick default for ``fs``.
    """
    return pyworld.decode_spectral_envelope(
        coded_sp,
        fs,
        pyworld.get_cheaptrick_fft_size(fs),
    )
Esempio n. 13
0
 def decode_spectrogram(self, feature: AcousticFeature):
     """Fill ``feature.sp`` from ``feature.mc`` and return the same object.

     The mel-cepstrum is cast to float32 before conversion; alpha is derived
     from ``self.out_sampling_rate``.
     """
     sampling_rate = self.out_sampling_rate
     fft_size = pyworld.get_cheaptrick_fft_size(sampling_rate)
     mel_cepstrum = feature.mc.astype(numpy.float32)
     feature.sp = pysptk.mc2sp(
         mel_cepstrum,
         alpha=pysptk.util.mcepalpha(self.out_sampling_rate),
         fftlen=fft_size,
     )
     return feature
Esempio n. 14
0
    def fs_to_frame_length(fs):
        """
        Map a sampling rate to the frame length used for the STFT.

        The value is the CheapTrick FFT size reported by pyworld. The
        original note cites Merlin's
        /misc/scripts/vocoder/world/extract_features_for_merlin.sh as the
        basis for this mapping.
        """
        frame_length = pyworld.get_cheaptrick_fft_size(fs)
        return frame_length
Esempio n. 15
0
def convertFeaturesIntoWav(f0seq, MCEPseq, APseq, fs, frame_period=5.0):
    """Synthesise a float32 waveform from f0, coded spectrum and aperiodicity.

    ``MCEPseq`` is transposed and made contiguous float64 before being
    decoded into a spectrogram; ``f0seq`` and ``APseq`` are passed to
    ``pyworld.synthesize`` unchanged.
    """
    coded_sp = np.ascontiguousarray(MCEPseq.T, dtype=np.float64)
    fft_size = pyworld.get_cheaptrick_fft_size(fs)
    spectrogram = pyworld.decode_spectral_envelope(coded_sp, fs, fft_size)
    waveform = pyworld.synthesize(f0seq, spectrogram, APseq, fs, frame_period)
    return waveform.astype(np.float32)
Esempio n. 16
0
def gen_waveform(labels, acoustic_features, acoustic_out_scaler,
        binary_dict, continuous_dict, stream_sizes, has_dynamic_features,
        subphone_features="coarse_coding", log_f0_conditioning=True, pitch_idx=None,
        num_windows=3, post_filter=True, sample_rate=48000, frame_period=5,
        relative_f0=True):
    """Generate a waveform with the WORLD vocoder from acoustic features.

    Steps: optional MLPG when any stream has dynamic features, split into
    mgc / f0 / vuv / bap streams, optional Merlin post-filter, then WORLD
    synthesis.

    With ``relative_f0`` the f0 stream is added to a log-f0 contour derived
    from the score pitch (``pitch_idx`` in the linguistic features); frames
    with vuv < 0.5 are zeroed and the rest exponentiated. Otherwise the f0
    stream is used as is.

    ``log_f0_conditioning`` is accepted but not used in this function.

    Returns:
        The waveform returned by ``pyworld.synthesize``.
    """
    windows = get_windows(num_windows)

    # Apply MLPG only if some stream carries dynamic features.
    static_stream_sizes = stream_sizes
    if np.any(has_dynamic_features):
        acoustic_features = multi_stream_mlpg(
            acoustic_features, acoustic_out_scaler.var_, windows, stream_sizes,
            has_dynamic_features)
        static_stream_sizes = get_static_stream_sizes(
            stream_sizes, has_dynamic_features, len(windows))

    # Split the multi-stream features.
    mgc, target_f0, vuv, bap = split_streams(acoustic_features, static_stream_sizes)

    # WORLD vocoder parameters.
    fft_size = pyworld.get_cheaptrick_fft_size(sample_rate)
    alpha = pysptk.util.mcepalpha(sample_rate)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fft_size, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), sample_rate, fft_size)

    # F0
    if not relative_f0:
        # NOTE(review): this branch applies neither the vuv mask nor exp();
        # confirm that target_f0 is already linear f0 here.
        f0 = target_f0
    else:
        # The pitch sequence has to be extracted from the musical score.
        linguistic_features = fe.linguistic_features(labels,
                                                    binary_dict, continuous_dict,
                                                    add_frame_features=True,
                                                    subphone_features=subphone_features)
        f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None]
        lf0_score = f0_score.copy()
        score_nonzero = np.nonzero(lf0_score)
        lf0_score[score_nonzero] = np.log(f0_score[score_nonzero])
        lf0_score = interp1d(lf0_score, kind="slinear")

        f0 = target_f0 + lf0_score
        f0[vuv < 0.5] = 0
        voiced_idx = np.nonzero(f0)
        f0[voiced_idx] = np.exp(f0[voiced_idx])

    return pyworld.synthesize(f0.flatten().astype(np.float64),
                              spectrogram.astype(np.float64),
                              aperiodicity.astype(np.float64),
                              sample_rate, frame_period)
Esempio n. 17
0
 def get_sizes(sampling_rate: int, order: int):
     """Return the per-frame width of each acoustic feature field.

     Spectrogram and aperiodicity have ``fft_size // 2 + 1`` bins, where
     ``fft_size`` is the CheapTrick FFT size for ``sampling_rate``; mfcc has
     ``order + 1`` coefficients; f0 and voiced are scalars.
     """
     fft_size = pyworld.get_cheaptrick_fft_size(fs=sampling_rate)
     spectrum_bins = fft_size // 2 + 1
     return {
         'f0': 1,
         'spectrogram': spectrum_bins,
         'aperiodicity': spectrum_bins,
         'mfcc': order + 1,
         'voiced': 1,
     }
Esempio n. 18
0
def inv_world_spectrogram(f0, sp, ap, sr=_sr, **kwargs):
    """Convert WORLD vocoder features back to speech.

    ``sp`` and ``ap`` are the coded spectral envelope and coded aperiodicity;
    both are decoded at ``fft_size`` before synthesis. Optional keyword
    overrides: ``frame_period``, ``f0_floor`` and ``fft_size``.
    """
    frame_period = kwargs.get("frame_period", pw.default_frame_period)
    f0_floor = kwargs.get("f0_floor", pw.default_f0_floor)
    fft_size = kwargs.get("fft_size", pw.get_cheaptrick_fft_size(sr, f0_floor))

    decoded_sp = pw.decode_spectral_envelope(sp, sr, fft_size=fft_size)
    decoded_ap = pw.decode_aperiodicity(ap, sr, fft_size=fft_size)
    return pw.synthesize(f0, decoded_sp, decoded_ap, sr, frame_period=frame_period)
    def convert_to_feature(self,
                           input: AcousticFeature,
                           out_sampling_rate: Optional[int] = None):
        """Convert an acoustic feature with the model and return it as float64.

        Args:
            input: Source feature. Its ``voiced`` and ``aperiodicity`` fields
                are carried over to the result.
            out_sampling_rate: Rate used to pick the FFT size for the rebuilt
                spectrogram; defaults to the sample rate in the config.

        Returns:
            An ``AcousticFeature`` whose spectrogram is rebuilt from the
            converted mfcc, cast to ``numpy.float64``.
        """
        if out_sampling_rate is None:
            out_sampling_rate = self.config.dataset.param.voice_param.sample_rate

        input_feature = input
        normalized = self._feature_normalize(input, test=True)
        encoded = self._encode_feature(normalized, test=True)

        # Pad axis 1 up to a multiple of 128; pad is always in 1..128.
        pad = 128 - encoded.shape[1] % 128
        encoded = numpy.pad(encoded, [(0, 0), (0, pad)], mode='minimum')

        to_batch = partial(chainer.dataset.convert.concat_examples,
                           device=self.gpu,
                           padding=0)
        batch = to_batch([encoded])

        with chainer.using_config('train', False):
            predicted = self.model(batch).data[0]

        if self.gpu is not None:
            predicted = chainer.cuda.to_cpu(predicted)
        predicted = predicted[:, :-pad]

        # The voiced flags come from the input before denormalising.
        decoded = self._decode_feature(predicted, test=True)
        decoded = AcousticFeature(
            f0=decoded.f0,
            spectrogram=decoded.spectrogram,
            aperiodicity=decoded.aperiodicity,
            mfcc=decoded.mfcc,
            voiced=input_feature.voiced,
        )

        # The aperiodicity comes from the input after denormalising.
        restored = self._feature_denormalize(decoded, test=True)
        restored = AcousticFeature(
            f0=restored.f0,
            spectrogram=restored.spectrogram,
            aperiodicity=input_feature.aperiodicity,
            mfcc=restored.mfcc,
            voiced=restored.voiced,
        )

        fft_size = pyworld.get_cheaptrick_fft_size(out_sampling_rate)
        spectrogram = pysptk.mc2sp(
            restored.mfcc,
            alpha=self._param.acoustic_feature_param.alpha,
            fftlen=fft_size,
        )

        result = AcousticFeature(
            f0=restored.f0,
            spectrogram=spectrogram,
            aperiodicity=restored.aperiodicity,
            mfcc=restored.mfcc,
            voiced=restored.voiced,
        )
        return result.astype(numpy.float64)
Esempio n. 20
0
 def get_sizes(sampling_rate: int, order: int):
     """Return the per-frame width of each feature field (f0, sp, ap, mc, voiced).

     ``sp`` and ``ap`` have ``fft_size // 2 + 1`` bins for the CheapTrick
     FFT size of ``sampling_rate``; ``mc`` has ``order + 1`` coefficients.
     """
     half_spectrum = pyworld.get_cheaptrick_fft_size(fs=sampling_rate) // 2 + 1
     sizes = dict(f0=1)
     sizes['sp'] = half_spectrum
     sizes['ap'] = half_spectrum
     sizes['mc'] = order + 1
     sizes['voiced'] = 1
     return sizes
Esempio n. 21
0
def world_decode_mc(mc, fs):
    """Convert mel-cepstrum ``mc`` to a spectrum with ``pysptk``.

    The FFT length is the CheapTrick size for ``fs`` and alpha is
    ``pysptk.util.mcepalpha(fs)``.
    """
    fft_size = pyworld.get_cheaptrick_fft_size(fs)
    warp = pysptk.util.mcepalpha(fs)
    return pysptk.conversion.mc2sp(mc, warp, fft_size)
Esempio n. 22
0
def test_vc_from_path(model, path, data_mean, data_std, diffvc=True):
    """Run voice conversion on the wav file at ``path``.

    Args:
        model: Conversion model; called as ``model(mc_scaled, R)`` and
            expected to return ``(y_hat, y_hat_static)``.
        path: Wav file read with ``wavfile.read``.
        data_mean, data_std: Statistics used to scale the input and to
            inverse-scale the static prediction.
        diffvc: When true, synthesise with an MLSA filter driven by the
            differential mel-cepstrum; otherwise use WORLD synthesis.

    Returns:
        Tuple ``(waveform, inputs, outputs)`` where ``inputs`` is the static
        part of the source mel-cepstrum and ``outputs`` the denormalised
        static prediction.
    """
    model.eval()

    fs, x = wavfile.read(path)
    # Derive the hop size from this file's sampling rate instead of relying
    # on a module-level ``hop_length`` that may belong to a different rate.
    hop_length = int(fs * (hp.frame_period * 0.001))
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    # Split off the 0th coefficient; it is re-attached before synthesis.
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]

    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)

    # Apply model
    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)
    y_hat, y_hat_static = model(mc_scaled, R)
    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    mc_static_pred = P.inv_scale(mc_static_pred, data_mean[:static_dim],
                                 data_std[:static_dim])

    outputs = mc_static_pred.copy()

    if diffvc:
        # Differential mel-cepstrum: prediction minus source.
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(mc.astype(np.float64),
                                   alpha=alpha,
                                   fftlen=fftlen)
        waveform = pyworld.synthesize(f0, spectrogram, aperiodicity, fs,
                                      hp.frame_period)

    return waveform, inputs, outputs
    def convert_to_feature(self, input: AcousticFeature, out_sampling_rate: Optional[int] = None):
        """Convert ``input`` with the model and return a float64 feature.

        The input is normalised, encoded, padded along axis 1 to a multiple
        of 128 and run through the model in inference mode. The result is
        decoded and denormalised, with ``voiced`` and ``aperiodicity`` taken
        from the input. The spectrogram is rebuilt from the converted mfcc at
        the FFT size for ``out_sampling_rate``, which defaults to the sample
        rate in the config.
        """
        if out_sampling_rate is None:
            out_sampling_rate = self.config.dataset.param.voice_param.sample_rate

        input_feature = input
        model_input = self._encode_feature(
            self._feature_normalize(input, test=True),
            test=True,
        )

        # pad is always in 1..128, so ``[:, :-pad]`` below removes exactly it.
        pad = 128 - model_input.shape[1] % 128
        model_input = numpy.pad(model_input, [(0, 0), (0, pad)], mode='minimum')

        concat = partial(chainer.dataset.convert.concat_examples, device=self.gpu, padding=0)
        batch = concat([model_input])

        with chainer.using_config('train', False):
            raw = self.model(batch).data[0]

        if self.gpu is not None:
            raw = chainer.cuda.to_cpu(raw)
        raw = raw[:, :-pad]

        feature = self._decode_feature(raw, test=True)
        # Use the input's voiced flags for denormalisation.
        feature = AcousticFeature(
            f0=feature.f0,
            spectrogram=feature.spectrogram,
            aperiodicity=feature.aperiodicity,
            mfcc=feature.mfcc,
            voiced=input_feature.voiced,
        )
        feature = self._feature_denormalize(feature, test=True)
        # Keep the input's aperiodicity in the result.
        feature = AcousticFeature(
            f0=feature.f0,
            spectrogram=feature.spectrogram,
            aperiodicity=input_feature.aperiodicity,
            mfcc=feature.mfcc,
            voiced=feature.voiced,
        )

        fft_size = pyworld.get_cheaptrick_fft_size(out_sampling_rate)
        rebuilt_spectrogram = pysptk.mc2sp(
            feature.mfcc,
            alpha=self._param.acoustic_feature_param.alpha,
            fftlen=fft_size,
        )

        return AcousticFeature(
            f0=feature.f0,
            spectrogram=rebuilt_spectrogram,
            aperiodicity=feature.aperiodicity,
            mfcc=feature.mfcc,
            voiced=feature.voiced,
        ).astype(numpy.float64)
Esempio n. 24
0
def collect_features(x, fs):
    """Extract mel-cepstrum features from waveform ``x`` sampled at ``fs``.

    F0 is estimated with DIO and refined with StoneMask, the spectral
    envelope is estimated with CheapTrick, and the envelope is converted to
    mel-cepstrum of order 25 using ``pysptk.util.mcepalpha(fs)`` as alpha.

    Returns:
        The mel-cepstrum array returned by ``pysptk.sp2mc``.
    """
    alpha = pysptk.util.mcepalpha(fs)
    order = 25
    frame_period = 5

    x = x.astype(np.float64)
    _f0, _timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(x, _f0, _timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, _timeaxis, fs)
    return pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
Esempio n. 25
0
    def __call__(self, data: Wave, test):
        """Build a validated low/high spectrogram pair from a wave.

        "high" is the spectrogram of the extracted acoustic feature; "low"
        is rebuilt from that feature's mfcc field.

        NOTE(review): ``test`` is not used; the acoustic feature process is
        always called with ``test=True``. Confirm this is intended.
        """
        extracted = self._acoustic_feature_process(data, test=True)
        acoustic_feature = extracted.astype_only_float(self._dtype)
        high = acoustic_feature.spectrogram

        fft_size = pyworld.get_cheaptrick_fft_size(data.sampling_rate)
        low = pysptk.mc2sp(
            acoustic_feature.mfcc,
            alpha=self._alpha,
            fftlen=fft_size,
        )

        feature = LowHighSpectrogramFeature(low=low, high=high)
        feature.validate()
        return feature
Esempio n. 26
0
    def create_synthesizer(
        self,
        buffer_size: int,
        number_of_pointers: int,
    ):
        """Create and initialise the native WORLD synthesizer.

        Must only be called while ``self._synthesizer`` is still ``None``
        (checked with an assertion). The new synthesizer is stored on
        ``self._synthesizer``.
        """
        assert self._synthesizer is None

        synthesizer = structures.WorldSynthesizer()
        self._synthesizer = synthesizer
        apidefinitions._InitializeSynthesizer(
            self.out_sampling_rate,
            self.acoustic_param.frame_period,
            pyworld.get_cheaptrick_fft_size(self.out_sampling_rate),
            buffer_size,
            number_of_pointers,
            synthesizer,
        )
Esempio n. 27
0
    def __call__(self, data: Wave, test):
        """Return a ``LowHighSpectrogramFeature`` computed from ``data``.

        The high spectrogram is taken from the acoustic feature directly;
        the low one is converted from its mfcc at the CheapTrick FFT size
        for ``data.sampling_rate``. The feature is validated before return.

        NOTE(review): the ``test`` argument is ignored; ``test=True`` is
        always passed to the acoustic feature process. Verify with callers.
        """
        acoustic_feature = self._acoustic_feature_process(
            data, test=True,
        ).astype_only_float(self._dtype)
        high_spectrogram = acoustic_feature.spectrogram

        low_spectrogram = pysptk.mc2sp(
            acoustic_feature.mfcc,
            alpha=self._alpha,
            fftlen=pyworld.get_cheaptrick_fft_size(data.sampling_rate),
        )

        result = LowHighSpectrogramFeature(
            low=low_spectrogram,
            high=high_spectrogram,
        )
        result.validate()
        return result
Esempio n. 28
0
    def __init__(self, config_parser):
        """Read the audio settings from the config and derive related values.

        Reads ``sampling_rate``, ``frame_period`` and ``has_delta``, then
        sets the delta windows, FFT length, alpha and hop length.
        """
        super().__init__(config_parser)

        self.sampling_rate = self.to_int(self.get_value('sampling_rate'))
        self.frame_period = self.to_int(self.get_value('frame_period'))
        self.has_delta = self.to_bool(self.get_value('has_delta'))

        # The static window is always present; the two extra windows are
        # added only when delta features are enabled.
        windows = [(0, 0, np.array([1.0]))]
        if self.has_delta:
            windows.append((1, 1, np.array([-0.5, 0.0, 0.5])))
            windows.append((1, 1, np.array([1.0, -2.0, 1.0])))
        self.window = windows

        self.fft_length = pyworld.get_cheaptrick_fft_size(self.sampling_rate)
        self.alpha = pysptk.util.mcepalpha(self.sampling_rate)
        # Samples per frame; the 0.001 factor presumably converts a frame
        # period given in milliseconds to seconds (TODO confirm units).
        self.hop_length = int(self.sampling_rate * 0.001 *
                              self.frame_period)
Esempio n. 29
0
def worldDecodeSpectralEnvelop(coded_sp: np.ndarray,
                               fs: int = SAMPLE_RATE) -> np.ndarray:
    '''
    Convert MCEPs back into a spectral envelope.

    Parameters
    ----------
    coded_sp: np.ndarray
        MCEP data
    fs: int, default SAMPLE_RATE
        Sampling frequency

    Returns
    -------
    decoded_sp: np.ndarray
        Spectral envelope
    '''
    return pyworld.decode_spectral_envelope(
        coded_sp, fs, pyworld.get_cheaptrick_fft_size(fs))
Esempio n. 30
0
    def run_world_synth(self, synth_output, hparams):
        """Run the WORLD synthesize method and save each result as audio.

        Args:
            synth_output: Mapping from id name to the network output for it.
            hparams: Reads ``synth_fs``, ``synth_dir``, ``out_dir``,
                ``num_coded_sps``, ``model_name``, ``synth_file_suffix`` and
                ``synth_ext``.

        Files are written to ``synth_dir`` if set, otherwise ``out_dir``,
        otherwise the current directory. Directory creation and the optional
        format conversion use that same directory, so a missing
        ``synth_dir`` no longer breaks them.
        """
        fft_size = pyworld.get_cheaptrick_fft_size(hparams.synth_fs)

        if hparams.synth_dir is not None:
            save_dir = hparams.synth_dir
        elif hparams.out_dir is not None:
            save_dir = hparams.out_dir
        else:
            save_dir = os.path.curdir

        for id_name, output in synth_output.items():
            logging.info("Synthesise {} with the WORLD vocoder.".format(id_name))

            coded_sp, lf0, vuv, bap = WorldFeatLabelGen.convert_to_world_features(output, contains_deltas=False, num_coded_sps=hparams.num_coded_sps)
            ln_sp = pysptk.mgc2sp(np.ascontiguousarray(coded_sp, dtype=np.float64), alpha=WorldFeatLabelGen.mgc_alpha, gamma=0.0, fftlen=fft_size)
            # Exponentiate the real part, then square the 32768-scaled value.
            sp = np.exp(ln_sp.real)
            sp = np.power(sp.real / 32768.0, 2)
            f0 = np.exp(lf0, dtype=np.float64)
            vuv[f0 < WorldFeatLabelGen.f0_silence_threshold] = 0  # WORLD throws an error for too small f0 values.
            f0[vuv == 0] = 0.0
            ap = pyworld.decode_aperiodicity(np.ascontiguousarray(bap.reshape(-1, 1), np.float64), hparams.synth_fs, fft_size)

            waveform = pyworld.synthesize(f0, sp, ap, hparams.synth_fs)
            waveform = waveform.astype(np.float32, copy=False)  # Does inplace conversion, if possible.

            # Always save as wav file first and convert afterwards if necessary.
            # NOTE(review): the format string has three placeholders, so the
            # trailing "_WORLD" and ".wav" arguments are ignored. Kept as is
            # to preserve existing file names; confirm the intended name.
            wav_file_path = os.path.join(save_dir, "{}{}{}.wav".format(os.path.basename(id_name),
                                                                       "_" + hparams.model_name if hparams.model_name is not None else "",
                                                                       hparams.synth_file_suffix, "_WORLD", ".wav"))
            makedirs_safe(save_dir)
            soundfile.write(wav_file_path, waveform, hparams.synth_fs)

            # Use PyDub for special audio formats.
            if hparams.synth_ext.lower() != 'wav':
                as_wave = pydub.AudioSegment.from_wav(wav_file_path)
                as_wave.export(os.path.join(save_dir, id_name + "." + hparams.synth_ext), format=hparams.synth_ext)
                os.remove(wav_file_path)
def generate_file(path):
    """Compute and save the low/high spectrogram pair for one wave file.

    The result is written as ``<stem>.npy`` in ``arguments.output_directory``.
    An existing output is left untouched unless ``arguments.enable_overwrite``
    is set.
    """
    out = Path(arguments.output_directory, path.stem + '.npy')
    if out.exists() and not arguments.enable_overwrite:
        return

    # Load the wave (with padding).
    wave = WaveFileLoadProcess(
        sample_rate=arguments.sample_rate,
        top_db=arguments.top_db,
        pad_second=arguments.pad_second,
    )(path, test=True)

    # Extract the acoustic feature as float32.
    feature = AcousticFeatureProcess(
        frame_period=arguments.frame_period,
        order=arguments.order,
        alpha=arguments.alpha,
        f0_estimating_method=arguments.f0_estimating_method,
    )(wave, test=True).astype_only_float(numpy.float32)
    high = feature.spectrogram

    # The "low" spectrogram is rebuilt from the feature's mfcc field.
    fft_size = pyworld.get_cheaptrick_fft_size(arguments.sample_rate)
    low = pysptk.mc2sp(
        feature.mfcc,
        alpha=arguments.alpha,
        fftlen=fft_size,
    )

    # Save both under the keys 'low' and 'high'.
    numpy.save(out.absolute(), {'low': low, 'high': high})
Esempio n. 32
0
    def __init__(
            self,
            acoustic_feature_param: AcousticFeatureParam,
            out_sampling_rate: int,
            buffer_size: int,
            number_of_pointers: int,
    ):
        """Initialise the base class and a native WORLD synthesizer.

        ``buffer_size`` is stored on the instance. The synthesizer is
        initialised with the output sampling rate, the frame period from
        ``acoustic_feature_param``, the CheapTrick FFT size,
        ``buffer_size`` and ``number_of_pointers``.
        """
        super().__init__(
            acoustic_feature_param=acoustic_feature_param,
            out_sampling_rate=out_sampling_rate,
        )
        self.buffer_size = buffer_size

        self._synthesizer = structures.WorldSynthesizer()
        init_args = (
            self.out_sampling_rate,
            self.acoustic_feature_param.frame_period,
            pyworld.get_cheaptrick_fft_size(out_sampling_rate),
            buffer_size,
            number_of_pointers,
            self._synthesizer,
        )
        apidefinitions._InitializeSynthesizer(*init_args)

        # Starts empty; the original note says it exists to hold memory.
        self._before_buffer = []
Esempio n. 33
0
def world_decode_spectral_envelop(coded_sp, fs):
    """Decode the coded spectral envelope ``coded_sp`` into a spectral envelope.

    Uses the CheapTrick FFT size that pyworld reports for ``fs``.
    """
    fft_size = pyworld.get_cheaptrick_fft_size(fs)
    sp = pyworld.decode_spectral_envelope(coded_sp, fs, fft_size)
    return sp
Esempio n. 34
0
from pysptk.synthesis import MLSADF, Synthesizer
import librosa
import librosa.display
import IPython
from IPython.display import Audio

from os import listdir, path, makedirs, mkdir
from os.path import join, expanduser

# Training data directory, resolved under the current user's home directory.
DATA_ROOT = join(expanduser("~"), "Documents", "Hit", "GeneralResources", "VCProjectDate", "Training")
# Print the path and its contents; listdir raises if the directory is missing.
print(DATA_ROOT)
print(listdir(DATA_ROOT))


# Analysis settings shared by the rest of the script.
fs = 16000  # sampling rate
fftlen = pyworld.get_cheaptrick_fft_size(fs)  # CheapTrick FFT size for fs
alpha = pysptk.util.mcepalpha(fs)  # alpha chosen by pysptk for fs
order = 24
frame_period = 5  # presumably milliseconds, given the 0.001 factor below
hop_length = int(fs * (frame_period * 0.001))  # samples per frame
max_files = 530  # number of utterances to be used.
test_size = 0.03
use_delta = True

if use_delta:
    windows = [
        (0, 0, np.array([1.0])),
        (1, 1, np.array([-0.5, 0.0, 0.5])),
        (1, 1, np.array([1.0, -2.0, 1.0])),
    ]
else:
Esempio n. 35
0
 def decode_ap(ap: numpy.ndarray, sampling_rate: int):
     """Decode coded aperiodicity ``ap`` with pyworld.

     The input is cast to float64 and decoded at the CheapTrick FFT size
     for ``sampling_rate``.
     """
     coded = ap.astype(numpy.float64)
     fft_size = pyworld.get_cheaptrick_fft_size(sampling_rate)
     return pyworld.decode_aperiodicity(coded, sampling_rate, fft_size)
Esempio n. 36
0
 def mc2sp(mc: numpy.ndarray, sampling_rate: int, alpha: float):
     """Convert mel-cepstrum ``mc`` to a spectrum with ``pysptk.mc2sp``.

     The input is cast to float64; the FFT length is the CheapTrick size
     for ``sampling_rate``.
     """
     mel_cepstrum = mc.astype(numpy.float64)
     fft_size = pyworld.get_cheaptrick_fft_size(sampling_rate)
     return pysptk.mc2sp(mel_cepstrum, alpha=alpha, fftlen=fft_size)
Esempio n. 37
0
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    """Run voice conversion on waveform ``x`` sampled at ``fs``.

    Args:
        model: Conversion model. If ``model.include_parameter_generation()``
            is true it is called with the MLPG matrix and returns
            ``(y_hat, y_hat_static)``; otherwise MLPG is applied here with
            ``multi_stream_mlpg``.
        x: Input waveform; cast to float64.
        fs: Sampling rate of ``x``.
        data_mean, data_std: Statistics used to scale the input and to
            inverse-scale the static prediction.
        diffvc: When true, synthesise with an MLSA filter driven by the
            differential mel-cepstrum; otherwise use WORLD synthesis.

    Returns:
        Tuple ``(waveform, inputs, outputs)`` where ``inputs`` is the static
        part of the source mel-cepstrum and ``outputs`` the denormalised
        static prediction.
    """
    model.eval()

    hop_length = int(fs * (hp.frame_period * 0.001))
    x = x.astype(np.float64)

    # WORLD analysis of the source waveform.
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    # Mel-cepstrum; the 0th coefficient is split off and re-attached later.
    alpha = pysptk.util.mcepalpha(fs)
    full_mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0 = full_mc[:, 0]
    mc = full_mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]
    inputs = mc[:, :static_dim].copy()

    # Normalise and wrap for the model.
    mc_scaled = Variable(torch.from_numpy(P.scale(mc, data_mean, data_std)))
    lengths = [len(mc_scaled)]

    # Add the batch axis.
    mc_scaled = mc_scaled.view(1, -1, mc_scaled.size(-1))

    # Matrix for MLPG.
    R = torch.from_numpy(unit_variance_mlpg_matrix(hp.windows, T))

    # Apply the model.
    if model.include_parameter_generation():
        # The model performs parameter generation itself; multi-stream
        # features cannot be used in this case.
        y_hat, y_hat_static = model(mc_scaled, R, lengths=lengths)
    else:
        # Generic model (may be a sequence model): run MLPG here.
        assert hp.has_dynamic_features is not None
        y_hat = model(mc_scaled, lengths=lengths)
        y_hat_static = multi_stream_mlpg(
            y_hat, R, hp.stream_sizes, hp.has_dynamic_features)

    predicted = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalise.
    predicted = P.inv_scale(
        predicted, data_mean[:static_dim], data_std[:static_dim])
    outputs = predicted.copy()

    if diffvc:
        # Differential mel-cepstrum: prediction minus source.
        predicted = predicted - mc[:, :static_dim]
        mc = np.hstack((c0[:, None], predicted))
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        mc = np.hstack((c0[:, None], predicted))
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs