コード例 #1
0
ファイル: acoustic_converter.py プロジェクト: BURI55/yukarin
    def convert(self, in_feature: AcousticFeature):
        """Run the conversion model on ``in_feature`` and rebuild a full feature.

        The encoded input is padded along axis 1 up to the next multiple of
        128, passed through ``self.model`` with chainer's ``train`` config
        disabled, and the padding is stripped from the model output again.
        Aperiodicity and voiced flags are copied from ``in_feature``, F0 is
        zeroed where the feature is not voiced, and the spectral envelope is
        regenerated from the converted mel-cepstrum.

        Returns the result of ``astype_only_float(numpy.float64)`` on the
        decoded feature.
        """
        encoded = self._encode_feature(in_feature)

        # Always in 1..128 (a full extra block when already aligned), so the
        # ``[:-pad_width]`` slice below never degenerates to ``[:-0]``.
        pad_width = 128 - encoded.shape[1] % 128
        padded = numpy.pad(encoded, [(0, 0), (0, pad_width)], mode='minimum')

        batch = chainer.dataset.convert.concat_examples(
            [padded], device=self.gpu, padding=0)

        with chainer.using_config('train', False):
            predicted = self.model(batch).data[0]

        if self.gpu is not None:
            predicted = chainer.cuda.to_cpu(predicted)
        predicted = predicted[:, :-pad_width]

        converted = self._decode_feature(predicted)
        converted.ap = in_feature.ap
        converted.voiced = in_feature.voiced
        converted.f0[~converted.voiced] = 0

        fft_size = pyworld.get_cheaptrick_fft_size(self.out_sampling_rate)
        converted.sp = pysptk.mc2sp(
            converted.mc,
            alpha=self._param.alpha,
            fftlen=fft_size,
        )

        return converted.astype_only_float(numpy.float64)
コード例 #2
0
def synthesis(f0, mcep, ap, r=None, alpha=0.42):
    """Synthesize a waveform with WORLD from F0, mel-cepstrum and aperiodicity.

    When ``r`` is given, ``mcep`` is first replaced by ``mod_p(mcep, r)``.
    The spectral envelope is computed with an FFT length of 1024 and the
    waveform is synthesized with a sampling rate of 16000 and
    ``frame_period=5``; these values are fixed in this function.
    """
    source_mcep = mcep if r is None else mod_p(mcep, r)
    envelope = pysptk.mc2sp(source_mcep, alpha, 1024)
    return pyworld.synthesize(f0, envelope, ap, 16000, frame_period=5)
コード例 #3
0
    def generate(self, parm_var, do_postfilter=True):
        """Synthesize one ``.wav`` file into ``self.out_dir`` per entry of ``self.paths``.

        For each path the acoustic parameters come from
        ``self._generate_parameters(path, parm_var)``. The mel-cepstrum is
        optionally passed through ``merlin_post_filter`` and the waveform is
        produced by the WORLD synthesizer using the settings stored in
        ``self.analysis_config``. Progress is printed to stdout.
        """
        cfg = self.analysis_config

        for src_path in self.paths:
            file_id = splitext(basename(src_path))[0]
            print('Synthesizing %s ... ' % (file_id), end='')
            mgc, lf0, vuv, bap = self._generate_parameters(src_path, parm_var)

            if do_postfilter:
                mgc = merlin_post_filter(mgc, cfg.alpha)

            envelope = pysptk.mc2sp(mgc, fftlen=cfg.fft_length, alpha=cfg.alpha)
            aperiodicity = pyworld.decode_aperiodicity(
                bap.astype(np.float64), cfg.sampling_rate, cfg.fft_length)
            f0 = self._lf0_to_f0(lf0, vuv)

            # WORLD is fed float64 arrays throughout, with F0 flattened to 1-D.
            waveform = pyworld.synthesize(
                f0.flatten().astype(np.float64),
                envelope.astype(np.float64),
                aperiodicity.astype(np.float64),
                cfg.sampling_rate,
                cfg.frame_period,
            )

            wav_path = join(self.out_dir, file_id + '.wav')
            with open(wav_path, 'wb') as wav_file:
                wav_file.write(Audio(waveform, rate=cfg.sampling_rate).data)
            print('done!')
コード例 #4
0
def generate_file(path):
    """Extract a low/high spectrogram pair from one wave file and save it.

    The output goes to ``<arguments.output_directory>/<path.stem>.npy``. When
    that file already exists and ``arguments.enable_overwrite`` is false the
    function returns without doing anything. All settings are read from the
    module-level ``arguments`` object.
    """
    destination = Path(arguments.output_directory, path.stem + '.npy')
    already_done = destination.exists() and not arguments.enable_overwrite
    if already_done:
        return

    # Load the wave (with padding, per the loader's settings).
    loader = WaveFileLoadProcess(
        sample_rate=arguments.sample_rate,
        top_db=arguments.top_db,
        pad_second=arguments.pad_second,
    )
    wave = loader(path, test=True)

    # Extract acoustic features as float32.
    extractor = AcousticFeatureProcess(
        frame_period=arguments.frame_period,
        order=arguments.order,
        alpha=arguments.alpha,
        f0_estimating_method=arguments.f0_estimating_method,
    )
    feature = extractor(wave, test=True).astype_only_float(numpy.float32)

    # "high" is the extracted spectrogram; "low" is rebuilt from the
    # mel-cepstrum stored in ``feature.mfcc``.
    high = feature.spectrogram
    fft_size = pyworld.get_cheaptrick_fft_size(arguments.sample_rate)
    low = pysptk.mc2sp(feature.mfcc, alpha=arguments.alpha, fftlen=fft_size)

    # The pair is stored as a dict inside a single .npy file.
    numpy.save(destination.absolute(), {'low': low, 'high': high})
コード例 #5
0
def generate_file(path):
    """Write the low/high spectrogram pair of one wave file to a ``.npy`` file.

    Nothing is done when the target file exists and overwriting is not
    enabled via ``arguments.enable_overwrite``. Configuration comes from the
    module-level ``arguments`` object.
    """
    target = Path(arguments.output_directory, path.stem + '.npy')
    if target.exists() and not arguments.enable_overwrite:
        return

    # Step 1: load the wave file (the loader also applies padding).
    load_wave = WaveFileLoadProcess(
        sample_rate=arguments.sample_rate,
        top_db=arguments.top_db,
        pad_second=arguments.pad_second,
    )
    wave = load_wave(path, test=True)

    # Step 2: compute acoustic features and cast float fields to float32.
    make_feature = AcousticFeatureProcess(
        frame_period=arguments.frame_period,
        order=arguments.order,
        alpha=arguments.alpha,
        f0_estimating_method=arguments.f0_estimating_method,
    )
    acoustic = make_feature(wave, test=True).astype_only_float(numpy.float32)

    # Step 3: pair the extracted spectrogram with one rebuilt from the
    # mel-cepstrum.
    payload = {}
    payload_high = acoustic.spectrogram
    payload['low'] = pysptk.mc2sp(
        acoustic.mfcc,
        alpha=arguments.alpha,
        fftlen=pyworld.get_cheaptrick_fft_size(arguments.sample_rate),
    )
    payload['high'] = payload_high

    # Step 4: save both under the keys 'low' and 'high'.
    numpy.save(target.absolute(), payload)
コード例 #6
0
 def extract_spectrum(self, spectrum_len=None, Synthesizer=None):
     """Convert ``self.data`` (mel-cepstrum) into a spectrum via ``pysptk.mc2sp``.

     ``spectrum_len`` is the number of spectrum bins; the FFT length passed to
     ``mc2sp`` is ``(spectrum_len - 1) * 2``. When it is omitted it is taken
     from ``Synthesizer.fs_spectrum_len(self.fs)``, with ``Synthesizer``
     defaulting to ``kwiiyatta.Synthesizer``.
     """
     if spectrum_len is None:
         synth = kwiiyatta.Synthesizer if Synthesizer is None else Synthesizer
         spectrum_len = synth.fs_spectrum_len(self.fs)
     fft_size = (spectrum_len - 1) * 2
     return pysptk.mc2sp(self.data, fftlen=fft_size, alpha=self.alpha())
コード例 #7
0
 def __test(order, alpha, fftlen):
     """Round-trip a random spectrum through ``sp2mc``/``mc2sp`` and compare.

     The random generator is seeded so the check is reproducible.
     """
     np.random.seed(98765)
     n_bins = int(fftlen // 2 + 1)
     reference_sp = np.random.rand(n_bins)
     mel_cepstrum = pysptk.sp2mc(reference_sp, order, alpha)
     recovered_sp = pysptk.mc2sp(mel_cepstrum, alpha, fftlen)
     # TODO: tolerance should be more carefully chosen
     assert np.allclose(reference_sp, recovered_sp, atol=0.9)
コード例 #8
0
ファイル: evaluation_tts.py プロジェクト: shamanez/gantts
def gen_waveform(y_predicted, Y_mean, Y_std, post_filter=False, coef=1.4,
                 fs=16000, mge_training=True):
    """Generate a waveform with the WORLD vocoder from predicted features.

    Args:
        y_predicted: Predicted acoustic features, forwarded to
            ``gen_parameters``.
        Y_mean: Forwarded to ``gen_parameters`` (presumably the
            normalization mean -- confirm against ``gen_parameters``).
        Y_std: Forwarded to ``gen_parameters`` (presumably the
            normalization standard deviation -- confirm likewise).
        post_filter (bool): Apply ``merlin_post_filter`` to the mel-cepstrum.
        coef (float): ``coef`` argument of the post filter.
        fs (int): Sampling rate; also selects ``alpha`` and the FFT length.
        mge_training (bool): Forwarded to ``gen_parameters``.

    Returns:
        tuple: ``(generated_waveform, mgc, lf0, vuv, bap)``. The waveform is
        scaled so that its absolute peak is 32767 (int16 range) but keeps
        its floating dtype. An all-zero waveform is returned unscaled.
    """
    alpha = pysptk.util.mcepalpha(fs)
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    frame_period = hp_acoustic.frame_period

    # Generate parameters and split streams
    mgc, lf0, vuv, bap = gen_parameters(y_predicted, Y_mean, Y_std, mge_training)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha, coef=coef)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), fs, fftlen)

    # Log-F0 -> F0: frames flagged unvoiced become 0, the rest are
    # exponentiated.
    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    voiced = np.nonzero(f0)
    f0[voiced] = np.exp(f0[voiced])

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            fs, frame_period)

    # Convert range to int16. Skip the division for a silent waveform, which
    # would otherwise turn every sample into NaN.
    peak = np.max(np.abs(generated_waveform))
    if peak > 0:
        generated_waveform = generated_waveform / peak * 32767

    # return features as well to compare natural/generated later
    return generated_waveform, mgc, lf0, vuv, bap
コード例 #9
0
def world2wav(feature, frame_period):
    """Synthesize a waveform with WORLD from a stacked feature matrix.

    The columns of ``feature`` are split, in this order, into mgc, lf0, vuv
    and bap streams whose widths are ``hp.num_mgc``, ``hp.num_lf0``,
    ``hp.num_vuv`` and ``hp.num_bap``. The sampling rate is
    ``hp.sample_rate``.

    Args:
        feature: 2-D array of per-frame features (frames along axis 0).
        frame_period: Frame period passed to ``pyworld.synthesize``.

    Returns:
        The waveform returned by ``pyworld.synthesize``.
    """
    hparams = hp
    mgc_idx = 0
    lf0_idx = mgc_idx + hparams.num_mgc
    vuv_idx = lf0_idx + hparams.num_lf0
    bap_idx = vuv_idx + hparams.num_vuv

    mgc = feature[:, mgc_idx:mgc_idx + hparams.num_mgc]
    lf0 = feature[:, lf0_idx:lf0_idx + hparams.num_lf0]
    vuv = feature[:, vuv_idx:vuv_idx + hparams.num_vuv]
    # Copy: bap is modified in place below, and a plain slice of a NumPy
    # array would share memory with the caller's ``feature``.
    bap = feature[:, bap_idx:bap_idx + hparams.num_bap].copy()

    fs = hparams.sample_rate
    alpha = pysptk.util.mcepalpha(fs)
    fftlen = pyworld.get_cheaptrick_fft_size(fs)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)

    # Zero the band aperiodicity of frames flagged as unvoiced.
    indexes = (vuv < 0.5).flatten()
    bap[indexes] = np.zeros(hparams.num_bap)

    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), fs,
                                               fftlen)

    # Log-F0 -> F0: unvoiced frames become 0, the rest are exponentiated.
    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    voiced = np.nonzero(f0)
    f0[voiced] = np.exp(f0[voiced])

    return pyworld.synthesize(f0.flatten().astype(np.float64),
                              spectrogram.astype(np.float64),
                              aperiodicity.astype(np.float64), fs,
                              frame_period)
コード例 #10
0
ファイル: predict.py プロジェクト: tkm2261/dnn-voice-changer
def generate_changed_voice(model, input_path):
    """Convert the voice in a wav file with ``model`` and return the waveform.

    The file is analysed with WORLD (DIO + StoneMask F0, CheapTrick spectral
    envelope, D4C aperiodicity). The mel-cepstrum without its 0th
    coefficient is smoothed, extended with delta features and fed to
    ``model.predict``. The prediction is recombined with the original 0th
    coefficient and resynthesized using the original F0 and aperiodicity.
    """
    fs, samples = wavfile.read(input_path)
    samples = samples.astype(np.float64)
    if samples.ndim > 1:
        # Multi-channel input: average the channels.
        samples = samples.mean(axis=1)

    # WORLD analysis.
    f0, timeaxis = pyworld.dio(samples, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(samples, f0, timeaxis, fs)
    envelope = pyworld.cheaptrick(samples, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(samples, f0, timeaxis, fs)

    alpha = pysptk.util.mcepalpha(fs)
    mcep = pysptk.sp2mc(envelope, order=hp.order, alpha=alpha)
    c0 = mcep[:, 0]
    features = mcep[:, 1:]

    features = P.modspec_smoothing(features, FS / HOP_LENGHT, cutoff=50)
    features = P.delta_features(features, hp.windows).astype(np.float32)

    predicted = model.predict(features)

    # Put the untouched 0th coefficient back in front of the prediction.
    converted_mcep = np.hstack([c0.reshape((-1, 1)), predicted])

    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    converted_envelope = pysptk.mc2sp(
        converted_mcep.astype(np.float64), alpha=alpha, fftlen=fftlen)
    return pyworld.synthesize(
        f0, converted_envelope, aperiodicity, fs, hp.frame_period)
コード例 #11
0
def gen_waveform(y_predicted,
                 Y_mean,
                 Y_std,
                 post_filter=False,
                 coef=1.4,
                 fs=16000,
                 mge_training=True):
    """Generate a waveform with the WORLD vocoder from predicted features.

    Args:
        y_predicted: Predicted acoustic features, forwarded to
            ``gen_parameters``.
        Y_mean: Forwarded to ``gen_parameters`` (presumably the
            normalization mean -- confirm against ``gen_parameters``).
        Y_std: Forwarded to ``gen_parameters`` (presumably the
            normalization standard deviation -- confirm likewise).
        post_filter (bool): Apply ``merlin_post_filter`` to the mel-cepstrum.
        coef (float): ``coef`` argument of the post filter.
        fs (int): Sampling rate; also selects ``alpha`` and the FFT length.
        mge_training (bool): Forwarded to ``gen_parameters``.

    Returns:
        tuple: ``(generated_waveform, mgc, lf0, vuv, bap)``. The waveform is
        scaled so that its absolute peak is 32767 (int16 range) but keeps
        its floating dtype. An all-zero waveform is returned unscaled.
    """
    alpha = pysptk.util.mcepalpha(fs)
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    frame_period = hp_acoustic.frame_period

    # Generate parameters and split streams
    mgc, lf0, vuv, bap = gen_parameters(y_predicted, Y_mean, Y_std,
                                        mge_training)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha, coef=coef)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), fs,
                                               fftlen)

    # Log-F0 -> F0: frames flagged unvoiced become 0, the rest are
    # exponentiated.
    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    voiced = np.nonzero(f0)
    f0[voiced] = np.exp(f0[voiced])

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            fs, frame_period)

    # Convert range to int16. Skip the division for a silent waveform, which
    # would otherwise turn every sample into NaN.
    peak = np.max(np.abs(generated_waveform))
    if peak > 0:
        generated_waveform = generated_waveform / peak * 32767

    # return features as well to compare natural/generated later
    return generated_waveform, mgc, lf0, vuv, bap
コード例 #12
0
    def synthesis(self, feat, se_kind='sp'):
        """Synthesize a batch of waveforms with WORLD from a feature dict.

        ``feat`` must provide the tensors ``'ap'`` and ``'f0'`` plus either
        ``'mcc'`` (when ``se_kind == 'mcc'``, converted to a spectral
        envelope with ``pysptk.mc2sp``) or ``'sp'`` (any other ``se_kind``).
        Each batch item is synthesized separately on the CPU in float64.

        Returns the waveforms concatenated along a new leading dimension,
        moved to the device of ``feat['ap']`` and divided by
        ``MAX_WAV_VALUE``.
        """
        n_items = feat['ap'].size(0)
        target_device = feat['ap'].device

        def to_frames(tensor):
            # Transposed float64 NumPy copy of a single batch item.
            return tensor.detach().t().cpu().double().numpy()

        rows = []
        for idx in range(n_items):
            ap = to_frames(feat['ap'][idx])
            f0 = feat['f0'][idx].detach().view(-1).cpu().double().numpy()
            if se_kind == 'mcc':
                mcc = to_frames(feat['mcc'][idx])
                sp = pysptk.mc2sp(mcc.copy(order='C'), self.mcc_alpha,
                                  self.fft_size)
            else:
                sp = to_frames(feat['sp'][idx])

            # Arrays are copied into C order before being handed to pyworld.
            wave = pyworld.synthesize(f0.copy(order='C'),
                                      sp.copy(order='C'),
                                      ap.copy(order='C'),
                                      self.fs,
                                      frame_period=self.shiftms)
            rows.append(torch.from_numpy(wave).float().view(-1).unsqueeze(0))

        batch = torch.cat(rows, dim=0).to(target_device)
        return batch / MAX_WAV_VALUE
コード例 #13
0
ファイル: maptask.py プロジェクト: ErikEkstedt/maptaskdataset
def vizualize_hardcoded(x, mgc, lf0, f0, vuv, fs, timeaxis):
    """Plot waveform, spectral envelope, F0, V/UV flags and aperiodicity.

    Five stacked subplots are drawn and shown with ``plt.show()``. Besides
    its arguments this function reads the names ``alpha``, ``fftlen``,
    ``hop_length`` and ``bap`` from the enclosing (module) scope. ``lf0``
    and ``timeaxis`` are accepted but not used.
    """
    plt.subplot(5, 1, 1)
    plt.plot(x, label="Wav")
    plt.xlim(0, len(x))
    # Spectral envelope rebuilt from the first 60 mgc columns
    plt.subplot(5, 1, 2)
    sp = pysptk.mc2sp(mgc[:, :60], alpha=alpha, fftlen=fftlen)
    logsp = np.log(sp)
    librosa.display.specshow(logsp.T,
                             sr=fs,
                             hop_length=hop_length,
                             x_axis="time",
                             y_axis="linear")
    # F0
    plt.subplot(5, 1, 3)
    plt.plot(f0, linewidth=2, label="Continuous log-f0")
    plt.xlim(0, len(f0))
    # Voiced/unvoiced flag
    plt.subplot(5, 1, 4)
    plt.plot(vuv, linewidth=2, label="Voiced/unvoiced flag")
    plt.xlim(0, len(vuv))
    plt.legend(prop={"size": 14}, loc="upper right")
    # aperiodicity
    plt.subplot(5, 1, 5)
    # ``bap`` is not a parameter. Assigning to the name ``bap`` inside this
    # function would make it a local for the whole body, so reading it
    # first would raise UnboundLocalError. Keep the trimmed float64 copy
    # under a different name instead.
    coded_ap = np.ascontiguousarray(bap[:, :2]).astype(np.float64)
    aperiodicity = pyworld.decode_aperiodicity(coded_ap, fs, fftlen)
    librosa.display.specshow(aperiodicity.T,
                             sr=fs,
                             hop_length=hop_length,
                             x_axis="time",
                             y_axis="linear")
    plt.show()
コード例 #14
0
ファイル: gen.py プロジェクト: r9y9/nnsvs
def gen_world_params(mgc, lf0, vuv, bap, sample_rate, vuv_threshold=0.3):
    """Turn mgc/lf0/vuv/bap streams into float64 WORLD synthesis parameters.

    Args:
        mgc (ndarray): mel-generalized cepstrum
        lf0 (ndarray): log-F0
        vuv (ndarray): voiced/unvoiced values, compared to ``vuv_threshold``
        bap (ndarray): band aperiodicity
        sample_rate (int): sampling rate
        vuv_threshold (float): frames whose vuv is below this are unvoiced

    Returns:
        tuple: ``(f0, spectrogram, aperiodicity)``, all float64; ``f0`` is
        flattened.
    """
    fftlen = pyworld.get_cheaptrick_fft_size(sample_rate)
    alpha = pysptk.util.mcepalpha(sample_rate)

    spectrogram = pysptk.mc2sp(
        np.ascontiguousarray(mgc), fftlen=fftlen, alpha=alpha)

    coded_ap = np.ascontiguousarray(bap).astype(np.float64)
    aperiodicity = pyworld.decode_aperiodicity(coded_ap, sample_rate, fftlen)

    # Unvoiced frames get an aperiodicity of exactly one.
    unvoiced_frames = vuv.reshape(-1) < vuv_threshold
    aperiodicity[unvoiced_frames, :] = 1.0
    # WORLD fails catastrophically for out of range aperiodicity
    aperiodicity = np.clip(aperiodicity, 0.0, 1.0)

    # Exponentiate non-zero log-F0 entries, then zero the unvoiced ones.
    f0 = lf0.copy()
    nonzero = np.nonzero(f0)
    f0[nonzero] = np.exp(f0[nonzero])
    f0[vuv < vuv_threshold] = 0

    return (
        f0.flatten().astype(np.float64),
        spectrogram.astype(np.float64),
        aperiodicity.astype(np.float64),
    )
コード例 #15
0
    def gen_waveform(self, feature):
        """Synthesize a waveform with WORLD from a stacked feature matrix.

        The columns of ``feature`` are laid out as ``mcep_order + 1``
        mel-cepstrum columns, one log-F0 column, one voiced/unvoiced column
        and the remaining band-aperiodicity columns. All settings come from
        ``self.config``.
        """
        cfg = self.config
        mcep_dim = cfg['mcep_order'] + 1
        vuv_col = mcep_dim + 1

        mgc = feature[:, :mcep_dim]
        lf0 = feature[:, mcep_dim:vuv_col]
        vuv = feature[:, vuv_col:vuv_col + 1]
        bap = feature[:, vuv_col + 1:]

        spectrogram = pysptk.mc2sp(
            mgc,
            fftlen=cfg['fft_size'],
            alpha=pysptk.util.mcepalpha(cfg['sampling_rate']),
        )
        aperiodicity = pyworld.decode_aperiodicity(
            bap.astype(np.float64), cfg['sampling_rate'], cfg['fft_size'])

        # Log-F0 -> F0: unvoiced frames become 0, the rest are exponentiated.
        f0 = lf0.copy()
        f0[vuv < 0.5] = 0
        voiced = np.nonzero(f0)
        f0[voiced] = np.exp(f0[voiced])

        return pyworld.synthesize(
            f0.flatten().astype(np.float64),
            spectrogram.astype(np.float64),
            aperiodicity.astype(np.float64),
            cfg['sampling_rate'],
            cfg['hop_size_in_ms'],
        )
コード例 #16
0
ファイル: test_world.py プロジェクト: patrick-g-zhang/merlin
def synthesis():
    """Resynthesize ``p225_001`` from its lf0/bap/mgc files into a wav file.

    The three binary feature files are read from the current directory, fed
    to WORLD with an FFT length of 4096 at 48000 Hz, and the result is
    written to ``resynthesis.wav`` as int16.
    """
    lf0_file = "p225_001.lf0"
    bap_file_name = "p225_001.bap"
    mgc_file_name = "p225_001.mgc"
    fl = 4096
    sr = 48000
    bap_dim = 5

    # -1E+10 marks frames without F0 in the lf0 file; they become 0.
    lf0 = read_binfile(lf0_file, dim=1, dtype=np.float32)
    no_pitch = lf0 == -1E+10
    has_pitch = ~no_pitch
    f0 = lf0.copy()
    f0[no_pitch] = 0
    f0[has_pitch] = np.exp(lf0[has_pitch])
    f0 = f0.astype(np.float64)

    bap = read_binfile(bap_file_name, dim=bap_dim, dtype=np.float32)
    coded_ap = bap.astype(np.float64).reshape(-1, bap_dim)
    ap = pyworld.decode_aperiodicity(coded_ap, sr, fl)

    mc = read_binfile(mgc_file_name, dim=60, dtype=np.float32)
    alpha = pysptk.util.mcepalpha(sr)
    sp = pysptk.mc2sp(mc.astype(np.float64), fftlen=fl, alpha=alpha)

    wav = pyworld.synthesize(f0, sp, ap, sr, 5)
    pcm = (wav * 32768).astype(np.int16)
    scipy.io.wavfile.write("resynthesis.wav", sr, pcm)
コード例 #17
0
 def decode_spectrogram(self, feature: AcousticFeature):
     """Fill ``feature.sp`` from ``feature.mc`` and return the same object.

     The mel-cepstrum is cast to float32 before conversion; ``alpha`` and
     the FFT length are both derived from ``self.out_sampling_rate``.
     """
     rate = self.out_sampling_rate
     fft_size = pyworld.get_cheaptrick_fft_size(rate)
     mel_cepstrum = feature.mc.astype(numpy.float32)
     warp = pysptk.util.mcepalpha(rate)
     feature.sp = pysptk.mc2sp(mel_cepstrum, alpha=warp, fftlen=fft_size)
     return feature
コード例 #18
0
def gen_waveform(labels, acoustic_features, acoustic_out_scaler,
        binary_dict, continuous_dict, stream_sizes, has_dynamic_features,
        subphone_features="coarse_coding", log_f0_conditioning=True, pitch_idx=None,
        num_windows=3, post_filter=True, sample_rate=48000, frame_period=5,
        relative_f0=True):
    """Generate a waveform with WORLD from multi-stream acoustic features.

    When any stream has dynamic features, MLPG is applied first using the
    variances in ``acoustic_out_scaler.var_``. The features are then split
    into mgc, F0 target, vuv and bap streams.

    With ``relative_f0`` the F0 stream is treated as a log-F0 offset that is
    added to the interpolated log pitch of the musical score taken from
    ``labels``. Otherwise the F0 stream is passed to the vocoder as is.
    ``log_f0_conditioning`` is accepted but not used here.
    """
    windows = get_windows(num_windows)

    # Apply MLPG if necessary
    static_stream_sizes = stream_sizes
    if np.any(has_dynamic_features):
        acoustic_features = multi_stream_mlpg(
            acoustic_features, acoustic_out_scaler.var_, windows, stream_sizes,
            has_dynamic_features)
        static_stream_sizes = get_static_stream_sizes(
            stream_sizes, has_dynamic_features, len(windows))

    # Split multi-stream features
    mgc, target_f0, vuv, bap = split_streams(acoustic_features, static_stream_sizes)

    # Vocoder settings derived from the sampling rate
    fftlen = pyworld.get_cheaptrick_fft_size(sample_rate)
    alpha = pysptk.util.mcepalpha(sample_rate)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), sample_rate, fftlen)

    ### F0 ###
    if not relative_f0:
        f0 = target_f0
    else:
        diff_lf0 = target_f0
        # need to extract pitch sequence from the musical score
        linguistic_features = fe.linguistic_features(labels,
                                                    binary_dict, continuous_dict,
                                                    add_frame_features=True,
                                                    subphone_features=subphone_features)
        f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None]

        # Log of the score pitch; zero entries are left at zero and then
        # filled by interpolation.
        lf0_score = f0_score.copy()
        pitched = np.nonzero(lf0_score)
        lf0_score[pitched] = np.log(f0_score[pitched])
        lf0_score = interp1d(lf0_score, kind="slinear")

        f0 = diff_lf0 + lf0_score
        f0[vuv < 0.5] = 0
        voiced = np.nonzero(f0)
        f0[voiced] = np.exp(f0[voiced])

    return pyworld.synthesize(f0.flatten().astype(np.float64),
                              spectrogram.astype(np.float64),
                              aperiodicity.astype(np.float64),
                              sample_rate, frame_period)
コード例 #19
0
ファイル: test.py プロジェクト: ishihara1989/ABSE
def make_conversion(root, result_dir, checkpoint, ut_min=91, ut_max=100, sp_min=91, sp_max=100):
    """Run voice conversion for every speaker pair and utterance in range.

    For each speaker number ``s`` and each speaker number ``s2`` (both in
    ``sp_min..sp_max`` inclusive) and each utterance number ``u`` in
    ``ut_min..ut_max`` inclusive, the utterance ``u`` of ``jvs<s2>`` is
    reconstructed using the ``'kv'`` entry of ``jvs<s>``'s dictionary. The
    converted mel-cepstrum (``.mcep.npy``) and a synthesized wav file are
    written to ``result_dir/jvs<s>/jvs<s2>/``.

    Per-speaker and per-utterance dictionaries are cached as ``.pt`` files
    under ``result_dir`` and re-loaded with ``torch.load`` when present.

    Args:
        root: Dataset root containing ``jvsNNN`` directories.
        result_dir: Output root; created as needed.
        checkpoint: Path of a ``torch.load``-able file with ``'config'`` and
            ``'model'`` entries.
        ut_min, ut_max: Inclusive utterance number range.
        sp_min, sp_max: Inclusive speaker number range.
    """
    # Mel-cepstrum warping factor and FFT length used for mc2sp below.
    alpha = 0.42
    n_fft = 1024
    root = Path(root)
    result_dir = Path(result_dir)
    dicts = torch.load(checkpoint, map_location='cpu')
    model = VC(dicts['config']['model'], train=False)
    model.load_state_dict(dicts['model'])
    model.remove_wn()
    model = model.eval()
    for s in range(sp_min, sp_max+1):
        # NOTE: the name ``sp`` holds the speaker id here but is re-bound to
        # a spectrogram inside the innermost loop. It is only used as an id
        # before the inner loops start, so the reuse is harmless.
        sp = f'jvs{s:03}'
        sp_root = result_dir / sp
        sp_root.mkdir(parents=True, exist_ok=True)
        sp_dict_path = sp_root / 'sp_dict.pt'

        if not sp_dict_path.is_file():
            # Use the largest non-parallel mcep file (by file size) as the
            # reference utterance of this speaker.
            nonparas = list((root / sp / 'nonpara30/wav24kHz16bit').glob('BASIC5000_*.mcep.npy'))
            index = max(enumerate(nonparas), key=lambda p: p[1].stat().st_size)[0]
            ref_mcep = nonparas[index]
            # 'X.mcep.npy' has stem 'X.mcep', which becomes 'X.f0.npy'.
            ref_f0 = ref_mcep.parent / ref_mcep.stem.replace('.mcep', '.f0.npy')
            # presumably extract_from also saves the result to sp_dict_path
            # -- TODO confirm
            sp_dict = extract_from(model, ref_mcep, ref_f0, sp_dict_path)
        else:
            sp_dict = torch.load(sp_dict_path)
        for s2 in range(sp_min, sp_max+1):
            sp2 = f'jvs{s2:03}'
            sp2_root = result_dir / sp2
            sp2_root.mkdir(parents=True, exist_ok=True)

            target_root = sp_root / sp2
            target_root.mkdir(parents=True, exist_ok=True)
            for u in range(ut_min, ut_max+1):
                src_mcep = root / sp2 / 'parallel100/wav24kHz16bit' / f'VOICEACTRESS100_{u:03}.mcep.npy'
                src_f0 = root / sp2 / 'parallel100/wav24kHz16bit' / f'VOICEACTRESS100_{u:03}.f0.npy'
                src_c0 = root / sp2 / 'parallel100/wav24kHz16bit' / f'VOICEACTRESS100_{u:03}.c0.npy'
                src_ap = root / sp2 / 'parallel100/wav24kHz16bit' / f'VOICEACTRESS100_{u:03}.ap.npy'
                src_dict_path = sp2_root / f'VOICEACTRESS100_{u:03}.pt'
                if not src_dict_path.is_file():
                    # presumably prep_content also saves to src_dict_path
                    # -- TODO confirm
                    src_dict = prep_content(model, src_mcep, src_dict_path)
                else:
                    src_dict = torch.load(src_dict_path)

                converted_mcep = model.reconstruct_mcep(src_dict['cq'], sp_dict['kv']).squeeze().numpy()
                tgt_mcep = target_root / f'VOICEACTRESS100_{u:03}.mcep.npy'
                np.save(tgt_mcep, converted_mcep)

                f0 = np.load(src_f0).astype(np.float64)
                f0 = convert_f0(f0, sp_dict)
                ap = np.load(src_ap).astype(np.float64)
                ap = reconstruct_ap(ap)
                c0 = np.load(src_c0).astype(np.float64)
                # The converted mcep may have more frames (last axis) than
                # c0; it is truncated to c0's length on the next line.
                assert (c0.shape[0] <= converted_mcep.shape[-1]), f'{s}->{s2}/{u}, {c0.shape[0]} <= {converted_mcep.shape[-1]}'
                # Prepend c0 as the first column and transpose the converted
                # mcep so frames run along axis 0.
                mcep = np.hstack([c0[:, None], converted_mcep[:, :c0.shape[0]].T]).astype(np.float64)
                sp = pysptk.mc2sp(np.ascontiguousarray(mcep), alpha, n_fft)
                # NOTE(review): synthesis and the wav header use 16000 Hz
                # while the source directories are named 'wav24kHz16bit' --
                # confirm the features were extracted at 16 kHz.
                wav = pyworld.synthesize(f0, sp, ap, 16000)
                tgt_wav = target_root / f'VOICEACTRESS100_{u:03}.wav'
                wavfile.write(tgt_wav, 16000, (wav*32768).astype(np.int16))
                print(tgt_wav, flush=True)
コード例 #20
0
    def convert_to_feature(self,
                           input: AcousticFeature,
                           out_sampling_rate: Optional[int] = None):
        """Convert ``input`` with the model and return a float64 feature.

        The input is normalized and encoded, padded along axis 1 up to the
        next multiple of 128, run through ``self.model`` with chainer's
        ``train`` config disabled, un-padded, decoded and denormalized. The
        voiced flags and the aperiodicity of the result are taken from
        ``input``, and its spectrogram is rebuilt from the converted
        ``mfcc``.

        ``out_sampling_rate`` defaults to the sample rate stored in
        ``self.config``.
        """
        if out_sampling_rate is None:
            out_sampling_rate = self.config.dataset.param.voice_param.sample_rate

        source = input
        normalized = self._feature_normalize(source, test=True)
        encoded = self._encode_feature(normalized, test=True)

        # Always in 1..128, so the ``[:-pad_width]`` slice below is valid.
        pad_width = 128 - encoded.shape[1] % 128
        padded = numpy.pad(encoded, [(0, 0), (0, pad_width)], mode='minimum')

        batch = chainer.dataset.convert.concat_examples(
            [padded], device=self.gpu, padding=0)

        with chainer.using_config('train', False):
            predicted = self.model(batch).data[0]

        if self.gpu is not None:
            predicted = chainer.cuda.to_cpu(predicted)
        predicted = predicted[:, :-pad_width]

        decoded = self._decode_feature(predicted, test=True)

        # Denormalize with the input's voiced flags attached.
        with_voiced = AcousticFeature(
            f0=decoded.f0,
            spectrogram=decoded.spectrogram,
            aperiodicity=decoded.aperiodicity,
            mfcc=decoded.mfcc,
            voiced=source.voiced,
        )
        denormalized = self._feature_denormalize(with_voiced, test=True)

        # Aperiodicity is not converted; reuse the input's.
        merged = AcousticFeature(
            f0=denormalized.f0,
            spectrogram=denormalized.spectrogram,
            aperiodicity=source.aperiodicity,
            mfcc=denormalized.mfcc,
            voiced=denormalized.voiced,
        )

        fftlen = pyworld.get_cheaptrick_fft_size(out_sampling_rate)
        rebuilt_spectrogram = pysptk.mc2sp(
            merged.mfcc,
            alpha=self._param.acoustic_feature_param.alpha,
            fftlen=fftlen,
        )

        return AcousticFeature(
            f0=merged.f0,
            spectrogram=rebuilt_spectrogram,
            aperiodicity=merged.aperiodicity,
            mfcc=merged.mfcc,
            voiced=merged.voiced,
        ).astype(numpy.float64)
コード例 #21
0
ファイル: preprocess.py プロジェクト: as468579/ACVAE-VC
def world_decode_mc(mc, fs):
    """Convert mel-cepstrum ``mc`` to a spectral envelope for sampling rate ``fs``.

    The FFT length comes from ``pyworld.get_cheaptrick_fft_size(fs)`` and the
    warping factor from ``pysptk.util.mcepalpha(fs)``.
    """
    fft_size = pyworld.get_cheaptrick_fft_size(fs)
    warp = pysptk.util.mcepalpha(fs)
    return pysptk.mc2sp(mc, warp, fft_size)
コード例 #22
0
def test_vc_from_path(model, path, data_mean, data_std, diffvc=True):
    """Convert the wav file at ``path`` with ``model`` and synthesize it.

    With ``diffvc`` the difference between the predicted and the source
    mel-cepstrum is applied to the input waveform through an MLSA filter.
    Otherwise the predicted mel-cepstrum is resynthesized with WORLD using
    the source F0 and aperiodicity.

    Returns:
        tuple: ``(waveform, inputs, outputs)`` where ``inputs`` and
        ``outputs`` are the static mel-cepstrum features before and after
        conversion (both without the 0th coefficient).
    """
    model.eval()

    fs, x = wavfile.read(path)
    x = x.astype(np.float64)

    # WORLD analysis.
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)

    mcep = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0 = mcep[:, 0]
    feats = mcep[:, 1:]
    static_dim = feats.shape[-1]
    feats = P.modspec_smoothing(feats, fs / hop_length, cutoff=50)
    feats = P.delta_features(feats, hp.windows).astype(np.float32)

    n_frames = feats.shape[0]
    source_static = feats[:, :static_dim]
    inputs = source_static.copy()

    # Normalize and run the model.
    scaled = Variable(torch.from_numpy(P.scale(feats, data_mean, data_std)))
    R = torch.from_numpy(unit_variance_mlpg_matrix(hp.windows, n_frames))
    y_hat, y_hat_static = model(scaled, R)
    predicted = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    predicted = P.inv_scale(predicted, data_mean[:static_dim],
                            data_std[:static_dim])
    outputs = predicted.copy()

    if diffvc:
        # Differential mel-cepstrum filtered onto the original waveform.
        full_mc = np.hstack((c0[:, None], predicted - source_static))
        full_mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(full_mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        full_mc = np.hstack((c0[:, None], predicted))
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        converted_sp = pysptk.mc2sp(full_mc.astype(np.float64),
                                    alpha=alpha,
                                    fftlen=fftlen)
        waveform = pyworld.synthesize(f0, converted_sp, aperiodicity, fs,
                                      hp.frame_period)

    return waveform, inputs, outputs
コード例 #23
0
    def convert_to_feature(self, input: AcousticFeature, out_sampling_rate: Optional[int] = None):
        """Convert ``input`` with the model and return a float64 feature.

        Pipeline: normalize, encode, pad axis 1 up to the next multiple of
        128, run ``self.model`` with chainer's ``train`` config disabled,
        strip the padding, decode, denormalize. Voiced flags and
        aperiodicity of the result come from ``input``; the spectrogram is
        rebuilt from the converted ``mfcc``.

        ``out_sampling_rate`` falls back to the sample rate in
        ``self.config`` when omitted.
        """
        if out_sampling_rate is None:
            out_sampling_rate = self.config.dataset.param.voice_param.sample_rate

        original = input
        model_input = self._feature_normalize(original, test=True)
        model_input = self._encode_feature(model_input, test=True)

        # Between 1 and 128 columns are appended, never zero.
        n_pad = 128 - model_input.shape[1] % 128
        model_input = numpy.pad(model_input, [(0, 0), (0, n_pad)], mode='minimum')

        batch = chainer.dataset.convert.concat_examples(
            [model_input], device=self.gpu, padding=0)

        with chainer.using_config('train', False):
            raw = self.model(batch).data[0]

        if self.gpu is not None:
            raw = chainer.cuda.to_cpu(raw)
        raw = raw[:, :-n_pad]

        decoded = self._decode_feature(raw, test=True)

        # Attach the input's voiced flags before denormalizing.
        result = self._feature_denormalize(
            AcousticFeature(
                f0=decoded.f0,
                spectrogram=decoded.spectrogram,
                aperiodicity=decoded.aperiodicity,
                mfcc=decoded.mfcc,
                voiced=original.voiced,
            ),
            test=True,
        )

        # Keep the input's aperiodicity instead of the model output.
        result = AcousticFeature(
            f0=result.f0,
            spectrogram=result.spectrogram,
            aperiodicity=original.aperiodicity,
            mfcc=result.mfcc,
            voiced=result.voiced,
        )

        fftlen = pyworld.get_cheaptrick_fft_size(out_sampling_rate)
        spectrogram = pysptk.mc2sp(
            result.mfcc,
            alpha=self._param.acoustic_feature_param.alpha,
            fftlen=fftlen,
        )

        result = AcousticFeature(
            f0=result.f0,
            spectrogram=spectrogram,
            aperiodicity=result.aperiodicity,
            mfcc=result.mfcc,
            voiced=result.voiced,
        )
        return result.astype(numpy.float64)
コード例 #24
0
def synthesis_from_mcep(f0, mcep, ap, sr, fftsize, shiftms, alpha, rmcep=None):
    """Synthesize a waveform with WORLD from F0, mel-cepstrum and aperiodicity.

    When ``rmcep`` is given, ``mcep`` is first replaced by
    ``mod_power(mcep, rmcep, alpha=alpha)``. ``ap`` is decoded with
    ``decode_aperiodicity`` when it has fewer than ``fftsize // 2 + 1``
    columns; otherwise it is used as is.
    """
    if rmcep is not None:
        mcep = mod_power(mcep, rmcep, alpha=alpha)

    n_bins = fftsize // 2 + 1
    if ap.shape[1] < n_bins:
        ap = pw.decode_aperiodicity(ap, sr, fftsize)

    envelope = pysptk.mc2sp(mcep, alpha, fftsize)
    return pw.synthesize(f0, envelope, ap, sr, frame_period=shiftms)
コード例 #25
0
ファイル: test.py プロジェクト: entn-at/voice-conversion-2
def main():
    """Convert every utterance listed in ``scp/test.scp`` and write wav files.

    For each utterance id the source speaker's features are extracted, F0 is
    transformed with the statistics from ``args.norm_txt``, the mel-cepstrum
    is passed through the network loaded from ``args.cpt_path`` and the
    result is synthesized with WORLD. Output files go to
    ``<args.save_path>/<args.ssp>_<args.tsp>/``.
    """
    args = get_args()
    debug_args(args)

    # create dir -- built from the same args.ssp/args.tsp values as the
    # output file path below, so the directory written to always exists.
    os.makedirs("{}/{}_{}".format(args.save_path, args.ssp, args.tsp),
                exist_ok=True)

    # get norm of lf0: each line is "<int key> <mean> <std>"
    lf0_norm = {}
    with open(args.norm_txt, 'r', encoding='utf-8') as f:
        for raw_line in f:
            fields = raw_line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            lf0_norm[int(fields[0])] = {
                'mean': float(fields[1]),
                'std': float(fields[2])
            }

    checkpoint = torch.load(args.cpt_path,
                            map_location=lambda storage, loc: storage)
    net = checkpoint['model']

    with open('scp/test.scp', 'r', encoding='utf-8') as f:
        # Skip blank lines so they are not treated as utterance ids.
        lines = [line.strip() for line in f if line.strip()]

    for wav_id in tqdm(lines, desc='Synthesis'):
        wav_path = path_template.format(args.ssp, wav_id)
        mc, aperiodicity, f0 = get_features(wav_path)
        f0 = transform_f0(lf0_norm, args.ssp, args.tsp, f0)

        mc = Variable(torch.from_numpy(mc.astype(np.float32)))
        length = [len(mc)]
        mc = torch.unsqueeze(mc, dim=0)  # add a batch dimension of 1

        h, c = net.init_hidden(1)
        if args.dual:
            mc, _ = net(mc, length, h, c, dual=False)
        else:
            mc = net(mc, length, h, c)

        mc = mc.squeeze(0).data.numpy()

        spectrogram = pysptk.mc2sp(mc.astype(np.float64),
                                   alpha=config.alpha,
                                   fftlen=config.fftlen)
        waveform = pyworld.synthesize(f0, spectrogram, aperiodicity, config.fs,
                                      config.frame_period)

        # NOTE(review): librosa.output was removed in librosa 0.8 -- confirm
        # the pinned librosa version still provides write_wav.
        maxv = np.iinfo(np.int16).max
        librosa.output.write_wav(
            '{0}/{1}_{2}/cmu_us_arctic_{2}_{3}.wav'.format(
                args.save_path, args.ssp, args.tsp, wav_id),
            (waveform * maxv).astype(np.int16), config.fs)
コード例 #26
0
 def gen_wav(self, f0, mgc, bap):
     """Synthesize a waveform with WORLD and return it as wav file bytes.

     The waveform is peak-normalized, written to ``gen.wav`` in the current
     working directory as int16 and read back.

     Returns:
         tuple: ``(contents, intensity)`` where ``contents`` is the raw
         bytes of ``gen.wav`` and ``intensity`` is
         ``10 * log10(sum(spectrogram ** 2))`` per frame.
     """
     spectrogram = pysptk.mc2sp(mgc, fftlen=self.fftlen, alpha=self.alpha)
     aperiodicity = pyworld.decode_aperiodicity(
         bap.astype(np.float64), self.sr, self.fftlen)
     generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64), spectrogram.astype(
         np.float64), aperiodicity.astype(np.float64), self.sr, self.frame_period)

     # Normalize by the absolute peak and scale to 32767. Scaling the
     # positive maximum to 32768 overflows int16, and a larger negative
     # peak would overflow as well. A silent waveform is left unscaled to
     # avoid dividing by zero.
     peak = np.max(np.abs(generated_waveform))
     if peak > 0:
         x2 = generated_waveform / peak * 32767
     else:
         x2 = generated_waveform
     x2 = x2.astype(np.int16)

     wavfile.write("gen.wav", self.sr, x2)
     with open("gen.wav", 'rb') as fd:
         contents = fd.read()
     intensity = 10 * np.log10(np.sum(spectrogram**2, axis=1))
     return contents, intensity
コード例 #27
0
ファイル: dataset.py プロジェクト: umasaki1182/become-yukarin
    def __call__(self, data: Wave, test):
        """Build a validated low/high spectrogram pair from a wave.

        "high" is the spectrogram of the extracted acoustic feature; "low"
        is rebuilt from that feature's ``mfcc`` with ``pysptk.mc2sp``. The
        ``test`` argument is not forwarded: the acoustic feature process is
        always called with ``test=True``.
        """
        extracted = self._acoustic_feature_process(data, test=True)
        extracted = extracted.astype_only_float(self._dtype)

        high = extracted.spectrogram
        fft_size = pyworld.get_cheaptrick_fft_size(data.sampling_rate)
        low = pysptk.mc2sp(extracted.mfcc, alpha=self._alpha, fftlen=fft_size)

        pair = LowHighSpectrogramFeature(low=low, high=high)
        pair.validate()
        return pair
コード例 #28
0
    def __call__(self, data: Wave, test):
        """Return the ``LowHighSpectrogramFeature`` computed from ``data``.

        The acoustic feature process always runs with ``test=True``
        regardless of the ``test`` argument. Its spectrogram becomes the
        "high" part, and the spectrogram rebuilt from its ``mfcc`` becomes
        the "low" part. The result is validated before it is returned.
        """
        acoustic = self._acoustic_feature_process(data, test=True).astype_only_float(self._dtype)
        original_spectrogram = acoustic.spectrogram

        rebuilt_spectrogram = pysptk.mc2sp(
            acoustic.mfcc,
            alpha=self._alpha,
            fftlen=pyworld.get_cheaptrick_fft_size(data.sampling_rate),
        )

        result = LowHighSpectrogramFeature(
            low=rebuilt_spectrogram,
            high=original_spectrogram,
        )
        result.validate()
        return result
コード例 #29
0
ファイル: waveform.py プロジェクト: qxde01/CTTS
def gen_waveform(y_predicted, do_postfilter=False):
    """Synthesize a waveform from predicted acoustic features.

    Trailing zero frames are trimmed, the features are split into
    ``mgc``/``lf0``/``vuv``/``bap`` streams by ``gen_parameters``, and the
    WORLD vocoder is driven with the resulting spectrogram, aperiodicity
    and F0. Relies on the module-level ``alpha``, ``fftlen``, ``fs`` and
    ``frame_period`` settings.

    Parameters
    ----------
    y_predicted : array
        Predicted feature frames.
    do_postfilter : bool, optional
        When true, ``merlin_post_filter`` is applied to the mel-cepstrum
        before it is converted to a spectrogram.

    Returns
    -------
    The waveform returned by ``pyworld.synthesize``.
    """
    frames = trim_zeros_frames(y_predicted)
    # Split the stacked prediction into its individual streams.
    mgc, lf0, vuv, bap = gen_parameters(frames)
    if do_postfilter:
        mgc = merlin_post_filter(mgc, alpha)

    sp = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    ap = pyworld.decode_aperiodicity(bap.astype(np.float64), fs, fftlen)

    # Frames with vuv below 0.5 get F0 = 0; every remaining non-zero
    # log-F0 value is mapped back with exp.
    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    nonzero = np.nonzero(f0)
    f0[nonzero] = np.exp(f0[nonzero])

    return pyworld.synthesize(
        f0.flatten().astype(np.float64),
        sp.astype(np.float64),
        ap.astype(np.float64),
        fs,
        frame_period,
    )
コード例 #30
0
ファイル: synthesizer.py プロジェクト: sgmqs/sprocket
    def synthesis(self, f0, mcep, ap, rmcep=None, alpha=0.42):
        """Generate a waveform from F0, mel-cepstrum and aperiodicity.

        Parameters
        ----------
        f0 : array, shape (`T`, `1`)
            array of F0 sequence
        mcep : array, shape (`T`, `dim`)
            array of mel-cepstrum sequence
        ap : array, shape (`T`, `fftlen / 2 + 1`) or (`T`, `dim_codeap`)
            array of aperiodicity or coded aperiodicity
        rmcep : array, optional, shape (`T`, `dim`)
            array of reference mel-cepstrum sequence; when given, the
            power of `mcep` is modified with `mod_power`
            Default set to None
        alpha : float, optional
            Parameter of the all-pass transfer function
            Default set to 0.42

        Returns
        ----------
        wav: array,
            Synthesized waveform

        """
        power_reference_given = rmcep is not None
        if power_reference_given:
            mcep = mod_power(mcep, rmcep, alpha=alpha)

        # Fewer columns than a full spectrum means `ap` is still coded
        # aperiodicity and has to be decoded first.
        full_bins = self.fftl // 2 + 1
        if ap.shape[1] < full_bins:
            ap = pyworld.decode_aperiodicity(ap, self.fs, self.fftl)

        spectrum = pysptk.mc2sp(mcep, alpha, self.fftl)

        # WORLD vocoder synthesis from f0, spectrum and aperiodicity.
        return pyworld.synthesize(
            f0, spectrum, ap, self.fs, frame_period=self.shiftms)
コード例 #31
0
    def synthesis(self, f0, mcep, ap, rmcep=None, alpha=0.42):
        """Synthesize a waveform from F0, mel-cepstrum and aperiodicity.

        Parameters
        ----------
        f0 : array, shape (`T`, `1`)
            array of F0 sequence
        mcep : array, shape (`T`, `dim`)
            array of mel-cepstrum sequence
        ap : array, shape (`T`, `fftlen / 2 + 1`) or (`T`, `dim_codeap`)
            array of aperiodicity or coded aperiodicity
        rmcep : array, optional, shape (`T`, `dim`)
            array of reference mel-cepstrum sequence used for power
            modification
            Default set to None
        alpha : float, optional
            Parameter of the all-pass transfer function
            Default set to 0.42

        Returns
        ----------
        wav: array,
            Synthesized waveform

        """
        if rmcep is not None:
            # Match the power of mcep to the reference sequence.
            mcep = mod_power(mcep, rmcep, alpha=alpha)

        ap_is_coded = ap.shape[1] < self.fftl // 2 + 1
        if ap_is_coded:
            # Expand coded aperiodicity to the full-band representation.
            ap = pyworld.decode_aperiodicity(ap, self.fs, self.fftl)

        # Mel-cepstrum -> spectral envelope.
        envelope = pysptk.mc2sp(mcep, alpha, self.fftl)

        wav = pyworld.synthesize(
            f0,
            envelope,
            ap,
            self.fs,
            frame_period=self.shiftms,
        )
        return wav
コード例 #32
0
def save_wav_ceps(fake_B, input_path, sample_path):
    """Resynthesize audio from generated cepstra and save it as a WAV file.

    The input file is cut into chunks of 14000 samples (first channel
    only). For each (cepstrum block, chunk) pair, F0 and aperiodicity are
    analysed from the chunk with WORLD, the cepstrum block is denormalized
    and converted to a spectrogram, and a waveform is synthesized. The
    results are written to ``sample_path + '_fake.wav'`` as int16.

    Parameters
    ----------
    fake_B : sequence of 2-D arrays
        Normalized cepstrum blocks, one per chunk; each block is iterated
        as rows of coefficients. Not modified by this function.
    input_path : str
        WAV file that supplies F0/aperiodicity and the sampling rate.
        Indexed as ``wav_data[:, 0]``, so multi-channel data is expected.
    sample_path : str
        Output path prefix.
    """
    length = 14000
    # NOTE: `bps` holds the first value returned by wav.read, which is used
    # below as the sampling rate for analysis, synthesis and writing.
    bps, wav_data = wav.read(input_path)
    datas = [
        wav_data[i:i + length, 0] for i in range(0, len(wav_data), length)
    ]
    wave = np.zeros([len(fake_B), length])
    for row, (b, d) in enumerate(zip(fake_B, datas)):
        # pyworld needs a C-contiguous float64 signal; a column slice of
        # PCM data is neither.
        d = np.ascontiguousarray(d, dtype=np.float64)
        f0, _, ap = pw.wav2world(d, bps)

        # Denormalize on a private float64 copy so the caller's fake_B is
        # left untouched (the old in-place loop would rescale it again on
        # a second call). Coefficient 0 and the rest use different scales.
        mc = np.array(b, dtype=np.float64)
        mc[:, 0] = mc[:, 0] * 28 - 20
        mc[:, 1:] = mc[:, 1:] * 7 - 3

        sp = pysptk.mc2sp(mc, 0.48, 2048)
        w = pw.synthesize(f0, sp, ap, bps)

        # Store the synthesized chunk. The previous `np.append(wave, w)`
        # discarded its result, so the output was always silence.
        n = min(len(w), length)
        wave[row, :n] = w[:n]

    # Clip before the cast so out-of-range samples saturate, not wrap.
    pcm = np.clip(np.reshape(wave, -1), -32768, 32767).astype('int16')
    wav.write(sample_path + '_fake.wav', bps, pcm)
コード例 #33
0
def test_one_utt(src_path, tgt_path, disable_mlpg=False, diffvc=True):
    """Convert one source utterance with the GMM and return the waveform.

    Relies on module-level settings (``gmm``, ``windows``, ``frame_period``,
    ``order``, ``alpha``, ``use_delta``, ``static_dim``, ``hop_length``,
    ``fftlen``).

    Parameters
    ----------
    src_path : str
        Path of the source WAV file to convert.
    tgt_path : str
        Unused by this function; kept for interface compatibility.
    disable_mlpg : bool, optional
        When true, MLPG is run with a single static window only.
    diffvc : bool, optional
        When true, the converted mel-cepstrum is applied to the source
        waveform through an MLSA filter; otherwise the waveform is
        synthesized with WORLD from the converted spectrogram.

    Returns
    -------
    The synthesized waveform.
    """
    # GMM-based parameter generation is provided by the library in `baseline` module
    if disable_mlpg:
        # Force disable MLPG
        paramgen = MLPG(gmm, windows=[(0, 0, np.array([1.0]))], diff=diffvc)
    else:
        paramgen = MLPG(gmm, windows=windows, diff=diffvc)

    fs, x = wavfile.read(src_path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    # (A leftover `pdb.set_trace()` breakpoint was removed here; it blocked
    # every non-interactive call.)

    mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    # Set the 0th coefficient aside; only the remaining ones are converted.
    c0, mc = mc[:, 0], mc[:, 1:]
    if use_delta:
        mc = delta_features(mc, windows)
    mc = paramgen.transform(mc)
    if disable_mlpg and mc.shape[-1] != static_dim:
        mc = mc[:, :static_dim]
    assert mc.shape[-1] == static_dim
    mc = np.hstack((c0[:, None], mc))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        spectrogram = pysptk.mc2sp(mc.astype(np.float64),
                                   alpha=alpha,
                                   fftlen=fftlen)
        waveform = pyworld.synthesize(f0, spectrogram, aperiodicity, fs,
                                      frame_period)

    return waveform
コード例 #34
0
ファイル: gmm.py プロジェクト: Emanuele93/INMCA
def test_one_utt(path_src, path_tgt, disable_mlpg=False, diffvc=True):
    """Convert a single source utterance with the GMM and synthesize it.

    Uses the module-level ``gmm``, ``windows``, ``frame_period``, ``order``,
    ``alpha``, ``use_delta``, ``static_dim``, ``hop_length`` and ``fftlen``.
    ``path_tgt`` is accepted but not used.

    With ``diffvc`` the converted mel-cepstrum (0th coefficient zeroed) is
    applied to the source signal through an MLSA filter; otherwise the
    waveform is synthesized by WORLD from the converted spectrogram.
    """
    # A single static window is used when MLPG is disabled.
    mlpg_windows = [(0, 0, np.array([1.0]))] if disable_mlpg else windows
    paramgen = MLPG(gmm, windows=mlpg_windows, diff=diffvc)

    signal, rate = sf.read(path_src)
    signal = signal.astype(np.float64)

    # WORLD analysis: F0 (refined by stonemask), spectrum, aperiodicity.
    f0, t_axis = pyworld.dio(signal, rate, frame_period=frame_period)
    f0 = pyworld.stonemask(signal, f0, t_axis, rate)
    sp = pyworld.cheaptrick(signal, f0, t_axis, rate)
    ap = pyworld.d4c(signal, f0, t_axis, rate)

    cepstrum = pysptk.sp2mc(sp, order=order, alpha=alpha)
    # Keep the 0th coefficient aside; convert only the remaining ones.
    c0 = cepstrum[:, 0]
    feats = cepstrum[:, 1:]
    if use_delta:
        feats = delta_features(feats, windows)
    feats = paramgen.transform(feats)
    if disable_mlpg and feats.shape[-1] != static_dim:
        feats = feats[:, :static_dim]
    assert feats.shape[-1] == static_dim
    converted = np.hstack((c0[:, None], feats))

    if not diffvc:
        sp = pysptk.mc2sp(converted.astype(np.float64),
                          alpha=alpha,
                          fftlen=fftlen)
        return pyworld.synthesize(f0, sp, ap, rate, frame_period)

    converted[:, 0] = 0
    engine = Synthesizer(MLSADF(order=order, alpha=alpha),
                         hopsize=hop_length)
    coefs = pysptk.mc2b(converted.astype(np.float64), alpha=alpha)
    return engine.synthesis(signal, coefs)
コード例 #35
0
    def MCEPs2wav(self, mc, f0, ap):
        """Synthesize a float32 waveform from mel-cepstrum, F0 and aperiodicity.

        ``mc`` is converted to a spectrogram with ``self.alpha`` and
        ``self.n_fft``; all inputs are cast to float64 before being handed
        to the WORLD synthesizer at ``self.sr`` with pyworld's default
        frame period.
        """
        as_double = np.float64
        spectrum = pysptk.mc2sp(as_double(mc), alpha=self.alpha, fftlen=self.n_fft)
        waveform = pw.synthesize(
            as_double(f0),
            as_double(spectrum),
            as_double(ap),
            self.sr,
            pw.default_frame_period,
        )
        return waveform.astype(np.float32)
コード例 #36
0
ファイル: evaluation_vc.py プロジェクト: shamanez/gantts
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    """Run voice conversion on waveform ``x`` with ``model`` and resynthesize.

    The waveform is analysed with WORLD, its mel-cepstrum (without the 0th
    coefficient) is smoothed, extended with delta features, normalized and
    passed through the model. The predicted static features are
    denormalized and turned back into a waveform.

    Parameters
    ----------
    model :
        Model exposing ``eval()`` and ``include_parameter_generation()``.
    x : ndarray
        Input waveform; cast to float64 here.
    fs : int
        Sampling rate of ``x``.
    data_mean, data_std :
        Statistics used by ``P.scale`` / ``P.inv_scale``; their first
        ``static_dim`` entries are used for denormalization.
    diffvc : bool, optional
        When true, the difference between predicted and input static
        features is applied to ``x`` through an MLSA filter; otherwise the
        waveform is synthesized with WORLD.

    Returns
    -------
    waveform, inputs, outputs
        The synthesized waveform, the static input features (before
        normalization) and the denormalized predicted static features
        (before the differential subtraction).
    """
    model.eval()

    # presumably hp.frame_period is in milliseconds (scaled by 0.001) — confirm
    hop_length = int(fs * (hp.frame_period * 0.001))
    x = x.astype(np.float64)
    # WORLD analysis: F0 (refined by stonemask), spectrogram, aperiodicity.
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    # Keep the 0th coefficient aside; only the remaining ones go to the model.
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    # From here on `mc` holds the output of delta_features; its first
    # `static_dim` columns are sliced out below as the static part.
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]

    # Copy of the static columns before normalization; returned to the caller.
    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)

    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    lengths = [len(mc_scaled)]

    # Add batch axis
    mc_scaled = mc_scaled.view(1, -1, mc_scaled.size(-1))

    # For MLPG
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)

    # Apply model
    if model.include_parameter_generation():
        # Case: models include parameter generation in itself
        # Multistream features cannot be used in this case
        y_hat, y_hat_static = model(mc_scaled, R, lengths=lengths)
    else:
        # Case: generic models (can be sequence model)
        assert hp.has_dynamic_features is not None
        y_hat = model(mc_scaled, lengths=lengths)
        y_hat_static = multi_stream_mlpg(
            y_hat, R, hp.stream_sizes, hp.has_dynamic_features)

    # Drop the batch axis and move the prediction to a NumPy array.
    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    mc_static_pred = P.inv_scale(
        mc_static_pred, data_mean[:static_dim], data_std[:static_dim])

    # Copy of the denormalized prediction; returned to the caller.
    outputs = mc_static_pred.copy()

    if diffvc:
        # Differential features: prediction minus the static input features.
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    # Re-attach the 0th coefficient that was set aside above.
    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        # Filter the original waveform with the differential mel-cepstrum.
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        # Synthesize with WORLD, reusing the analysed F0 and aperiodicity.
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs