def convert(self, in_feature: AcousticFeature):
    input = self._encode_feature(in_feature)

    pad = 128 - input.shape[1] % 128
    input = numpy.pad(input, [(0, 0), (0, pad)], mode='minimum')

    converter = partial(chainer.dataset.convert.concat_examples, device=self.gpu, padding=0)
    inputs = converter([input])

    with chainer.using_config('train', False):
        out = self.model(inputs).data[0]

    if self.gpu is not None:
        out = chainer.cuda.to_cpu(out)
    out = out[:, :-pad]

    out = self._decode_feature(out)
    out.ap = in_feature.ap
    out.voiced = in_feature.voiced
    out.f0[~out.voiced] = 0

    fftlen = pyworld.get_cheaptrick_fft_size(self.out_sampling_rate)
    sp = pysptk.mc2sp(
        out.mc,
        alpha=self._param.alpha,
        fftlen=fftlen,
    )
    out.sp = sp

    out = out.astype_only_float(numpy.float64)
    return out
def synthesis(f0, mcep, ap, r=None, alpha=0.42):
    if r is not None:
        mcep = mod_p(mcep, r)
    spc = pysptk.mc2sp(mcep, alpha, 1024)
    wav = pyworld.synthesize(f0, spc, ap, 16000, frame_period=5)
    return wav
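# Hypothetical usage sketch for the synthesis() helper above (not part of the
# original snippet). It assumes a 16 kHz mono float64 waveform and the constants
# hard-coded in synthesis() (alpha=0.42, 1024-point FFT, 5 ms frame shift).
import numpy as np
import pyworld
import pysptk

fs = 16000
x = np.random.randn(fs).astype(np.float64)   # stand-in for a real utterance
f0, sp, ap = pyworld.wav2world(x, fs, frame_period=5)
mcep = pysptk.sp2mc(sp, order=24, alpha=0.42)
wav = synthesis(f0, mcep, ap, alpha=0.42)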
def generate(self, parm_var, do_postfilter=True):
    config = self.analysis_config
    for path in self.paths:
        file_id = splitext(basename(path))[0]
        print('Synthesizing %s ... ' % (file_id), end='')
        mgc, lf0, vuv, bap = self._generate_parameters(path, parm_var)

        if do_postfilter:
            mgc = merlin_post_filter(mgc, config.alpha)

        sp = pysptk.mc2sp(mgc, fftlen=config.fft_length, alpha=config.alpha)
        ap = pyworld.decode_aperiodicity(bap.astype(np.float64),
                                         config.sampling_rate, config.fft_length)
        f0 = self._lf0_to_f0(lf0, vuv)
        generated = pyworld.synthesize(f0.flatten().astype(np.float64),
                                       sp.astype(np.float64),
                                       ap.astype(np.float64),
                                       config.sampling_rate, config.frame_period)
        with open(join(self.out_dir, file_id + '.wav'), 'wb') as f:
            f.write(Audio(generated, rate=config.sampling_rate).data)
        print('done!')
def generate_file(path):
    out = Path(arguments.output_directory, path.stem + '.npy')
    if out.exists() and not arguments.enable_overwrite:
        return

    # load wave and padding
    wave_file_load_process = WaveFileLoadProcess(
        sample_rate=arguments.sample_rate,
        top_db=arguments.top_db,
        pad_second=arguments.pad_second,
    )
    wave = wave_file_load_process(path, test=True)

    # make acoustic feature
    acoustic_feature_process = AcousticFeatureProcess(
        frame_period=arguments.frame_period,
        order=arguments.order,
        alpha=arguments.alpha,
        f0_estimating_method=arguments.f0_estimating_method,
    )
    feature = acoustic_feature_process(wave, test=True).astype_only_float(numpy.float32)
    high_spectrogram = feature.spectrogram

    fftlen = pyworld.get_cheaptrick_fft_size(arguments.sample_rate)
    low_spectrogram = pysptk.mc2sp(
        feature.mfcc,
        alpha=arguments.alpha,
        fftlen=fftlen,
    )

    # save
    numpy.save(out.absolute(), {
        'low': low_spectrogram,
        'high': high_spectrogram,
    })
def extract_spectrum(self, spectrum_len=None, Synthesizer=None):
    if spectrum_len is None:
        if Synthesizer is None:
            Synthesizer = kwiiyatta.Synthesizer
        spectrum_len = Synthesizer.fs_spectrum_len(self.fs)
    return pysptk.mc2sp(self.data,
                        fftlen=(spectrum_len - 1) * 2,
                        alpha=self.alpha())
def __test(order, alpha, fftlen):
    np.random.seed(98765)
    sp = np.random.rand(int(fftlen // 2 + 1))
    mc = pysptk.sp2mc(sp, order, alpha)
    approx_sp = pysptk.mc2sp(mc, alpha, fftlen)
    # TODO: tolerance should be more carefully chosen
    assert np.allclose(sp, approx_sp, atol=0.9)
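# Illustrative invocation of the round-trip check above; order, alpha and FFT
# length are typical mel-cepstral settings, not values fixed by the test itself.
__test(order=25, alpha=0.41, fftlen=512)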
def gen_waveform(y_predicted, Y_mean, Y_std, post_filter=False, coef=1.4,
                 fs=16000, mge_training=True):
    alpha = pysptk.util.mcepalpha(fs)
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    frame_period = hp_acoustic.frame_period

    # Generate parameters and split streams
    mgc, lf0, vuv, bap = gen_parameters(y_predicted, Y_mean, Y_std, mge_training)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha, coef=coef)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), fs, fftlen)
    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            fs, frame_period)
    # Convert range to int16
    generated_waveform = generated_waveform / \
        np.max(np.abs(generated_waveform)) * 32767

    # return features as well to compare natural/generated later
    return generated_waveform, mgc, lf0, vuv, bap
def world2wav(feature, frame_period):
    hparams = hp
    mgc_idx = 0
    lf0_idx = mgc_idx + hparams.num_mgc
    vuv_idx = lf0_idx + hparams.num_lf0
    bap_idx = vuv_idx + hparams.num_vuv

    mgc = feature[:, mgc_idx:mgc_idx + hparams.num_mgc]
    lf0 = feature[:, lf0_idx:lf0_idx + hparams.num_lf0]
    vuv = feature[:, vuv_idx:vuv_idx + hparams.num_vuv]
    bap = feature[:, bap_idx:bap_idx + hparams.num_bap]

    fs = hparams.sample_rate
    alpha = pysptk.util.mcepalpha(fs)
    fftlen = pyworld.get_cheaptrick_fft_size(fs)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)

    indexes = (vuv < 0.5).flatten()
    bap[indexes] = np.zeros(hparams.num_bap)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), fs, fftlen)

    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])

    return pyworld.synthesize(f0.flatten().astype(np.float64),
                              spectrogram.astype(np.float64),
                              aperiodicity.astype(np.float64),
                              fs, frame_period)
def generate_changed_voice(model, input_path):
    fs, x = wavfile.read(input_path)
    x = x.astype(np.float64)
    if len(x.shape) > 1:
        x = x.mean(axis=1)

    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    mc = P.modspec_smoothing(mc, FS / HOP_LENGHT, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    gen_data = model.predict(mc)
    gen_data = np.hstack([c0.reshape((-1, 1)), gen_data])

    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    spectrogram = pysptk.mc2sp(
        gen_data.astype(np.float64), alpha=alpha, fftlen=fftlen)
    waveform = pyworld.synthesize(
        f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform
def synthesis(self, feat, se_kind='sp'):
    batch_size = feat['ap'].size(0)
    device = feat['ap'].device
    audio = []
    for i in range(batch_size):
        ap = feat['ap'][i].detach().t().cpu().double().numpy()
        f0 = feat['f0'][i].detach().view(-1).cpu().double().numpy()
        if se_kind == 'mcc':
            mcc = feat['mcc'][i].detach().t().cpu().double().numpy()
            sp = pysptk.mc2sp(mcc.copy(order='C'), self.mcc_alpha, self.fft_size)
        else:
            sp = feat['sp'][i].detach().t().cpu().double().numpy()
        syn = pyworld.synthesize(f0.copy(order='C'), sp.copy(order='C'),
                                 ap.copy(order='C'), self.fs,
                                 frame_period=self.shiftms)
        audio.append(torch.from_numpy(syn).float().view(-1))
    audio = torch.cat([syn.unsqueeze(0) for syn in audio], dim=0).to(device)
    return audio / MAX_WAV_VALUE
def vizualize_hardcoded(x, mgc, lf0, f0, vuv, bap, fs, timeaxis):
    # NOTE: alpha, fftlen and hop_length are assumed to be defined at module level.
    plt.subplot(5, 1, 1)
    plt.plot(x, label="Wav")
    plt.xlim(0, len(x))

    # Spec
    plt.subplot(5, 1, 2)
    sp = pysptk.mc2sp(mgc[:, :60], alpha=alpha, fftlen=fftlen)
    logsp = np.log(sp)
    librosa.display.specshow(logsp.T, sr=fs, hop_length=hop_length,
                             x_axis="time", y_axis="linear")

    # Log-f0, VUV
    plt.subplot(5, 1, 3)
    # plt.plot(np.exp(lf0[:, 0]), linewidth=2, label="Continuous log-f0")
    plt.plot(f0, linewidth=2, label="Continuous log-f0")
    plt.xlim(0, len(f0))

    plt.subplot(5, 1, 4)
    plt.plot(vuv, linewidth=2, label="Voiced/unvoiced flag")
    plt.xlim(0, len(vuv))
    plt.legend(prop={"size": 14}, loc="upper right")

    # Aperiodicity
    plt.subplot(5, 1, 5)
    bap = bap[:, :2]
    bap = np.ascontiguousarray(bap).astype(np.float64)
    aperiodicity = pyworld.decode_aperiodicity(bap, fs, fftlen)
    librosa.display.specshow(aperiodicity.T, sr=fs, hop_length=hop_length,
                             x_axis="time", y_axis="linear")

    plt.show()
def gen_world_params(mgc, lf0, vuv, bap, sample_rate, vuv_threshold=0.3):
    """Generate WORLD parameters from mgc, lf0, vuv and bap.

    Args:
        mgc (ndarray): mgc
        lf0 (ndarray): lf0
        vuv (ndarray): vuv
        bap (ndarray): bap
        sample_rate (int): sample rate
        vuv_threshold (float): threshold for VUV

    Returns:
        tuple: tuple of f0, spectrogram and aperiodicity
    """
    fftlen = pyworld.get_cheaptrick_fft_size(sample_rate)
    alpha = pysptk.util.mcepalpha(sample_rate)
    spectrogram = pysptk.mc2sp(np.ascontiguousarray(mgc), fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(
        np.ascontiguousarray(bap).astype(np.float64), sample_rate, fftlen
    )
    # fill aperiodicity with ones for unvoiced regions
    aperiodicity[vuv.reshape(-1) < vuv_threshold, :] = 1.0
    # WORLD fails catastrophically for out of range aperiodicity
    aperiodicity = np.clip(aperiodicity, 0.0, 1.0)

    f0 = lf0.copy()
    f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])
    f0[vuv < vuv_threshold] = 0

    f0 = f0.flatten().astype(np.float64)
    spectrogram = spectrogram.astype(np.float64)
    aperiodicity = aperiodicity.astype(np.float64)

    return f0, spectrogram, aperiodicity
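# Hypothetical follow-up (not part of the original snippet): feed the returned
# WORLD parameters to the vocoder. Assumes mgc, lf0, vuv and bap come from an
# acoustic model and that a 5 ms frame period was used at analysis time.
import pyworld

sample_rate = 48000
f0, spectrogram, aperiodicity = gen_world_params(mgc, lf0, vuv, bap, sample_rate)
wav = pyworld.synthesize(f0, spectrogram, aperiodicity, sample_rate, 5)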
def gen_waveform(self, feature):
    mcep_dim = self.config['mcep_order'] + 1
    mgc = feature[:, :mcep_dim]
    lf0 = feature[:, mcep_dim:mcep_dim + 1]
    vuv = feature[:, mcep_dim + 1:mcep_dim + 2]
    bap = feature[:, mcep_dim + 2:]

    spectrogram = pysptk.mc2sp(
        mgc,
        fftlen=self.config['fft_size'],
        alpha=pysptk.util.mcepalpha(self.config['sampling_rate']),
    )
    aperiodicity = pyworld.decode_aperiodicity(
        bap.astype(np.float64),
        self.config['sampling_rate'],
        self.config['fft_size'],
    )
    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])

    waveform = pyworld.synthesize(
        f0.flatten().astype(np.float64),
        spectrogram.astype(np.float64),
        aperiodicity.astype(np.float64),
        self.config['sampling_rate'],
        self.config['hop_size_in_ms'],
    )
    return waveform
def synthesis():
    lf0_file = "p225_001.lf0"
    bap_file_name = "p225_001.bap"
    mgc_file_name = "p225_001.mgc"
    fl = 4096
    sr = 48000

    lf0 = read_binfile(lf0_file, dim=1, dtype=np.float32)
    zeros_index = np.where(lf0 == -1E+10)
    nonzeros_index = np.where(lf0 != -1E+10)
    f0 = lf0.copy()
    f0[zeros_index] = 0
    f0[nonzeros_index] = np.exp(lf0[nonzeros_index])
    f0 = f0.astype(np.float64)

    bap_dim = 5
    bap = read_binfile(bap_file_name, dim=bap_dim, dtype=np.float32)
    ap = pyworld.decode_aperiodicity(bap.astype(np.float64).reshape(-1, bap_dim), sr, fl)

    mc = read_binfile(mgc_file_name, dim=60, dtype=np.float32)
    alpha = pysptk.util.mcepalpha(sr)
    sp = pysptk.mc2sp(mc.astype(np.float64), fftlen=fl, alpha=alpha)

    wav = pyworld.synthesize(f0, sp, ap, sr, 5)
    x2 = wav * 32768
    x2 = x2.astype(np.int16)
    scipy.io.wavfile.write("resynthesis.wav", sr, x2)
def decode_spectrogram(self, feature: AcousticFeature):
    fftlen = pyworld.get_cheaptrick_fft_size(self.out_sampling_rate)
    feature.sp = pysptk.mc2sp(
        feature.mc.astype(numpy.float32),
        alpha=pysptk.util.mcepalpha(self.out_sampling_rate),
        fftlen=fftlen,
    )
    return feature
def gen_waveform(labels, acoustic_features, acoustic_out_scaler,
                 binary_dict, continuous_dict, stream_sizes,
                 has_dynamic_features,
                 subphone_features="coarse_coding",
                 log_f0_conditioning=True, pitch_idx=None,
                 num_windows=3, post_filter=True,
                 sample_rate=48000, frame_period=5, relative_f0=True):
    windows = get_windows(num_windows)

    # Apply MLPG if necessary
    if np.any(has_dynamic_features):
        acoustic_features = multi_stream_mlpg(
            acoustic_features, acoustic_out_scaler.var_, windows,
            stream_sizes, has_dynamic_features)
        static_stream_sizes = get_static_stream_sizes(
            stream_sizes, has_dynamic_features, len(windows))
    else:
        static_stream_sizes = stream_sizes

    # Split multi-stream features
    mgc, target_f0, vuv, bap = split_streams(acoustic_features, static_stream_sizes)

    # Gen waveform by the WORLD vocoder
    fftlen = pyworld.get_cheaptrick_fft_size(sample_rate)
    alpha = pysptk.util.mcepalpha(sample_rate)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), sample_rate, fftlen)

    ### F0 ###
    if relative_f0:
        diff_lf0 = target_f0
        # need to extract pitch sequence from the musical score
        linguistic_features = fe.linguistic_features(
            labels, binary_dict, continuous_dict,
            add_frame_features=True,
            subphone_features=subphone_features)
        f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None]
        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(lf0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")

        f0 = diff_lf0 + lf0_score
        f0[vuv < 0.5] = 0
        f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])
    else:
        f0 = target_f0

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            sample_rate, frame_period)

    return generated_waveform
def make_conversion(root, result_dir, checkpoint,
                    ut_min=91, ut_max=100, sp_min=91, sp_max=100):
    alpha = 0.42
    n_fft = 1024
    root = Path(root)
    result_dir = Path(result_dir)

    dicts = torch.load(checkpoint, map_location='cpu')
    model = VC(dicts['config']['model'], train=False)
    model.load_state_dict(dicts['model'])
    model.remove_wn()
    model = model.eval()

    for s in range(sp_min, sp_max + 1):
        sp = f'jvs{s:03}'
        sp_root = result_dir / sp
        sp_root.mkdir(parents=True, exist_ok=True)
        sp_dict_path = sp_root / 'sp_dict.pt'
        if not sp_dict_path.is_file():
            nonparas = list((root / sp / 'nonpara30/wav24kHz16bit').glob('BASIC5000_*.mcep.npy'))
            index = max(enumerate(nonparas), key=lambda p: p[1].stat().st_size)[0]
            ref_mcep = nonparas[index]
            ref_f0 = ref_mcep.parent / ref_mcep.stem.replace('.mcep', '.f0.npy')
            sp_dict = extract_from(model, ref_mcep, ref_f0, sp_dict_path)
        else:
            sp_dict = torch.load(sp_dict_path)

        for s2 in range(sp_min, sp_max + 1):
            sp2 = f'jvs{s2:03}'
            sp2_root = result_dir / sp2
            sp2_root.mkdir(parents=True, exist_ok=True)
            target_root = sp_root / sp2
            target_root.mkdir(parents=True, exist_ok=True)

            for u in range(ut_min, ut_max + 1):
                src_mcep = root / sp2 / 'parallel100/wav24kHz16bit' / f'VOICEACTRESS100_{u:03}.mcep.npy'
                src_f0 = root / sp2 / 'parallel100/wav24kHz16bit' / f'VOICEACTRESS100_{u:03}.f0.npy'
                src_c0 = root / sp2 / 'parallel100/wav24kHz16bit' / f'VOICEACTRESS100_{u:03}.c0.npy'
                src_ap = root / sp2 / 'parallel100/wav24kHz16bit' / f'VOICEACTRESS100_{u:03}.ap.npy'
                src_dict_path = sp2_root / f'VOICEACTRESS100_{u:03}.pt'
                if not src_dict_path.is_file():
                    src_dict = prep_content(model, src_mcep, src_dict_path)
                else:
                    src_dict = torch.load(src_dict_path)

                converted_mcep = model.reconstruct_mcep(src_dict['cq'], sp_dict['kv']).squeeze().numpy()
                tgt_mcep = target_root / f'VOICEACTRESS100_{u:03}.mcep.npy'
                np.save(tgt_mcep, converted_mcep)

                f0 = np.load(src_f0).astype(np.float64)
                f0 = convert_f0(f0, sp_dict)
                ap = np.load(src_ap).astype(np.float64)
                ap = reconstruct_ap(ap)
                c0 = np.load(src_c0).astype(np.float64)
                assert (c0.shape[0] <= converted_mcep.shape[-1]), \
                    f'{s}->{s2}/{u}, {c0.shape[0]} <= {converted_mcep.shape[-1]}'
                mcep = np.hstack([c0[:, None], converted_mcep[:, :c0.shape[0]].T]).astype(np.float64)
                sp = pysptk.mc2sp(np.ascontiguousarray(mcep), alpha, n_fft)
                wav = pyworld.synthesize(f0, sp, ap, 16000)
                tgt_wav = target_root / f'VOICEACTRESS100_{u:03}.wav'
                wavfile.write(tgt_wav, 16000, (wav * 32768).astype(np.int16))
                print(tgt_wav, flush=True)
def convert_to_feature(self, input: AcousticFeature, out_sampling_rate: Optional[int] = None):
    if out_sampling_rate is None:
        out_sampling_rate = self.config.dataset.param.voice_param.sample_rate

    input_feature = input
    input = self._feature_normalize(input, test=True)
    input = self._encode_feature(input, test=True)

    pad = 128 - input.shape[1] % 128
    input = numpy.pad(input, [(0, 0), (0, pad)], mode='minimum')

    converter = partial(chainer.dataset.convert.concat_examples, device=self.gpu, padding=0)
    inputs = converter([input])

    with chainer.using_config('train', False):
        out = self.model(inputs).data[0]

    if self.gpu is not None:
        out = chainer.cuda.to_cpu(out)
    out = out[:, :-pad]

    out = self._decode_feature(out, test=True)
    out = AcousticFeature(
        f0=out.f0,
        spectrogram=out.spectrogram,
        aperiodicity=out.aperiodicity,
        mfcc=out.mfcc,
        voiced=input_feature.voiced,
    )
    out = self._feature_denormalize(out, test=True)
    out = AcousticFeature(
        f0=out.f0,
        spectrogram=out.spectrogram,
        aperiodicity=input_feature.aperiodicity,
        mfcc=out.mfcc,
        voiced=out.voiced,
    )

    fftlen = pyworld.get_cheaptrick_fft_size(out_sampling_rate)
    spectrogram = pysptk.mc2sp(
        out.mfcc,
        alpha=self._param.acoustic_feature_param.alpha,
        fftlen=fftlen,
    )
    out = AcousticFeature(
        f0=out.f0,
        spectrogram=spectrogram,
        aperiodicity=out.aperiodicity,
        mfcc=out.mfcc,
        voiced=out.voiced,
    ).astype(numpy.float64)
    return out
def world_decode_mc(mc, fs):
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    # coded_sp = coded_sp.astype(np.float32)
    # coded_sp = np.ascontiguousarray(coded_sp)
    alpha = pysptk.util.mcepalpha(fs)
    sp = pysptk.mc2sp(mc, alpha, fftlen)
    # decoded_sp = pyworld.decode_spectral_envelope(coded_sp, fs, fftlen)
    return sp
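# Minimal round-trip sketch for world_decode_mc() (illustrative only): encode a
# CheapTrick spectral envelope to mel-cepstrum with pysptk.sp2mc, then decode it
# back. The waveform, mel-cepstral order and sampling rate are stand-in values.
import numpy as np
import pyworld
import pysptk

fs = 16000
x = np.random.randn(fs).astype(np.float64)   # stand-in waveform
f0, sp, ap = pyworld.wav2world(x, fs)
mc = pysptk.sp2mc(sp, order=24, alpha=pysptk.util.mcepalpha(fs))
sp_hat = world_decode_mc(mc, fs)             # same shape as sp, smoothed by the truncation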
def test_vc_from_path(model, path, data_mean, data_std, diffvc=True):
    model.eval()

    fs, x = wavfile.read(path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]

    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)

    # Apply model
    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)
    y_hat, y_hat_static = model(mc_scaled, R)
    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    mc_static_pred = P.inv_scale(mc_static_pred,
                                 data_mean[:static_dim], data_std[:static_dim])

    outputs = mc_static_pred.copy()

    if diffvc:
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha), hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs
def synthesis_from_mcep(f0, mcep, ap, sr, fftsize, shiftms, alpha, rmcep=None):
    if rmcep is not None:
        mcep = mod_power(mcep, rmcep, alpha=alpha)

    if ap.shape[1] < fftsize // 2 + 1:
        ap = pw.decode_aperiodicity(ap, sr, fftsize)

    sp = pysptk.mc2sp(mcep, alpha, fftsize)
    wav = pw.synthesize(f0, sp, ap, sr, frame_period=shiftms)
    return wav
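# Hypothetical usage of synthesis_from_mcep() with coded aperiodicity (not part
# of the original snippet). The waveform and analysis settings are illustrative.
import numpy as np
import pyworld as pw
import pysptk

sr, fftsize, shiftms = 16000, 1024, 5
alpha = pysptk.util.mcepalpha(sr)
x = np.random.randn(sr).astype(np.float64)   # stand-in waveform
f0, sp, ap = pw.wav2world(x, sr)
mcep = pysptk.sp2mc(sp, order=24, alpha=alpha)
codeap = pw.code_aperiodicity(ap, sr)        # compressed band aperiodicity
wav = synthesis_from_mcep(f0, mcep, codeap, sr, fftsize, shiftms, alpha)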
def main():
    args = get_args()
    debug_args(args)

    # create dir
    os.makedirs("{}/{}_{}".format(args.save_path, args.ssp, args.tsp), exist_ok=True)

    # get norm of lf0
    lf0_norm = {}
    with open(args.norm_txt, 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f.readlines()]
    for line in lines:
        line = line.split()
        lf0_norm[int(line[0])] = {
            'mean': float(line[1]),
            'std': float(line[2])
        }

    checkpoint = torch.load(args.cpt_path, map_location=lambda storage, loc: storage)
    net = checkpoint['model']

    with open('scp/test.scp', 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f.readlines()]
    for wav_id in tqdm(lines, desc='Synthesis'):
        wav_path = path_template.format(args.ssp, wav_id)
        mc, aperiodicity, f0 = get_features(wav_path)
        f0 = transform_f0(lf0_norm, args.ssp, args.tsp, f0)

        mc = Variable(torch.from_numpy(mc.astype(np.float32)))
        length = [len(mc)]
        mc = torch.unsqueeze(mc, dim=0)
        h, c = net.init_hidden(1)
        if args.dual:
            mc, _ = net(mc, length, h, c, dual=False)
        else:
            mc = net(mc, length, h, c)
        mc = mc.squeeze(0).data.numpy()

        spectrogram = pysptk.mc2sp(mc.astype(np.float64),
                                   alpha=config.alpha,
                                   fftlen=config.fftlen)
        waveform = pyworld.synthesize(f0, spectrogram, aperiodicity,
                                      config.fs, config.frame_period)
        maxv = np.iinfo(np.int16).max
        librosa.output.write_wav(
            '{0}/{1}_{2}/cmu_us_arctic_{2}_{3}.wav'.format(
                args.save_path, args.ssp, args.tsp, wav_id),
            (waveform * maxv).astype(np.int16), config.fs)
def gen_wav(self, f0, mgc, bap):
    spectrogram = pysptk.mc2sp(mgc, fftlen=self.fftlen, alpha=self.alpha)
    aperiodicity = pyworld.decode_aperiodicity(
        bap.astype(np.float64), self.sr, self.fftlen)
    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            self.sr, self.frame_period)
    x2 = generated_waveform / np.max(generated_waveform) * 32768
    x2 = x2.astype(np.int16)
    wavfile.write("gen.wav", self.sr, x2)
    with open("gen.wav", 'rb') as fd:
        contents = fd.read()
    intensity = 10 * np.log10(np.sum(spectrogram ** 2, axis=1))
    return contents, intensity
def __call__(self, data: Wave, test):
    acoustic_feature = self._acoustic_feature_process(data, test=True).astype_only_float(self._dtype)
    high_spectrogram = acoustic_feature.spectrogram

    fftlen = pyworld.get_cheaptrick_fft_size(data.sampling_rate)
    low_spectrogram = pysptk.mc2sp(
        acoustic_feature.mfcc,
        alpha=self._alpha,
        fftlen=fftlen,
    )

    feature = LowHighSpectrogramFeature(
        low=low_spectrogram,
        high=high_spectrogram,
    )
    feature.validate()
    return feature
def gen_waveform(y_predicted, do_postfilter=False):
    y_predicted = trim_zeros_frames(y_predicted)

    # Generate parameters and split streams
    mgc, lf0, vuv, bap = gen_parameters(y_predicted)

    if do_postfilter:
        mgc = merlin_post_filter(mgc, alpha)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    # print(bap.shape)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), fs, fftlen)
    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            fs, frame_period)
    return generated_waveform
def synthesis(self, f0, mcep, ap, rmcep=None, alpha=0.42):
    """synthesis generates waveform from F0, mcep, aperiodicity

    Parameters
    ----------
    f0 : array, shape (`T`, `1`)
        array of F0 sequence
    mcep : array, shape (`T`, `dim`)
        array of mel-cepstrum sequence
    ap : array, shape (`T`, `fftlen / 2 + 1`) or (`T`, `dim_codeap`)
        array of aperiodicity or code aperiodicity
    rmcep : array, optional, shape (`T`, `dim`)
        array of reference mel-cepstrum sequence
        Default set to None
    alpha : float, optional
        Parameter of all-pass transfer function
        Default set to 0.42

    Returns
    -------
    wav : array
        Synthesized waveform
    """
    if rmcep is not None:
        # power modification
        mcep = mod_power(mcep, rmcep, alpha=alpha)

    if ap.shape[1] < self.fftl // 2 + 1:
        # decode codeap to ap
        ap = pyworld.decode_aperiodicity(ap, self.fs, self.fftl)

    # mcep into spc
    spc = pysptk.mc2sp(mcep, alpha, self.fftl)

    # generate waveform using world vocoder with f0, spc, ap
    wav = pyworld.synthesize(f0, spc, ap, self.fs, frame_period=self.shiftms)

    return wav
def save_wav_ceps(fake_B, input_path, sample_path):
    length = 14000
    bps, wav_data = wav.read(input_path)
    datas = [
        wav_data[i:i + length, 0] for i in range(0, len(wav_data), length)
    ]
    wave = np.zeros([len(fake_B), length])
    for (b, d) in zip(fake_B, datas):
        # WORLD expects a float64 waveform
        f0, _, pitch = pw.wav2world(d.astype(np.float64), bps)
        for cep in b:
            for i, Scep in enumerate(cep):
                if i == 0:
                    Scep = (Scep * 28) - 20
                else:
                    Scep = (Scep * 7) - 3
                cep[i] = Scep
        sp = pysptk.mc2sp(b, 0.48, 2048)
        w = pw.synthesize(f0, sp, pitch, bps)
        np.append(wave, w)
    wave = np.reshape(wave, -1).astype('int16')
    wav.write(sample_path + '_fake.wav', bps, wave)
def test_one_utt(src_path, tgt_path, disable_mlpg=False, diffvc=True):
    # GMM-based parameter generation is provided by the library in the `baseline` module
    if disable_mlpg:
        # Force disable MLPG
        paramgen = MLPG(gmm, windows=[(0, 0, np.array([1.0]))], diff=diffvc)
    else:
        paramgen = MLPG(gmm, windows=windows, diff=diffvc)

    fs, x = wavfile.read(src_path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    if use_delta:
        mc = delta_features(mc, windows)
    mc = paramgen.transform(mc)
    if disable_mlpg and mc.shape[-1] != static_dim:
        mc = mc[:, :static_dim]
    assert mc.shape[-1] == static_dim
    mc = np.hstack((c0[:, None], mc))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=order, alpha=alpha), hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        spectrogram = pysptk.mc2sp(mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(f0, spectrogram, aperiodicity, fs, frame_period)

    return waveform
def test_one_utt(path_src, path_tgt, disable_mlpg=False, diffvc=True):
    if disable_mlpg:
        paramgen = MLPG(gmm, windows=[(0, 0, np.array([1.0]))], diff=diffvc)
    else:
        paramgen = MLPG(gmm, windows=windows, diff=diffvc)

    x, fs_ = sf.read(path_src)
    x = x.astype(np.float64)
    f0, time_axis = pyworld.dio(x, fs_, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, time_axis, fs_)
    spectrogram = pyworld.cheaptrick(x, f0, time_axis, fs_)
    aperiodicity = pyworld.d4c(x, f0, time_axis, fs_)

    mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    if use_delta:
        mc = delta_features(mc, windows)
    mc = paramgen.transform(mc)
    if disable_mlpg and mc.shape[-1] != static_dim:
        mc = mc[:, :static_dim]
    assert mc.shape[-1] == static_dim
    mc = np.hstack((c0[:, None], mc))
    if diffvc:
        mc[:, 0] = 0
        engine = Synthesizer(MLSADF(order=order, alpha=alpha), hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        spectrogram = pysptk.mc2sp(mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(f0, spectrogram, aperiodicity, fs_, frame_period)

    return waveform
def MCEPs2wav(self, mc, f0, ap):
    sp = pysptk.mc2sp(np.float64(mc), alpha=self.alpha, fftlen=self.n_fft)
    y = pw.synthesize(np.float64(f0), np.float64(sp), np.float64(ap),
                      self.sr, pw.default_frame_period)
    return y.astype(np.float32)
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    model.eval()

    hop_length = int(fs * (hp.frame_period * 0.001))
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]

    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)

    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    lengths = [len(mc_scaled)]

    # Add batch axis
    mc_scaled = mc_scaled.view(1, -1, mc_scaled.size(-1))

    # For MLPG
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)

    # Apply model
    if model.include_parameter_generation():
        # Case: models include parameter generation in itself
        # Multistream features cannot be used in this case
        y_hat, y_hat_static = model(mc_scaled, R, lengths=lengths)
    else:
        # Case: generic models (can be sequence model)
        assert hp.has_dynamic_features is not None
        y_hat = model(mc_scaled, lengths=lengths)
        y_hat_static = multi_stream_mlpg(
            y_hat, R, hp.stream_sizes, hp.has_dynamic_features)

    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    mc_static_pred = P.inv_scale(
        mc_static_pred, data_mean[:static_dim], data_std[:static_dim])

    outputs = mc_static_pred.copy()

    if diffvc:
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha), hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs