def collect_features(self, wav_path, label_path):
    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    mgc = pysptk.sp2mc(spectrogram, order=order,
                       alpha=pysptk.util.mcepalpha(fs))
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    vuv = (lf0 != 0).astype(np.float32)
    lf0 = interp1d(lf0, kind="slinear")

    mgc = apply_delta_windows(mgc, windows)
    lf0 = apply_delta_windows(lf0, windows)
    bap = apply_delta_windows(bap, windows)

    features = np.hstack((mgc, lf0, vuv, bap))

    # Cut silence frames by HTS alignment
    labels = hts.load(label_path)
    features = features[:labels.num_frames()]
    indices = labels.silence_frame_indices()
    features = np.delete(features, indices, axis=0)

    return features.astype(np.float32)
def extract(cls, wave: Wave, frame_period, f0_floor, f0_ceil, fft_length,
            order, alpha, dtype):
    x = wave.wave.astype(numpy.float64)
    fs = wave.sampling_rate

    f0, t = cls.extract_f0(x=x, fs=fs, frame_period=frame_period,
                           f0_floor=f0_floor, f0_ceil=f0_ceil)
    sp = pyworld.cheaptrick(x, f0, t, fs, fft_size=fft_length)
    ap = pyworld.d4c(x, f0, t, fs, fft_size=fft_length)
    mc = pysptk.sp2mc(sp, order=order, alpha=alpha)
    coded_ap = pyworld.code_aperiodicity(ap, fs)
    voiced: numpy.ndarray = ~(f0 == 0)

    # Drop the trailing frame when the signal length is not an exact
    # multiple of the FFT length, keeping all streams the same length
    if len(x) % fft_length > 0:
        f0 = f0[:-1]
        t = t[:-1]
        sp = sp[:-1]
        ap = ap[:-1]
        mc = mc[:-1]
        coded_ap = coded_ap[:-1]
        voiced = voiced[:-1]

    feature = AcousticFeature(
        f0=f0[:, None],
        sp=sp,
        ap=ap,
        coded_ap=coded_ap,
        mc=mc,
        voiced=voiced[:, None],
    )
    feature = feature.astype_only_float(dtype)
    return feature
def extract(wave: Wave, frame_period, f0_floor, f0_ceil, fft_length,
            order, alpha, dtype):
    x = wave.wave.astype(numpy.float64)
    fs = wave.sampling_rate

    f0, t = pyworld.harvest(
        x, fs,
        frame_period=frame_period,
        f0_floor=f0_floor,
        f0_ceil=f0_ceil,
    )
    f0 = pyworld.stonemask(x, f0, t, fs)
    sp = pyworld.cheaptrick(x, f0, t, fs, fft_size=fft_length)
    ap = pyworld.d4c(x, f0, t, fs, fft_size=fft_length)
    mc = pysptk.sp2mc(sp, order=order, alpha=alpha)
    coded_ap = pyworld.code_aperiodicity(ap, fs)
    voiced = ~(f0 == 0)  # type: numpy.ndarray

    feature = AcousticFeature(
        f0=f0[:, None],
        sp=sp,
        ap=ap,
        coded_ap=coded_ap,
        mc=mc,
        voiced=voiced[:, None],
    )
    feature = feature.astype_only_float(dtype)
    feature.validate()
    return feature
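The AcousticFeature container used by the two extract functions above is project code that is not shown here. A minimal sketch of what it plausibly looks like, assuming a plain dataclass (the validate() method and any I/O helpers are omitted):

from dataclasses import dataclass

import numpy


@dataclass
class AcousticFeature:
    f0: numpy.ndarray
    sp: numpy.ndarray
    ap: numpy.ndarray
    coded_ap: numpy.ndarray
    mc: numpy.ndarray
    voiced: numpy.ndarray

    def astype_only_float(self, dtype):
        # Cast the float-valued arrays; the boolean voiced mask keeps its dtype
        return AcousticFeature(f0=self.f0.astype(dtype),
                               sp=self.sp.astype(dtype),
                               ap=self.ap.astype(dtype),
                               coded_ap=self.coded_ap.astype(dtype),
                               mc=self.mc.astype(dtype),
                               voiced=self.voiced)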
def wav2world(wavfile, frame_period):
    wav, fs = librosa.load(wavfile, sr=hp.sample_rate, dtype=np.float64)
    if hp.use_harvest:
        f0, timeaxis = pyworld.harvest(wav, fs, frame_period=frame_period)
    else:
        f0, timeaxis = pyworld.dio(wav, fs, frame_period=frame_period)
        f0 = pyworld.stonemask(wav, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(wav, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(wav, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    hp.num_bap = bap.shape[1]
    alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(spectrogram, order=hp.num_mgc - 1, alpha=alpha)
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    if hp.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)
    # print(mgc.shape, lf0.shape, vuv.shape, bap.shape)
    features = np.hstack((mgc, lf0, vuv, bap))
    return features.astype(np.float32)
def collect_features(self, wav_path):
    # x: raw audio, (sample_length,)
    x, fs = librosa.load(wav_path, sr=self.target_sr, mono=True,
                         dtype=np.float64)

    # f0:  F0, (frame_length,)
    # lf0: log(f0), interpolated with interp1d, (frame_length,)
    # vuv: voiced/unvoiced flag, (frame_length,)
    f0, timeaxis = pyworld.dio(x, self.target_sr,
                               frame_period=self.hop_sz_in_ms)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    lf0 = f0.copy()
    lf0[np.nonzero(f0)] = np.log(f0[np.nonzero(f0)])
    lf0 = interp1d(lf0, kind="slinear")
    vuv = (lf0 != 0).astype(np.float32)

    # spec: spectrogram, (frame_length, 513)
    # bap:  coded aperiodicity, (frame_length, n_bands)
    # mgc:  mel-cepstrum, (frame_length, 60)
    spec = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    mgc = pysptk.sp2mc(spec, order=59, alpha=pysptk.util.mcepalpha(fs))

    # Stack features: 64 dimensions (f0, lf0, vuv, bap, mgc at 16 kHz)
    # plus the raw 513-dim spectrogram
    features = np.hstack((f0[:, None], lf0[:, None], vuv[:, None],
                          bap, mgc, spec))
    return features.astype(np.float32)
def world_spectrogram(wav, sr=_sr, dim_num=32, **kwargs):
    """Convert speech to spectral features with the WORLD vocoder."""
    # Extract the analysis parameters
    frame_period = kwargs.get("frame_period", pw.default_frame_period)
    f0_floor = kwargs.get("f0_floor", pw.default_f0_floor)
    f0_ceil = kwargs.get("f0_ceil", pw.default_f0_ceil)
    fft_size = kwargs.get("fft_size", pw.get_cheaptrick_fft_size(sr, f0_floor))
    ap_threshold = kwargs.get("ap_threshold", 0.85)
    f0_extractor = kwargs.get("f0_extractor", "dio")
    x = wav.astype(np.double)
    if f0_extractor == "dio":
        # Estimate the fundamental frequency F0 with the DIO algorithm
        f0, t = pw.dio(x, sr, f0_floor=f0_floor, f0_ceil=f0_ceil)
    elif f0_extractor == "harvest":
        f0, t = pw.harvest(x, sr, f0_floor=f0_floor, f0_ceil=f0_ceil,
                           frame_period=frame_period)
    else:
        f0, t = f0_extractor(x, sr, f0_floor=f0_floor, f0_ceil=f0_ceil,
                             frame_period=frame_period)

    # Estimate the spectral envelope with the CheapTrick algorithm
    sp = pw.cheaptrick(x, f0, t, sr, f0_floor=f0_floor, fft_size=fft_size)
    # Reduce the spectral envelope dimensionality
    sp_enc = pw.code_spectral_envelope(sp, sr, number_of_dimensions=dim_num)

    # Estimate the aperiodicity
    ap = pw.d4c(x, f0, t, sr, threshold=ap_threshold, fft_size=fft_size)
    # Reduce the aperiodicity dimensionality
    ap_enc = pw.code_aperiodicity(ap, sr)
    return f0, sp_enc, ap_enc
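For reference, a minimal inverse sketch of world_spectrogram (the name world_synthesize is hypothetical): it decodes the compressed envelopes back to full resolution and resynthesizes a waveform with pyworld's standard decode/synthesize calls, assuming the same sr, frame_period, and fft_size as at analysis time:

def world_synthesize(f0, sp_enc, ap_enc, sr=_sr, **kwargs):
    """Decode coded envelopes from world_spectrogram and synthesize audio."""
    frame_period = kwargs.get("frame_period", pw.default_frame_period)
    f0_floor = kwargs.get("f0_floor", pw.default_f0_floor)
    fft_size = kwargs.get("fft_size", pw.get_cheaptrick_fft_size(sr, f0_floor))
    # Restore the full-resolution spectral envelope and aperiodicity
    sp = pw.decode_spectral_envelope(np.ascontiguousarray(sp_enc), sr, fft_size)
    ap = pw.decode_aperiodicity(np.ascontiguousarray(ap_enc), sr, fft_size)
    return pw.synthesize(f0, sp, ap, sr, frame_period)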
def process_wav(wav_path):
    y, osr = sf.read(wav_path, subtype='PCM_16', channels=1,
                     samplerate=48000, endian='LITTLE')  # , start=56640, stop=262560
    sr = 32000
    if osr != sr:
        y = librosa.resample(y, osr, sr)

    # Estimate the fundamental frequency F0 with the Harvest algorithm
    _f0, t = pw.harvest(y, sr, f0_floor=f0_min, f0_ceil=f0_max,
                        frame_period=pw.default_frame_period)
    _f0 = pw.stonemask(y, _f0, t, sr)
    print(_f0.shape)

    # Estimate the spectral envelope with the CheapTrick algorithm
    _sp = pw.cheaptrick(y, _f0, t, sr)
    code_sp = code_harmonic(_sp, 60)
    print(_sp.shape, code_sp.shape)

    # Estimate the aperiodicity
    _ap = pw.d4c(y, _f0, t, sr)
    code_ap = pw.code_aperiodicity(_ap, sr)
    print(_ap.shape, code_ap.shape)

    return _f0, _sp, code_sp, _ap, code_ap
def _process_feature(out_dir, index, wav_path, label_path):
    # get the list of wav files and check that they exist
    wav_files = os.listdir(os.path.dirname(wav_path))
    assert len(wav_files) != 0 and wav_files[0][-4:] == '.wav', \
        "no wav files found!"

    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    n_frames = len(f0)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    mgc = pysptk.sp2mc(spectrogram, order=order,
                       alpha=pysptk.util.mcepalpha(fs))
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    vuv = (lf0 != 0).astype(np.float32)
    lf0 = interp1d(lf0, kind="slinear")

    mgc = apply_delta_windows(mgc, windows)
    lf0 = apply_delta_windows(lf0, windows)
    bap = apply_delta_windows(bap, windows)

    features = np.hstack((mgc, lf0, vuv, bap))

    # get the list of lab files and check that they exist
    lab_files = os.listdir(os.path.dirname(label_path))
    assert len(lab_files) != 0 and lab_files[0][-4:] == '.lab', \
        "no lab files found!"

    # Cut silence frames by HTS alignment
    labels = hts.load(label_path)
    features = features[:labels.num_frames()]
    indices = labels.silence_frame_indices()
    features = np.delete(features, indices, axis=0)
    voiced_frames = features.shape[0]

    # Write the acoustic features to disk:
    acoustic_filename = 'arctic_%05d.npy' % index
    np.save(os.path.join(out_dir, acoustic_filename),
            features.astype(np.float32), allow_pickle=False)
    dataset_ids.append(acoustic_filename[:-4])
    with open(os.path.join(os.path.dirname(out_dir), 'dataset_ids.pkl'),
              'wb') as pklFile:
        pickle.dump(dataset_ids, pklFile)

    # Return a tuple describing this training example:
    return (acoustic_filename, n_frames, voiced_frames)
def _resample_down_aperiodicity(cls, feature, fs, new_fs, new_spectrum_len):
    feature = np.ascontiguousarray(feature)
    coded_ap = pyworld.code_aperiodicity(feature, fs)
    num = cls._get_aperiodicity_num(new_fs)
    if num < coded_ap.shape[1]:
        coded_ap = np.ascontiguousarray(coded_ap[:, :num])
    return pyworld.decode_aperiodicity(coded_ap, new_fs,
                                       (new_spectrum_len - 1) * 2)
def get_feature(wav_path, preprocessing=False, getsize=False):
    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    if audio_world_config.use_harvest:
        f0, timeaxis = pyworld.harvest(
            x, fs,
            frame_period=audio_world_config.frame_period,
            f0_floor=audio_world_config.f0_floor,
            f0_ceil=audio_world_config.f0_ceil)
    else:
        f0, timeaxis = pyworld.dio(
            x, fs,
            frame_period=audio_world_config.frame_period,
            f0_floor=audio_world_config.f0_floor,
            f0_ceil=audio_world_config.f0_ceil)
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(spectrogram, order=audio_world_config.mgc_dim,
                       alpha=alpha)
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    if audio_world_config.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)
    lf0 = P.interp1d(lf0, kind=audio_world_config.f0_interpolation_kind)

    # Parameter trajectory smoothing
    if audio_world_config.mod_spec_smoothing:
        hop_length = int(fs * (audio_world_config.frame_period * 0.001))
        modfs = fs / hop_length
        mgc = P.modspec_smoothing(
            mgc, modfs, cutoff=audio_world_config.mod_spec_smoothing_cutoff)

    mgc = P.delta_features(mgc, audio_world_config.windows)
    lf0 = P.delta_features(lf0, audio_world_config.windows)
    bap = P.delta_features(bap, audio_world_config.windows)

    features = np.hstack((mgc, lf0, vuv, bap))
    if preprocessing:
        out_path = wav_path.replace(".wav", "").replace("wav", "world")
        np.save(out_path, features)
    elif getsize:
        return features, mgc.shape[0], lf0.shape[0], bap.shape[0]
    else:
        return features
def analyze(x, fs, f0_floor, f0_ceil, frame_period=20.0, pitchshift=None):
    if pitchshift is not None:
        f0, spc, ap = analyze_world(x, fs * pitchshift, f0_floor, f0_ceil,
                                    frame_period / pitchshift)
    else:
        f0, spc, ap = analyze_world(x, fs, f0_floor, f0_ceil, frame_period)
    mcep = pysptk.sp2mc(spc, 24, 0.410)
    codeap = pyworld.code_aperiodicity(ap, fs)
    # return x, fs, f0, time_axis, spc, ap, mcep, codeap
    return f0, mcep, codeap
def get_features(x, fs):
    # F0 estimation
    _f0, t = pw.dio(x, fs)
    f0 = pw.stonemask(x, _f0, t, fs)

    # mel-cepstrum estimation
    sp = trim_zeros_frames(pw.cheaptrick(x, f0, t, fs))
    mcep = pysptk.sp2mc(sp, order=24, alpha=pysptk.util.mcepalpha(fs))

    # band aperiodicity estimation
    ap = pw.d4c(x, f0, t, fs)
    bap = pw.code_aperiodicity(ap, fs)

    return f0, mcep, bap
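A possible call site for get_features, assuming a mono 16-bit wav loaded with scipy (pyworld expects a float64 signal); note that trim_zeros_frames above may leave mcep with fewer frames than f0:

from scipy.io import wavfile

fs, x = wavfile.read("sample.wav")  # hypothetical input file
f0, mcep, bap = get_features(x.astype(np.float64), fs)
print(f0.shape, mcep.shape, bap.shape)  # (T,), (<=T, 25), (T, n_bands)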
def _process_utterance(out_dir, index, wav_path, text, phone):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a
    tuple to write to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames.
        wav_path: Path to the audio file containing the speech input
        text: The text spoken in the input audio file

    Returns:
        A (spectrogram_filename, encoded_filename, n_frames, text, phone)
        tuple to write to train.txt
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    if hparams.vocoder == "world":
        spectrogram = audio.spectrogram(wav).astype(np.float32)
        f0, sp, ap = pw.wav2world(wav.astype(np.double), hparams.sample_rate)
        ap_coded = pw.code_aperiodicity(ap, hparams.sample_rate)
        sp_coded = pw.code_spectral_envelope(sp, hparams.sample_rate,
                                             hparams.coded_env_dim)
        world_spec = np.hstack([f0[:, np.newaxis], sp_coded, ap_coded])
        n_frames = world_spec.shape[0]
        spectrogram_filename = 'synpaflex-spec-%05d.npy' % index
        encoded_filename = 'synpaflex-world-%05d.npy' % index
        np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T,
                allow_pickle=False)
        np.save(os.path.join(out_dir, encoded_filename), world_spec,
                allow_pickle=False)
    else:
        # Compute the linear-scale spectrogram from the wav:
        spectrogram = audio.spectrogram(wav).astype(np.float32)
        n_frames = spectrogram.shape[1]

        # Compute a mel-scale spectrogram from the wav:
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

        # Write the spectrograms to disk:
        spectrogram_filename = 'synpaflex-spec-%05d.npy' % index
        encoded_filename = 'synpaflex-mel-%05d.npy' % index
        np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T,
                allow_pickle=False)
        np.save(os.path.join(out_dir, encoded_filename), mel_spectrogram.T,
                allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, encoded_filename, n_frames, text, phone)
def process(filename):
    '''Decompose a wav file into F0, mel-cepstral coefficients, and
    band aperiodicity.

    :param filename: path to wav file
    :return: writes .lf0, .mgc and .bap files
    '''
    file_id = os.path.basename(filename).split(".")[0]
    print('\n' + file_id)

    ### WORLD ANALYSIS -- extract vocoder parameters ###
    # x, fs = librosa.core.load(filename, sr=16000)
    fs, x = wavfile.read(filename)

    # warning: this parameter is important
    alpha = pysptk.util.mcepalpha(fs)
    hopesize = int(0.005 * fs)

    f0 = pysptk.rapt(x.astype(np.float32), fs=fs, hopsize=hopesize,
                     min=60, max=600, voice_bias=0.0, otype=1)
    f0 = f0.astype(np.float64)
    x = x.astype(np.float64) / (2 ** 15)
    _, timeaxis = pyworld.harvest(x, fs, frame_period=5,
                                  f0_floor=60.0, f0_ceil=600)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    f0 = f0[:, None]
    lf0 = f0.copy()
    lf0 = lf0.astype(np.float32)
    nonzero_indices = np.where(f0 != 0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    zero_indices = np.where(f0 == 0)
    lf0[zero_indices] = -1.0E+10
    write_binfile(lf0, os.path.join(lf0_dir, file_id + '.lf0'),
                  dtype=np.float32)

    mc = pysptk.sp2mc(spectrogram, mcsize, alpha=alpha)
    mc = mc.astype(np.float32)
    write_binfile(mc, os.path.join(mgc_dir, file_id + '.mgc'),
                  dtype=np.float32)

    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    bap = bap.astype(np.float32)
    write_binfile(bap, os.path.join(bap_dir, file_id + '.bap'),
                  dtype=np.float32)
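Assuming write_binfile stores raw little-endian float32 frames (the usual HTS-style convention), the three parameter files written above can be loaded back like this; mcsize is the mel-cepstrum order used in the analysis:

lf0 = np.fromfile(os.path.join(lf0_dir, file_id + '.lf0'), dtype=np.float32)
mgc = np.fromfile(os.path.join(mgc_dir, file_id + '.mgc'),
                  dtype=np.float32).reshape(-1, mcsize + 1)
bap = np.fromfile(os.path.join(bap_dir, file_id + '.bap'),
                  dtype=np.float32).reshape(len(lf0), -1)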
def feature_extract(wav_list, arr):
    n_sample = 0
    n_frame = 0
    max_frame = 0
    count = 1
    coeff = np.array([-0.5, 0.5, 0.0])
    for wav_name in wav_list:
        # load wav file and apply low-cut filter
        fs, x = read_wav(wav_name, cutoff=70)
        n_sample += x.shape[0]
        logging.info(wav_name + " " + str(x.shape[0]) + " " +
                     str(n_sample) + " " + str(count))

        # check sampling frequency
        if not fs == args.fs:
            logging.debug("ERROR: sampling frequency is not matched.")
            sys.exit(1)

        hdf5name = args.hdf5dir + "/" + \
            os.path.basename(wav_name).replace(".wav", ".h5")

        # estimate f0 and ap
        time_axis, f0, spc, ap = analyze_range(x, fs=args.fs, minf0=minf0,
                                               maxf0=maxf0,
                                               fperiod=args.shiftms,
                                               fftl=args.fftl)
        write_hdf5(hdf5name, '/ap', ap)
        write_hdf5(hdf5name, "/f0", f0)

        # convert to continuous f0 and apply low-pass filter
        uv, cont_f0 = convert_continuos_f0(np.array(f0))
        cont_f0_lpf = low_pass_filter(cont_f0,
                                      int(1.0 / (args.shiftms * 0.001)),
                                      cutoff=20)
        cont_f0_lpf = np.expand_dims(cont_f0_lpf, axis=-1)
        uv = np.expand_dims(uv, axis=-1)
        write_hdf5(hdf5name, "/lcf0", np.log(cont_f0_lpf))
        write_hdf5(hdf5name, "/uv", uv)

        # estimate coded aperiodicity
        codeap = pw.code_aperiodicity(ap, args.fs)
        if codeap.ndim == 1:
            # when fs == 16000
            codeap = np.expand_dims(codeap, axis=-1)
        write_hdf5(hdf5name, "/codeap", codeap)

        # mel-cepstrum
        mcep = ps.sp2mc(spc, args.mcep_dim, mcep_alpha)
        write_hdf5(hdf5name, "/mcep", mcep)
def _resample_up_aperiodicity(cls, feature, fs, new_fs, new_spectrum_len):
    feature = np.ascontiguousarray(feature)
    coded_ap = pyworld.code_aperiodicity(feature, fs)
    num = cls._get_aperiodicity_num(new_fs)
    if num > coded_ap.shape[1]:
        freq_axis = np.hstack((np.arange(coded_ap.shape[1]),
                               new_fs / 2 / cls.FREQUENCY_INTERVAL - 1))
        coded_ap = np.hstack((coded_ap,
                              np.full((coded_ap.shape[0], 1),
                                      -cls.SAFE_GUARD_MINIMUM)))
        ap_interp = scipy.interpolate.interp1d(freq_axis, coded_ap, axis=1)
        coded_ap = np.ascontiguousarray(ap_interp(np.arange(num)))
    return pyworld.decode_aperiodicity(coded_ap, new_fs,
                                       (new_spectrum_len - 1) * 2)
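The _get_aperiodicity_num helper shared by the two resampling methods above is not shown. A plausible sketch, assuming it follows WORLD's band-count rule (one coded band per 3 kHz frequency interval, capped at 15 kHz), which pyworld also exposes directly as get_num_aperiodicities:

@classmethod
def _get_aperiodicity_num(cls, fs):
    # Assumed equivalent to pyworld.get_num_aperiodicities(fs), with
    # cls.FREQUENCY_INTERVAL == 3000 (Hz) as in WORLD
    return int(min(15000.0, fs / 2 - cls.FREQUENCY_INTERVAL)
               / cls.FREQUENCY_INTERVAL)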
def collect_features(self, wav_path, label_path):
    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    if hp_acoustic.use_harvest:
        f0, timeaxis = pyworld.harvest(
            x, fs, frame_period=hp_acoustic.frame_period,
            f0_floor=hp_acoustic.f0_floor, f0_ceil=hp_acoustic.f0_ceil)
    else:
        f0, timeaxis = pyworld.dio(
            x, fs, frame_period=hp_acoustic.frame_period,
            f0_floor=hp_acoustic.f0_floor, f0_ceil=hp_acoustic.f0_ceil)
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    if self.alpha is None:
        self.alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(spectrogram, order=hp_acoustic.order,
                       alpha=self.alpha)
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    if hp_acoustic.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)
    lf0 = P.interp1d(lf0, kind=hp_acoustic.f0_interpolation_kind)

    # Parameter trajectory smoothing
    if hp_acoustic.mod_spec_smoothing:
        hop_length = int(fs * (hp_acoustic.frame_period * 0.001))
        modfs = fs / hop_length
        mgc = P.modspec_smoothing(
            mgc, modfs, cutoff=hp_acoustic.mod_spec_smoothing_cutoff)

    mgc = P.delta_features(mgc, hp_acoustic.windows)
    lf0 = P.delta_features(lf0, hp_acoustic.windows)
    bap = P.delta_features(bap, hp_acoustic.windows)

    features = np.hstack((mgc, lf0, vuv, bap))

    # Cut silence frames by HTS alignment
    labels = hts.load(label_path)
    features = features[:labels.num_frames()]
    indices = labels.silence_frame_indices()
    features = np.delete(features, indices, axis=0)

    return features.astype(np.float32)
def get_acoustic_feature(lab_path, wav_path, sampling_rate, hop_size_in_ms,
                         mcep_order, windows):
    fs, audio = wavfile.read(wav_path)
    audio = audio.astype(np.float64) / 2 ** 15
    if fs != sampling_rate:
        audio = audio.astype(np.float32)
        audio = librosa.resample(audio, fs, sampling_rate)
        audio = (audio * 2 ** 15).astype(np.float64)

    # extract f0
    f0, timeaxis = pyworld.dio(audio, sampling_rate,
                               frame_period=hop_size_in_ms)
    # refine f0
    f0 = pyworld.stonemask(audio, f0, timeaxis, sampling_rate)
    # voiced/unvoiced flag
    vuv = (f0 > 0)[:, None].astype(np.float32)
    # calculate log f0
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    # interpolate f0 in log domain
    lf0 = interp1d(lf0, kind='slinear')[:, None]
    # calculate mel-cepstrum
    spectrogram = pyworld.cheaptrick(audio, f0, timeaxis, sampling_rate)
    mgc = pysptk.sp2mc(spectrogram, order=mcep_order,
                       alpha=pysptk.util.mcepalpha(sampling_rate))
    # calculate aperiodicity parameter
    aperiodicity = pyworld.d4c(audio, f0, timeaxis, sampling_rate)
    bap = pyworld.code_aperiodicity(aperiodicity, sampling_rate)

    # calculate dynamic features
    mgc = apply_delta_windows(mgc, windows)
    lf0 = apply_delta_windows(lf0, windows)
    bap = apply_delta_windows(bap, windows)

    feature = np.hstack((mgc, lf0, vuv, bap))

    # cut silence frames by HTS alignment
    labels = hts.load(lab_path)
    feature = feature[:labels.num_frames()]
    if labels.num_frames() > len(feature):
        return
    indices = labels.silence_frame_indices()
    feature = np.delete(feature, indices, axis=0)
    return feature.astype(np.float32)
def codeap(self):
    """Return coded aperiodicity sequence.

    Returns
    -------
    codeap : array, shape (`T`, `dim`)
        Encoded aperiodicity sequence of the waveform.
        The `dim` of the coded aperiodicity is determined by `fs` as
        follows:

            fs = 16000 : 1
            fs = 22050 : 2
            fs = 44100 : 5
            fs = 48000 : 5

    """
    self._analyzed_check()
    return pyworld.code_aperiodicity(self._ap, self.fs)
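The band counts quoted in the docstring follow WORLD's rule (one band per 3 kHz, capped at 15 kHz) and can be checked against pyworld directly, since it exposes WORLD's GetNumberOfAperiodicities:

import pyworld

for fs in (16000, 22050, 44100, 48000):
    print(fs, pyworld.get_num_aperiodicities(fs))  # -> 1, 2, 5, 5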
def _extract_static_feats(wav, sr):
    f0, timeaxis = pyworld.dio(wav, sr, frame_period=5)
    spectrogram = pyworld.cheaptrick(wav, f0, timeaxis, sr)
    aperiodicity = pyworld.d4c(wav, f0, timeaxis, sr)
    mgc = pysptk.sp2mc(spectrogram, order=59,
                       alpha=pysptk.util.mcepalpha(sr))
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    vuv = (lf0 != 0).astype(np.float32)
    lf0 = interp1d(lf0, kind="slinear")
    bap = pyworld.code_aperiodicity(aperiodicity, sr)

    feats = np.hstack((mgc, lf0, vuv, bap)).astype(np.float32)
    stream_sizes = [mgc.shape[1], lf0.shape[1], vuv.shape[1], bap.shape[1]]

    return feats, stream_sizes
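The returned stream_sizes make it straightforward to split the stacked matrix back into its four streams later, for example:

feats, stream_sizes = _extract_static_feats(wav, sr)
mgc, lf0, vuv, bap = np.split(feats, np.cumsum(stream_sizes)[:-1], axis=1)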
def collect_features(self, wav_path, label_path):
    # fs, x = wavfile.read(wav_path)
    d = wavio.read(wav_path)
    fs, x = d.rate, d.data
    print(fs, wav_path)
    if len(x.shape) > 1:
        x = x[:, 0]  # keep the first channel only
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    mgc = pysptk.sp2mc(spectrogram, order=order,
                       alpha=pysptk.util.mcepalpha(fs))
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    vuv = (lf0 != 0).astype(np.float32)  # 1 dim
    lf0 = interp1d(lf0, kind="slinear")

    mgc = apply_delta_windows(mgc, windows)  # 180 dims
    lf0 = apply_delta_windows(lf0, windows)  # 3 dims
    bap = apply_delta_windows(bap, windows)  # 3 dims (biaobei: 15)

    features = np.hstack((mgc, lf0, vuv, bap))  # 187 dims (biaobei: 199)

    # Cut silence frames by HTS alignment
    labels = hts.load(label_path)
    features = features[:labels.num_frames()]
    indices = labels.silence_frame_indices()
    if len(indices) > 0:
        features = np.delete(features, indices, axis=0)

    return features.astype(np.float32)
def extract_timbre_data(args):
    audio_data, frequency, timing, sample_rate = args

    # The spectral envelope is obtained by taking short time windows (frames)
    # of the frequency-time audio and Fourier transforming them into the
    # frequency domain
    spectral_data = pyworld.cheaptrick(audio_data, frequency, timing,
                                       sample_rate)
    aperiodic_data = pyworld.d4c(audio_data, frequency, timing, sample_rate)

    # 1. First take the spectral envelope and convert it to mel cepstrum (MFCC)
    #    1.1 The spectral envelope is the short-time Fourier transform of the
    #        signal, binned by frequency
    #    1.2 For MFCC we first map the powers of the spectrum onto the mel scale
    #    1.3 Take the log of each mel frequency band and apply the Discrete
    #        Cosine Transform to get the MFCC
    #    1.4 MFCCs are in the form of amplitudes; the bands are spaced so that
    #        humans can distinguish them, unlike linearly spaced bands
    # 2. After breaking down into MFCC bins, the DC component (bin 0) and the
    #    Nyquist component (last bin) are scaled by two
    # 3. Using the above, a mirror spectrum is created
    # 4. The Fourier transform of the mirror gives the MFSC
    #    (MFCC -> reversed cosine transform -> MFSC), recovering real values
    #    in the frequency range
    mcep_floor = 10 ** (-80 / 20)
    spectral_mel = np.apply_along_axis(pysptk.mcep, 1, spectral_data,
                                       params.mcep_order - 1,
                                       params.mcep_alpha,
                                       itype=params.mcep_input,
                                       threshold=mcep_floor)
    scale_mel = copy.copy(spectral_mel)
    scale_mel[:, 0] *= 2
    scale_mel[:, -1] *= 2

    # Create the mirror: scale_mel[:, -1:0:-1] selects all rows, columns from
    # last to first in reverse order
    mirror = np.hstack([scale_mel[:, :-1], scale_mel[:, -1:0:-1]])
    mfsc = np.fft.rfft(mirror).real
    spectral_data = pd.DataFrame(mfsc)

    aperiodic_data = pyworld.code_aperiodicity(aperiodic_data, sample_rate)
    aperiodic_data = pd.DataFrame(aperiodic_data)

    return spectral_data, aperiodic_data
def collect_features(self, wav_path, label_path):
    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs,
                               frame_period=hp_acoustic.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    if self.alpha is None:
        self.alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(spectrogram, order=hp_acoustic.order,
                       alpha=self.alpha)
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    vuv = (lf0 != 0).astype(np.float32)
    lf0 = P.interp1d(lf0, kind=hp_acoustic.f0_interpolation_kind)

    # 50 Hz parameter trajectory smoothing
    hop_length = int(fs * (hp_acoustic.frame_period * 0.001))
    modfs = fs / hop_length
    mgc = P.modspec_smoothing(mgc, modfs, cutoff=50)

    mgc = P.delta_features(mgc, hp_acoustic.windows)
    lf0 = P.delta_features(lf0, hp_acoustic.windows)
    bap = P.delta_features(bap, hp_acoustic.windows)

    features = np.hstack((mgc, lf0, vuv, bap))

    # Cut silence frames by HTS alignment
    labels = hts.load(label_path)
    features = features[:labels.num_frames()]
    indices = labels.silence_frame_indices()
    features = np.delete(features, indices, axis=0)

    return features.astype(np.float32)
def process_wav(wav_path):
    y, osr = sf.read(wav_path, subtype='PCM_16', channels=1,
                     samplerate=48000, endian='LITTLE')  # , start=56640, stop=262560
    sr = 32000
    y = librosa.resample(y, osr, sr)

    # Estimate the fundamental frequency F0 with the DIO algorithm
    _f0, t = pw.dio(y, sr, f0_floor=50.0, f0_ceil=800.0,
                    channels_in_octave=2,
                    frame_period=pw.default_frame_period)
    print(_f0.shape)

    # Estimate the spectral envelope with the CheapTrick algorithm
    _sp = pw.cheaptrick(y, _f0, t, sr)
    code_sp = pw.code_spectral_envelope(_sp, sr, 60)
    print(_sp.shape, code_sp.shape)

    # Estimate the aperiodicity
    _ap = pw.d4c(y, _f0, t, sr)
    code_ap = pw.code_aperiodicity(_ap, sr)
    print(_ap.shape, code_ap.shape)

    np.save('data/prepared_data/f0', _f0)
    np.save('data/prepared_data/ap', code_ap)

    # Resynthesize the speech (with F0 shifted down by 200 Hz)
    synthesized = pw.synthesize(_f0 - 200, _sp, _ap, 32000,
                                pw.default_frame_period)
    # 1. Write out the synthesized speech
    sf.write('./data/gen_wav/test-200.wav', synthesized, 32000)
def collect_features(self, wav_path):
    '''
    Args:
        wav_path: str - path to a wav file

    Returns:
        x: np.ndarray (T,) - time-domain audio signal, followed by the
        extracted mgc, lf0, f0, bap and vuv features, the sampling rate
        and the time axis
    '''
    fs, x = wavfile.read(wav_path)
    g, f = x.T  # stereo file: keep the first channel
    x = g[:fs * 8].astype(np.float64)  # use the first 8 seconds
    f0, timeaxis = pyworld.dio(x, fs, frame_period=self.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    mgc = pysptk.sp2mc(spectrogram, order=self.order,
                       alpha=pysptk.util.mcepalpha(fs))
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    vuv = (lf0 != 0).astype(np.float32)

    mgc = apply_delta_windows(mgc, self.windows)
    lf0 = apply_delta_windows(lf0, self.windows)
    bap = apply_delta_windows(bap, self.windows)

    features = np.hstack((mgc, lf0, vuv, bap))
    return x, mgc, lf0, f0, bap, vuv, fs, timeaxis
def process_utterance(wav: np.ndarray, text: str, out_dir: Path,
                      basename: str, skip_existing: bool, hparams,
                      random_uttBasename_forSpkEmbedding=None):
    '''
    random_uttBasename_forSpkEmbedding: if not None, use that utterance to
    generate the speaker embedding in synthesizer training.
    '''
    ## FOR REFERENCE:
    # For you not to lose your head if you ever wish to change things here or
    # implement your own synthesizer.
    # - Both the audios and the mel spectrograms are saved as numpy arrays
    # - There is no processing done to the audios that will be saved to disk
    #   beyond volume normalization (in split_on_silences)
    # - However, pre-emphasis is applied to the audios before computing the
    #   mel spectrogram. This is why we re-apply it on the audio on the side
    #   of the vocoder.
    # - Librosa pads the waveform before computing the mel spectrogram. Here,
    #   the waveform is saved without extra padding. This means that you won't
    #   have an exact relation between the length of the wav and of the mel
    #   spectrogram. See the vocoder data loader.

    # Skip existing utterances if needed
    mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
    wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
    if skip_existing and mel_fpath.exists() and wav_fpath.exists():
        return None

    # Skip utterances that are too short
    if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
        return None

    wav = trim_long_silences(wav, hparams.vad_window_length,
                             hparams.sample_rate)

    # Feature extraction with the WORLD vocoder
    wav = wav.astype(np.float64)
    f0, sp, ap = pw.wav2world(wav, hparams.sample_rate)
    n_frames = len(f0)
    # reduce the dimension of ap from 513 to the number of coded bands
    enc_ap = pw.code_aperiodicity(ap, hparams.sample_rate)

    # feature normalization
    lf0 = audio.f0_normalize(f0)
    mgc = audio.sp_normalize(sp, hparams)
    bap = audio.ap_normalize(enc_ap)

    lf0 = np.reshape(lf0, [n_frames, 1])
    mel_spectrogram = np.concatenate((lf0, mgc, bap), axis=-1)  # [frame, 1+60+1]

    # Skip utterances that are too long
    if n_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Write the spectrogram, embed and audio to disk
    np.save(mel_fpath, mel_spectrogram, allow_pickle=False)
    np.save(wav_fpath, wav, allow_pickle=False)

    # Return a tuple describing this training example
    embed_basename = basename
    if random_uttBasename_forSpkEmbedding is not None:
        embed_basename = random_uttBasename_forSpkEmbedding
    return (wav_fpath.name, mel_fpath.name, "embed-%s.npy" % embed_basename,
            len(wav), n_frames, text)
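The audio.f0_normalize / sp_normalize / ap_normalize helpers above are project-specific and not shown. A plausible sketch of the F0 one, assuming it only moves voiced frames into the log domain:

def f0_normalize(f0):
    # Hypothetical helper: log-F0 for voiced frames, 0.0 for unvoiced
    return np.where(f0 > 0.0, np.log(f0), 0.0)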
def collect_features(self, wav_path, label_path):
    labels = hts.load(label_path)
    l_features = fe.linguistic_features(
        labels, self.binary_dict, self.continuous_dict,
        add_frame_features=True, subphone_features="coarse_coding")

    f0_score = _midi_to_hz(l_features, self.pitch_idx, False)
    notes = l_features[:, self.pitch_idx]
    notes = notes[notes > 0]
    # allow 1 tone above/below the score range
    min_f0 = librosa.midi_to_hz(min(notes) - 2)
    max_f0 = librosa.midi_to_hz(max(notes) + 2)
    assert max_f0 > min_f0

    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    if self.use_harvest:
        f0, timeaxis = pyworld.harvest(x, fs,
                                       frame_period=self.frame_period,
                                       f0_floor=min_f0, f0_ceil=max_f0)
    else:
        f0, timeaxis = pyworld.dio(x, fs, frame_period=self.frame_period,
                                   f0_floor=min_f0, f0_ceil=max_f0)
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs,
                                     f0_floor=self.f0_floor)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    mgc = pysptk.sp2mc(spectrogram, order=self.mgc_order,
                       alpha=pysptk.util.mcepalpha(fs))

    # F0 of speech
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    if self.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)
    lf0 = interp1d(lf0, kind="slinear")

    # Adjust lengths
    mgc = mgc[:labels.num_frames()]
    lf0 = lf0[:labels.num_frames()]
    vuv = vuv[:labels.num_frames()]
    bap = bap[:labels.num_frames()]

    if self.relative_f0:
        # F0 derived from the musical score
        f0_score = f0_score[:, None]
        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(f0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")
        # relative f0
        diff_lf0 = lf0 - lf0_score
        diff_lf0 = np.clip(diff_lf0, np.log(0.5), np.log(2.0))
        f0_target = diff_lf0
    else:
        f0_target = lf0

    mgc = apply_delta_windows(mgc, self.windows)
    f0_target = apply_delta_windows(f0_target, self.windows)
    bap = apply_delta_windows(bap, self.windows)

    features = np.hstack((mgc, f0_target, vuv, bap)).astype(np.float32)

    # Align waveform and features
    wave = x.astype(np.float32) / 2 ** 15
    T = int(features.shape[0] * (fs * self.frame_period / 1000))
    if len(wave) < T:
        if T - len(wave) > 100:
            print("Warn!!", T, len(wave), T - len(wave))
            print("you have unexpected input. Please debug through ipdb")
            import ipdb; ipdb.set_trace()
        wave = np.pad(wave, (0, T - len(wave)))
    assert wave.shape[0] >= T
    wave = wave[:T]

    return features, wave
def ap2bap(ap, fs):
    bap = pw.code_aperiodicity(ap, fs)
    return bap
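And the inverse, for a round trip back to a full-band aperiodicity matrix; fft_size must match the analysis-time FFT length, i.e. (n_bins - 1) * 2 for a (T, n_bins) aperiodicity:

def bap2ap(bap, fs, fft_size):
    # decode_aperiodicity expects a C-contiguous float64 array
    return pw.decode_aperiodicity(np.ascontiguousarray(bap), fs, fft_size)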
def decode_RNN(wav_list, gpu, cvlist=None, cvlist_src=None,
               mcd_cvlist_src=None, mcdstd_cvlist_src=None,
               mcdpow_cvlist_src=None, mcdpowstd_cvlist_src=None):
    with torch.cuda.device(gpu):
        mean_trg = torch.FloatTensor(
            read_hdf5(args.stats_jnt,
                      "/mean_feat_org_lf0")[config.stdim:]).cuda()
        std_trg = torch.FloatTensor(
            read_hdf5(args.stats_jnt,
                      "/scale_feat_org_lf0")[config.stdim:]).cuda()

        # define model and load parameters
        logging.info(config)
        logging.info("model")
        with torch.no_grad():
            model_encoder = GRU_RNN_STOCHASTIC(
                in_dim=config.in_dim,
                out_dim=config.lat_dim,
                hidden_layers=config.hidden_layers,
                hidden_units=config.hidden_units,
                kernel_size=config.kernel_size_enc,
                dilation_size=config.dilation_size_enc,
                arparam=config.arparam,
                spk_dim=n_spk,
                causal_conv=config.causal_conv,
                scale_out_flag=False)
            model_decoder = GRU_RNN(
                in_dim=config.lat_dim + n_spk,
                out_dim=config.out_dim,
                hidden_layers=config.hidden_layers,
                hidden_units=config.hidden_units,
                kernel_size=config.kernel_size_dec,
                dilation_size=config.dilation_size_dec,
                causal_conv=config.causal_conv,
                scale_in_flag=False)
            logging.info(model_encoder)
            logging.info(model_decoder)
            model_encoder.load_state_dict(
                torch.load(args.model)["model_encoder"])
            model_decoder.load_state_dict(
                torch.load(args.model)["model_decoder"])
            model_encoder.cuda()
            model_decoder.cuda()
            model_encoder.eval()
            model_decoder.eval()
            for param in model_encoder.parameters():
                param.requires_grad = False
            for param in model_decoder.parameters():
                param.requires_grad = False

        if config.arparam:
            init_pp = np.zeros((1, 1, config.lat_dim * 2 + n_spk))
        else:
            init_pp = np.zeros((1, 1, config.lat_dim + n_spk))
        y_in_pp = torch.FloatTensor(init_pp).cuda()
        y_in_src = y_in_trg = torch.unsqueeze(
            torch.unsqueeze((0 - mean_trg) / std_trg, 0), 0)

        fs = args.fs
        fft_size = args.fftl
        mcep_dim = model_decoder.out_dim - 1
        for wav_file in wav_list:
            # convert mcep
            feat_file = os.path.join(
                args.h5outdir,
                os.path.basename(wav_file).replace(".wav", ".h5"))
            logging.info("cvmcep " + feat_file + " " + wav_file)

            fs, x = read_wav(wav_file, cutoff=70)

            time_axis, f0, sp, ap = analyze_range(
                x, fs=fs, minf0=args.minf0, maxf0=args.maxf0,
                fperiod=args.shiftms, fftl=args.fftl)
            logging.info(sp.shape)

            mcep = ps.sp2mc(sp, mcep_dim, args.mcep_alpha)
            logging.info(mcep.shape)

            codeap = pw.code_aperiodicity(ap, fs)
            logging.info(codeap.shape)

            npow = spc2npow(sp)
            logging.info(npow.shape)

            _, spcidx = extfrm(mcep, npow, power_threshold=args.pow)
            spcidx = spcidx[0]
            logging.info(spcidx.shape)

            uv, contf0 = convert_continuos_f0(np.array(f0))
            uv = np.expand_dims(uv, axis=-1)
            logging.info(uv.shape)
            cont_f0_lpf = low_pass_filter(
                contf0, int(1.0 / (args.shiftms * 0.001)), cutoff=LP_CUTOFF)
            logcontf0 = np.expand_dims(np.log(cont_f0_lpf), axis=-1)
            logging.info(logcontf0.shape)
            feat = np.c_[uv, logcontf0, codeap, mcep]
            logging.info(feat.shape)

            logging.info("generate")
            with torch.no_grad():
                lat_feat_src, _, _, _, _ = model_encoder(
                    torch.FloatTensor(feat).cuda(), y_in_pp, sampling=False)

                src_code = np.zeros((lat_feat_src.shape[0], n_spk))
                src_code[:, src_code_idx] = 1
                src_code = torch.FloatTensor(src_code).cuda()
                trg_code = np.zeros((lat_feat_src.shape[0], n_spk))
                trg_code[:, trg_code_idx] = 1
                trg_code = torch.FloatTensor(trg_code).cuda()

                cvmcep_src, _, _ = model_decoder(
                    torch.cat((src_code, lat_feat_src), 1), y_in_src)
                cvmcep_src = np.array(cvmcep_src.cpu().data.numpy(),
                                      dtype=np.float64)
                cvmcep, _, _ = model_decoder(
                    torch.cat((trg_code, lat_feat_src), 1), y_in_trg)
                cvmcep = np.array(cvmcep.cpu().data.numpy(),
                                  dtype=np.float64)

            logging.info(lat_feat_src.shape)
            logging.info(cvmcep_src.shape)
            logging.info(cvmcep.shape)

            cvf0 = convert_f0(f0, f0_range_mean_src, f0_range_std_src,
                              f0_range_mean_trg, f0_range_std_trg)
            uv_cv, contf0_cv = convert_continuos_f0(np.array(cvf0))
            uv_cv = np.expand_dims(uv_cv, axis=-1)
            logging.info(uv_cv.shape)
            cont_f0_lpf_cv = low_pass_filter(
                contf0_cv, int(1.0 / (args.shiftms * 0.001)),
                cutoff=LP_CUTOFF)
            logcontf0_cv = np.expand_dims(np.log(cont_f0_lpf_cv), axis=-1)
            logging.info(logcontf0_cv.shape)
            feat_cv = np.c_[uv_cv, logcontf0_cv, codeap]
            logging.info(feat_cv.shape)

            feat_cvmcep = np.c_[feat_cv, cvmcep]
            logging.info(feat_cvmcep.shape)
            write_path = '/feat_cvmcep_cycvae-' + model_epoch
            logging.info(feat_file + ' ' + write_path)
            write_hdf5(feat_file, write_path, feat_cvmcep)

            cvlist.append(np.var(cvmcep[:, 1:], axis=0))

            _, mcdpow_arr = dtw.calc_mcd(
                np.array(mcep[np.array(spcidx), :], dtype=np.float64),
                np.array(cvmcep_src[np.array(spcidx), :], dtype=np.float64))
            _, mcd_arr = dtw.calc_mcd(
                np.array(mcep[np.array(spcidx), 1:], dtype=np.float64),
                np.array(cvmcep_src[np.array(spcidx), 1:], dtype=np.float64))
            mcdpow_mean = np.mean(mcdpow_arr)
            mcdpow_std = np.std(mcdpow_arr)
            mcd_mean = np.mean(mcd_arr)
            mcd_std = np.std(mcd_arr)
            logging.info("mcdpow_src_cv: %.6f dB +- %.6f" %
                         (mcdpow_mean, mcdpow_std))
            logging.info("mcd_src_cv: %.6f dB +- %.6f" %
                         (mcd_mean, mcd_std))
            mcdpow_cvlist_src.append(mcdpow_mean)
            mcdpowstd_cvlist_src.append(mcdpow_std)
            mcd_cvlist_src.append(mcd_mean)
            mcdstd_cvlist_src.append(mcd_std)
            cvlist_src.append(np.var(cvmcep_src[:, 1:], axis=0))

            logging.info("synth voco")
            cvsp = ps.mc2sp(np.ascontiguousarray(cvmcep), args.mcep_alpha,
                            fft_size)
            logging.info(cvsp.shape)
            wav = np.clip(
                pw.synthesize(cvf0, cvsp, ap, fs,
                              frame_period=args.shiftms), -1, 1)
            wavpath = os.path.join(
                args.outdir,
                os.path.basename(wav_file).replace(".wav", "_cv.wav"))
            sf.write(wavpath, wav, fs, 'PCM_16')
            logging.info(wavpath)

            logging.info("synth anasyn")
            wav = np.clip(
                pw.synthesize(f0, sp, ap, fs, frame_period=args.shiftms),
                -1, 1)
            wavpath = os.path.join(
                args.outdir,
                os.path.basename(wav_file).replace(".wav", "_anasyn.wav"))
            sf.write(wavpath, wav, fs, 'PCM_16')
            logging.info(wavpath)
def main(args):
    if os.path.isdir('test'):
        rmtree('test')
    os.mkdir('test')

    # x, fs = sf.read('utterance/vaiueo2d.wav')
    x, fs = sf.read('utterance/p226_002.wav')
    # x, fs = librosa.load('utterance/vaiueo2d.wav', dtype=np.float64)

    # 1. A convenient way
    f0, sp, ap = pw.wav2world(x, fs)  # use default options
    y = pw.synthesize(f0, sp, ap, fs, pw.default_frame_period)

    # 2. Step by step
    # 2-1 Without F0 refinement
    _f0, t = pw.dio(x, fs, f0_floor=50.0, f0_ceil=600.0,
                    channels_in_octave=2,
                    frame_period=args.frame_period,
                    speed=args.speed)
    _sp = pw.cheaptrick(x, _f0, t, fs)
    _ap = pw.d4c(x, _f0, t, fs)
    _y = pw.synthesize(_f0, _sp, _ap, fs, args.frame_period)
    # librosa.output.write_wav('test/y_without_f0_refinement.wav', _y, fs)
    sf.write('test/y_without_f0_refinement.wav', _y, fs)

    # 2-2 DIO with F0 refinement (using Stonemask)
    f0 = pw.stonemask(x, _f0, t, fs)
    sp = pw.cheaptrick(x, f0, t, fs)
    ap = pw.d4c(x, f0, t, fs)
    y = pw.synthesize(f0, sp, ap, fs, args.frame_period)
    # librosa.output.write_wav('test/y_with_f0_refinement.wav', y, fs)
    sf.write('test/y_with_f0_refinement.wav', y, fs)

    # 2-3 Harvest with F0 refinement (using Stonemask)
    _f0_h, t_h = pw.harvest(x, fs)
    f0_h = pw.stonemask(x, _f0_h, t_h, fs)
    sp_h = pw.cheaptrick(x, f0_h, t_h, fs)
    ap_h = pw.d4c(x, f0_h, t_h, fs)
    y_h = pw.synthesize(f0_h, sp_h, ap_h, fs, pw.default_frame_period)
    # librosa.output.write_wav('test/y_harvest_with_f0_refinement.wav', y_h, fs)
    sf.write('test/y_harvest_with_f0_refinement.wav', y_h, fs)

    # 2-4 DIO with F0 refinement (using Stonemask). Code and restore sp, ap.
    code_sp = pw.code_spectral_envelope(sp, fs, 80)
    code_ap = pw.code_aperiodicity(ap, fs)
    fft_size = (sp.shape[1] - 1) * 2
    rest_sp = pw.decode_spectral_envelope(code_sp, fs, fft_size)
    rest_ap = pw.decode_aperiodicity(code_ap, fs, fft_size)
    y_r = pw.synthesize(f0, rest_sp, rest_ap, fs, args.frame_period)
    sf.write('test/y_with_f0_refinement_code_and_restore.wav', y_r, fs)
    print("fft size: {:d}".format(fft_size))
    print("coded sp shape: ({:d}, {:d})".format(code_sp.shape[0],
                                                code_sp.shape[1]))
    print("coded ap shape: ({:d}, {:d})".format(code_ap.shape[0],
                                                code_ap.shape[1]))

    # 2-5 DIO with F0 refinement (using Stonemask). Code and restore sp, ap.
    #     frame_shift: 12.5 ms, frame_length: 50.0 ms
    f0_xx, t_xx = pw.dio(x, fs, f0_floor=50.0, f0_ceil=600.0,
                         channels_in_octave=2, frame_period=12.5,
                         speed=args.speed)
    f0_xx = pw.stonemask(x, f0_xx, t_xx, fs)
    sp_xx = pw.cheaptrick(x, f0_xx, t_xx, fs)
    ap_xx = pw.d4c(x, f0_xx, t_xx, fs)
    code_sp_xx = pw.code_spectral_envelope(sp_xx, fs, 80)
    code_ap_xx = pw.code_aperiodicity(ap_xx, fs)
    fft_size = (sp_xx.shape[1] - 1) * 2
    rest_sp_xx = pw.decode_spectral_envelope(code_sp_xx, fs, fft_size)
    rest_ap_xx = pw.decode_aperiodicity(code_ap_xx, fs, fft_size)
    y_r_xx = pw.synthesize(f0_xx, rest_sp_xx, rest_ap_xx, fs, 12.5)
    sf.write(
        'test/y_with_f0_refinement_code_and_restore_frame_period_12.5.wav',
        y_r_xx, fs)
    print("coded sp_xx shape: ({:d}, {:d})".format(code_sp_xx.shape[0],
                                                   code_sp_xx.shape[1]))
    print("coded ap_xx shape: ({:d}, {:d})".format(code_ap_xx.shape[0],
                                                   code_ap_xx.shape[1]))

    # Comparison
    savefig('test/wavform.png', [x, _y, y, y_h, y_r, y_r_xx])
    savefig('test/sp.png', [_sp, sp, sp_h, rest_sp, rest_sp_xx])
    savefig('test/ap.png', [_ap, ap, ap_h, rest_ap, rest_ap_xx], log=False)
    savefig('test/f0.png', [_f0, f0, f0_h, f0_xx])

    print('Please check "test" directory for output files')