def main(args):
    if os.path.isdir('test'):
        rmtree('test')
    os.mkdir('test')

    x, fs = sf.read('utterance/vaiueo2d.wav')
    # x, fs = librosa.load('utterance/vaiueo2d.wav', dtype=np.float64)

    # 1. A convenient way
    f0, sp, ap = pw.wav2world(x, fs)    # use default options
    y = pw.synthesize(f0, sp, ap, fs, pw.default_frame_period)

    # 2. Step by step
    # 2-1 Without F0 refinement
    _f0, t = pw.dio(x, fs, f0_floor=50.0, f0_ceil=600.0,
                    channels_in_octave=2,
                    frame_period=args.frame_period,
                    speed=args.speed)
    _sp = pw.cheaptrick(x, _f0, t, fs)
    _ap = pw.d4c(x, _f0, t, fs)
    _y = pw.synthesize(_f0, _sp, _ap, fs, args.frame_period)
    # librosa.output.write_wav('test/y_without_f0_refinement.wav', _y, fs)
    sf.write('test/y_without_f0_refinement.wav', _y, fs)

    # 2-2 DIO with F0 refinement (using Stonemask)
    f0 = pw.stonemask(x, _f0, t, fs)
    sp = pw.cheaptrick(x, f0, t, fs)
    ap = pw.d4c(x, f0, t, fs)
    y = pw.synthesize(f0, sp, ap, fs, args.frame_period)
    # librosa.output.write_wav('test/y_with_f0_refinement.wav', y, fs)
    sf.write('test/y_with_f0_refinement.wav', y, fs)

    # 2-3 Harvest with F0 refinement (using Stonemask)
    _f0_h, t_h = pw.harvest(x, fs)
    f0_h = pw.stonemask(x, _f0_h, t_h, fs)
    sp_h = pw.cheaptrick(x, f0_h, t_h, fs)
    ap_h = pw.d4c(x, f0_h, t_h, fs)
    y_h = pw.synthesize(f0_h, sp_h, ap_h, fs, pw.default_frame_period)
    # librosa.output.write_wav('test/y_harvest_with_f0_refinement.wav', y_h, fs)
    sf.write('test/y_harvest_with_f0_refinement.wav', y_h, fs)

    # Comparison
    savefig('test/wavform.png', [x, _y, y])
    savefig('test/sp.png', [_sp, sp])
    savefig('test/ap.png', [_ap, ap], log=False)
    savefig('test/f0.png', [_f0, f0])

    print('Please check "test" directory for output files')
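
# savefig() is not defined in this demo snippet. Below is a minimal sketch of
# the assumed helper (one subplot per array; 2-D spectra drawn as images,
# optionally log-scaled; the optional title argument matches the calls in the
# estimate() snippet further down). All of this is an assumption, not the
# demo's actual helper.
import matplotlib.pyplot as plt
import numpy as np

def savefig(filename, figlist, title=None, log=True):
    n = len(figlist)
    plt.figure()
    for i, f in enumerate(figlist):
        plt.subplot(n, 1, i + 1)
        if f.ndim == 1:
            plt.plot(f)  # waveform or f0 track
            plt.xlim(0, len(f))
        else:
            # frames x bins matrix (sp/ap), frequency on the y-axis
            data = np.log(f + 1e-8) if log else f
            plt.imshow(data.T, origin='lower', aspect='auto')
    if title is not None:
        plt.suptitle(title)
    plt.savefig(filename)
    plt.close()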
def world_decompose(wav, fs, frame_period=5.0):
    # Decompose speech signal into f0, spectral envelope and aperiodicity using WORLD
    wav = wav.astype(np.float64)
    f0, timeaxis = pyworld.harvest(wav, fs, frame_period=frame_period,
                                   f0_floor=71.0, f0_ceil=800.0)
    sp = pyworld.cheaptrick(wav, f0, timeaxis, fs)
    ap = pyworld.d4c(wav, f0, timeaxis, fs)
    return f0, timeaxis, sp, ap
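
# Usage sketch for world_decompose(): an assumed analysis/synthesis round
# trip. 'sample.wav' is a placeholder path; soundfile is assumed imported
# as sf and pyworld as above.
wav, fs = sf.read('sample.wav')
f0, timeaxis, sp, ap = world_decompose(wav, fs, frame_period=5.0)
resyn = pyworld.synthesize(f0, sp, ap, fs, frame_period=5.0)
sf.write('sample_resyn.wav', resyn, fs)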
def wav2pw(x, fs=16000, fft_size=FFT_SIZE):
    ''' Extract WORLD feature from waveform '''
    _f0, t = pw.dio(x, fs, f0_ceil=args.f0_ceil)         # raw pitch extractor
    f0 = pw.stonemask(x, _f0, t, fs)                     # pitch refinement
    sp = pw.cheaptrick(x, f0, t, fs, fft_size=fft_size)
    ap = pw.d4c(x, f0, t, fs, fft_size=fft_size)         # extract aperiodicity
    return {
        'f0': f0,
        'sp': sp,
        'ap': ap,
    }
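
# Usage sketch for wav2pw(). It relies on the module-level FFT_SIZE and the
# global args.f0_ceil used inside the function; 'input.wav' is a placeholder.
x, fs = sf.read('input.wav')
feats = wav2pw(x.astype(np.float64), fs)
y = pw.synthesize(feats['f0'], feats['sp'], feats['ap'], fs)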
def collect_features(self, wav_path, label_path):
    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    if hp_acoustic.use_harvest:
        f0, timeaxis = pyworld.harvest(
            x, fs, frame_period=hp_acoustic.frame_period,
            f0_floor=hp_acoustic.f0_floor, f0_ceil=hp_acoustic.f0_ceil)
    else:
        f0, timeaxis = pyworld.dio(
            x, fs, frame_period=hp_acoustic.frame_period,
            f0_floor=hp_acoustic.f0_floor, f0_ceil=hp_acoustic.f0_ceil)
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    if self.alpha is None:
        self.alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(spectrogram, order=hp_acoustic.order, alpha=self.alpha)
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    if hp_acoustic.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)
    lf0 = P.interp1d(lf0, kind=hp_acoustic.f0_interpolation_kind)

    # Parameter trajectory smoothing
    if hp_acoustic.mod_spec_smoothing:
        hop_length = int(fs * (hp_acoustic.frame_period * 0.001))
        modfs = fs / hop_length
        mgc = P.modspec_smoothing(
            mgc, modfs, cutoff=hp_acoustic.mod_spec_smoothing_cutoff)

    mgc = P.delta_features(mgc, hp_acoustic.windows)
    lf0 = P.delta_features(lf0, hp_acoustic.windows)
    bap = P.delta_features(bap, hp_acoustic.windows)

    features = np.hstack((mgc, lf0, vuv, bap))

    # Cut silence frames by HTS alignment
    labels = hts.load(label_path)
    features = features[:labels.num_frames()]
    indices = labels.silence_frame_indices()
    features = np.delete(features, indices, axis=0)

    return features.astype(np.float32)
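
# Sketch: splitting the hstacked feature matrix back into its streams,
# following the concatenation order above (mgc, lf0, vuv, bap); the delta
# windows multiply the width of every stream except vuv. Illustrative only,
# not part of the original class.
def split_streams(features):
    num_windows = len(hp_acoustic.windows)
    mgc_dim = (hp_acoustic.order + 1) * num_windows
    lf0_dim = 1 * num_windows
    vuv_dim = 1
    mgc = features[:, :mgc_dim]
    lf0 = features[:, mgc_dim:mgc_dim + lf0_dim]
    vuv = features[:, mgc_dim + lf0_dim:mgc_dim + lf0_dim + vuv_dim]
    bap = features[:, mgc_dim + lf0_dim + vuv_dim:]
    return mgc, lf0, vuv, bap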
def __call__(self, data: Wave, test=None):
    x = data.wave.astype(numpy.float64)
    fs = data.sampling_rate

    if self._f0_estimating_method == 'dio':
        _f0, t = pyworld.dio(
            x, fs,
            frame_period=self._frame_period,
            f0_floor=self._f0_floor,
            f0_ceil=self._f0_ceil,
        )
    else:
        from world4py.np import apis
        _f0, t = apis.harvest(
            x, fs,
            frame_period=self._frame_period,
            f0_floor=self._f0_floor,
            f0_ceil=self._f0_ceil,
        )
    f0 = pyworld.stonemask(x, _f0, t, fs)
    spectrogram = pyworld.cheaptrick(x, f0, t, fs)
    aperiodicity = pyworld.d4c(x, f0, t, fs)
    mfcc = pysptk.sp2mc(spectrogram, order=self._order, alpha=self._alpha)
    voiced = ~(f0 == 0)  # type: numpy.ndarray

    feature = AcousticFeature(
        f0=f0[:, None].astype(self._dtype),
        spectrogram=spectrogram.astype(self._dtype),
        aperiodicity=aperiodicity.astype(self._dtype),
        mfcc=mfcc.astype(self._dtype),
        voiced=voiced[:, None],
    )
    feature.validate()
    return feature
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    model.eval()

    hop_length = int(fs * (hp.frame_period * 0.001))
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]

    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)
    mc_scaled = Variable(torch.from_numpy(mc_scaled))

    lengths = [len(mc_scaled)]

    # Add batch axis
    mc_scaled = mc_scaled.view(1, -1, mc_scaled.size(-1))

    # For MLPG
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)

    # Apply model
    if model.include_parameter_generation():
        # Case: models include parameter generation in itself
        # Multistream features cannot be used in this case
        y_hat, y_hat_static = model(mc_scaled, R, lengths=lengths)
    else:
        # Case: generic models (can be sequence model)
        assert hp.has_dynamic_features is not None
        y_hat = model(mc_scaled, lengths=lengths)
        y_hat_static = multi_stream_mlpg(
            y_hat, R, hp.stream_sizes, hp.has_dynamic_features)

    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    mc_static_pred = P.inv_scale(
        mc_static_pred, data_mean[:static_dim], data_std[:static_dim])

    outputs = mc_static_pred.copy()

    if diffvc:
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs
from python_speech_features import mfcc
from sklearn.model_selection import train_test_split
from keras.utils import np_utils

##############################################################
# Test one audio file
path = '/Users/adaezeadigwe/Desktop/Research/project_ml/Data/anger/anger_0001.wav'
Data_Directory = '/Users/adaezeadigwe/Desktop/Research/project_ml/Data/'

x, fs = sf.read(path)
_f0, t = pw.dio(x, fs)              # raw pitch extractor
f0 = pw.stonemask(x, _f0, t, fs)    # pitch refinement
sp = pw.cheaptrick(x, f0, t, fs)    # extract smoothed spectrogram
ap = pw.d4c(x, f0, t, fs)           # extract aperiodicity
mfcc = librosa.feature.mfcc(y=x, sr=16000)
y = pw.synthesize(f0, sp, ap, fs)
# wav.write('neutral_syn.wav', fs, y)

print(f0.shape)    # (1071,)
print(sp.shape)    # (1071, 513)
print(ap.shape)    # (1071, 513)
print(mfcc.shape)  # (20, 168)

##############################################################
# A. LOOP to extract vector by sample of source fundamental frequency
def get_labels(path=Data_Directory):
    labels = os.listdir(path)
    label_indices = np.arange(0, len(labels))
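
# Shape sanity check (sketch; the values assume the file above is 16 kHz):
# - frames: WORLD's default frame period is 5 ms, so
#     n_frames = int(len(x) / fs * 1000 / 5) + 1   # -> 1071 here
# - bins: sp/ap have fft_size // 2 + 1 columns, where
#     fft_size = pw.get_cheaptrick_fft_size(fs)    # 1024 at fs=16000 -> 513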
# Making directories for speech features
for s in spklist:
    for f in featlist:
        if not os.path.exists("data/{}/{}".format(s, f)):
            os.mkdir("data/{}/{}".format(s, f))

for s in spklist:
    wavlist = os.listdir("data/{}/wav".format(s))
    for wf in wavlist:
        # WORLD analysis for each file
        print("speaker: {} file: {}".format(s, wf))
        fs, data = wavfile.read("data/{}/wav/{}".format(s, wf))
        data = data.astype(np.float64)  # np.float is deprecated in NumPy
        f0, t = pw.harvest(data, fs)
        sp = pw.cheaptrick(data, f0, t, fs)
        ap = pw.d4c(data, f0, t, fs)
        alpha = 0.42
        dim = 24
        mgc = sptk.sp2mc(sp, dim, alpha)
        bn, _ = os.path.splitext(wf)
        with open("data/{}/mgc/{}.mgc".format(s, bn), "wb") as f:
            mgc.tofile(f)
        with open("data/{}/f0/{}.f0".format(s, bn), "wb") as f:
            f0.tofile(f)
        with open("data/{}/ap/{}.ap".format(s, bn), "wb") as f:
            ap.tofile(f)
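
# Read-back sketch for the raw binary dumps above. Both tofile() and
# fromfile() default to float64, and sp2mc with order=dim yields dim + 1
# coefficients per frame. The basename and speaker here are placeholders.
bn = "sample"   # placeholder basename
s = "spk1"      # placeholder speaker
dim = 24
mgc = np.fromfile("data/{}/mgc/{}.mgc".format(s, bn)).reshape(-1, dim + 1)
f0 = np.fromfile("data/{}/f0/{}.f0".format(s, bn))
ap = np.fromfile("data/{}/ap/{}.ap".format(s, bn)).reshape(len(f0), -1)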
def analysisf(self, fwav, ff0, f0_min, f0_max, fspec, faper, fvuv,
              preproc_hp=None):
    print('Extracting WORLD features from: ' + fwav)
    wav, fs, _ = sp.wavread(fwav)

    if preproc_hp == 'auto':
        preproc_hp = f0_min
    self.preprocwav(wav, fs, highpass=preproc_hp)

    import pyworld as pw

    if 0:
        # Check direct copy re-synthesis without compression/encoding
        print(pw.__file__)
        # _f0, ts = pw.dio(wav, fs, f0_floor=f0_min, f0_ceil=f0_max, channels_in_octave=2, frame_period=self.shift*1000.0)
        _f0, ts = pw.dio(wav, fs, f0_floor=f0_min, f0_ceil=f0_max,
                         channels_in_octave=2,
                         frame_period=self.shift * 1000.0)
        # _f0, ts = pw.harvest(wav, fs)
        f0 = pw.stonemask(wav, _f0, ts, fs)
        SPEC = pw.cheaptrick(wav, f0, ts, fs, fft_size=self.dftlen)
        APER = pw.d4c(wav, f0, ts, fs, fft_size=self.dftlen)
        resyn = pw.synthesize(f0.astype('float64'), SPEC.astype('float64'),
                              APER.astype('float64'), fs, self.shift * 1000.0)
        sp.wavwrite('resynth.wav', resyn, fs, norm_abs=True,
                    force_norm_abs=True, verbose=1)
        from IPython.core.debugger import Pdb
        Pdb().set_trace()

    _f0, ts = pw.dio(wav, fs, f0_floor=f0_min, f0_ceil=f0_max,
                     channels_in_octave=2, frame_period=self.shift * 1000.0)
    f0 = pw.stonemask(wav, _f0, ts, fs)
    SPEC = pw.cheaptrick(wav, f0, ts, fs, fft_size=self.dftlen)
    # SPEC = 10.0*np.sqrt(SPEC)  # TODO Best gain correction I could find. Hard to find the good one between PML and WORLD different syntheses
    APER = pw.d4c(wav, f0, ts, fs, fft_size=self.dftlen)

    unvoiced = np.where(f0 < 20)[0]
    f0 = np.interp(ts, ts[f0 > 0], f0[f0 > 0])
    f0 = np.log(f0)
    makedirs(os.path.dirname(ff0))
    f0.astype('float32').tofile(ff0)

    vuv = np.ones(len(f0))
    vuv[unvoiced] = 0
    makedirs(os.path.dirname(fvuv))
    vuv.astype('float32').tofile(fvuv)

    SPEC = self.compress_spectrum(SPEC, fs, self.spec_size)
    makedirs(os.path.dirname(fspec))
    SPEC.astype('float32').tofile(fspec)

    APER = sp.linbnd2fwbnd(APER, fs, self.dftlen, self.aper_size)
    APER = sp.mag2db(APER)
    makedirs(os.path.dirname(faper))
    APER.astype('float32').tofile(faper)

    # CMP = np.concatenate((f0.reshape((-1,1)), SPEC, APER, vuv.reshape((-1,1))), axis=1)  # (This is not a necessity)

    if 0:
        # NOTE: dead debug branch; CMP is only defined if the line above is
        # uncommented.
        import matplotlib.pyplot as plt
        plt.ion()
        resyn = self.synthesis(fs, CMP)
        sp.wavwrite('resynth.wav', resyn, fs, norm_abs=True,
                    force_norm_abs=True, verbose=1)
        from IPython.core.debugger import Pdb
        Pdb().set_trace()
def get_con2(x, words):
    _f0, t = pw.dio(x, fs, f0_floor=120.0, f0_ceil=750.0, frame_period=8.0)
    f0_herz = pw.stonemask(x, _f0, t, fs)
    sp = pw.cheaptrick(x, f0_herz, t, fs)
    ap = pw.d4c(x, f0_herz, t, fs)
    # print(sp.shape)

    f0_note = []
    for i in range(len(f0_herz)):
        if f0_herz[i] == 0:
            f0_note.append(0.0)
        else:
            f0_note.append(herz2note(f0_herz[i]))

    con2 = []
    # plt.plot(np.arange(len(x)/256), f0_note)
    for i in range(len(words)):
        note = cal_note(f0_note[words[i][0]:words[i][1]])
        con2.append(note)
        # print(words[i])
        # x = np.arange(words[i][0], words[i][1])
        # y = np.zeros(words[i][1] - words[i][0])
        # y.fill(note)
        # print(x, y)
        # plt.plot(x, y)
    # plt.show()

    f0_note = np.array(f0_note)
    f0_note = np.round((f0_note - 40.0) * 5)
    # print(f0_note)
    f0_mat = np.zeros([f0_note.shape[0], 200])
    f0_mat.fill(0.0)
    for i in range(f0_note.shape[0]):
        if f0_note[i] > 0.0 and f0_note[i] < 200:
            f0_mat[i][int(f0_note[i])] = 1.0
        else:
            f0_note[i] = 0

    # plt.matshow(ap)
    # plt.show()
    ap = ap * 20 - 18
    arr = []
    for i in range(sp.shape[0]):
        arr.append(
            np.interp(np.linspace(0, 1025, 32), np.arange(1025),
                      ap[i])[np.newaxis, :])
    _ap = np.concatenate(arr, axis=0)

    sp = np.log(sp)
    # plt.matshow(sp)
    # plt.show()
    arr = []
    for i in range(sp.shape[0]):
        arr.append(
            np.interp(np.linspace(0, 1025, 128), np.arange(1025),
                      sp[i])[np.newaxis, :])
    _sp = np.concatenate(arr, axis=0)

    mel = np.concatenate([_ap, _sp], axis=1)
    # mel = mel + 20.0
    # mel = np.where(mel > 0, mel, 0)
    # mel = mel / mel.max()
    # plt.matshow(mel)
    # plt.show()
    return np.array(con2), mel, f0_note.astype(np.int32)
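
# herz2note() is not defined in this snippet. A common MIDI-style mapping is
# sketched below; this is an assumption, though it is consistent with the
# (f0_note - 40.0) * 5 binning above, which places notes ~40..80 into 200 bins.
def herz2note(freq):
    # MIDI note number: A4 = 440 Hz = note 69
    return 12.0 * np.log2(freq / 440.0) + 69.0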
if opts.harvest:
    print("Begin harvest ...")
    f0_x, tp_x = pw.harvest(x, RATE, f0_floor, f0_ceil, frame_period)
    out_filename += 'harvest'
else:
    print("Begin stonemask ...")
    f0_x, tp_x = pw.dio(x, RATE, f0_floor, f0_ceil, channels_in_octave,
                        frame_period, speed, allowed_range)
    f0_x = pw.stonemask(x, f0_x, tp_x, RATE)
    out_filename += 'dio'

print("Begin cheaptrick ...")
sp_x = pw.cheaptrick(x, f0_x, tp_x, RATE, q1, f0_floor, fft_size)
print("Begin d4c ...")
ap_x = pw.d4c(x, f0_x, tp_x, RATE, threshold, fft_size)

lz, tz, f0_x, sp_x, ap_x = trim_zeros_frames(f0_x, sp_x, ap_x, 0.7)
uv = (f0_x == 0).astype(int)

print("Begin f0 transform ...")
lf0_x = toquefrency(f0_x)
print("Begin sp transform ...")
mgc_x = pysptk.conversion.sp2mc(sp_x, order=mcsize, alpha=alpha)
print("Begin ap transform ...")
bap_x = pysptk.conversion.sp2mc(ap_x, order=mcsize, alpha=alpha)

statsdir = 'model_saves/ST_STATS_mlpg.npy'
savedir = 'model_saves/theta_best_mlpg.dat'
def world_feature_extract(wav_list, spk_list, feat_param_list, args):
    """EXTRACT WORLD FEATURE VECTOR"""
    for i, wav_name in enumerate(wav_list):
        bin_basename = os.path.basename(wav_name).replace('wav', 'bin')
        # spk = os.path.dirname(wav_name).split('/')[-1][-3:]
        spk = os.path.dirname(wav_name).split('/')[-1]
        bin_name = os.path.join(args.bindir, 'noVAD', spk, bin_basename)
        vad_bin_name = os.path.join(args.bindir, 'VAD', spk, bin_basename)

        if os.path.exists(bin_name):
            if args.overwrite:
                logging.info("overwrite %s (%d/%d)" % (wav_name, i + 1, len(wav_list)))
            else:
                logging.info("skip %s (%d/%d)" % (wav_name, i + 1, len(wav_list)))
                continue
        else:
            logging.info("now processing %s (%d/%d)" % (wav_name, i + 1, len(wav_list)))

        feat_param = feat_param_list[spk_list.index(spk)]

        # load wavfile and apply low cut filter
        fs, x = wavfile.read(wav_name)
        x = np.array(x, dtype=np.float64)
        x = low_cut_filter(x, fs, cutoff=feat_param['highpass_cutoff'])

        # check sampling frequency
        if not fs == feat_param['fs']:
            logging.error("sampling frequency is not matched: %s" % wav_name)
            sys.exit(1)

        # extract features
        f0, time_axis = pw.harvest(x, feat_param['fs'],
                                   f0_floor=feat_param['f0min'],
                                   f0_ceil=feat_param['f0max'],
                                   frame_period=feat_param['shift_ms'])
        sp = pw.cheaptrick(x, f0, time_axis, feat_param['fs'],
                           fft_size=feat_param['fftl'])
        ap = pw.d4c(x, f0, time_axis, feat_param['fs'],
                    fft_size=feat_param['fftl'])
        mcc = pysptk.sp2mc(sp, feat_param['mcep_dim'], feat_param['mcep_alpha'])
        en_sp, sp = energy_norm(sp)
        sp = np.log10(sp)
        en_mcc = mcc[:, 0]

        # expand dimensions for concatenation
        f0 = np.expand_dims(f0, axis=-1)
        en_mcc = np.expand_dims(en_mcc, axis=-1)

        # concatenation
        world_feats = np.concatenate([sp, mcc[:, 1:], ap, f0, en_sp, en_mcc],
                                     axis=1)
        labels = spk_list.index(spk) * np.ones([sp.shape[0], 1], np.float32)

        # concatenate all features
        feats = np.concatenate([world_feats, labels], axis=1).astype(np.float32)

        # VAD
        vad_idx = np.where(f0.copy().reshape([-1]) > 10)[0]
        if len(vad_idx) < 1:
            logging.info("invalid wave file: %s" % wav_name)
            continue
        vad_feats = feats[vad_idx[0]:vad_idx[-1] + 1]

        # write to bin
        with open(bin_name, 'wb') as fp:
            fp.write(feats.tobytes())  # tostring() is deprecated in NumPy
        with open(vad_bin_name, 'wb') as fp:
            fp.write(vad_feats.tobytes())
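
# energy_norm() is not defined in this file. A plausible sketch follows; it
# is an assumption, chosen so the returned en_sp is a (T, 1) column that can
# be concatenated alongside the other frame-wise features above.
def energy_norm(sp):
    en = np.sum(sp + 1e-16, axis=1, keepdims=True)  # (T, 1) per-frame energy
    return en, sp / en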
def estimate(letter, name):
    fb = 0
    fm = 0
    fe = 0
    max_beg = 0
    max_mid = 0
    max_end = 0

    if os.path.exists(f'zvucni_glasovi_wav/novi_{letter}_beg_{name}.wav'):
        beg, fs = sf.read(f'zvucni_glasovi_wav/novi_{letter}_beg_{name}.wav')
        """
        f0_dio, timeaxis_dio = pw.dio(beg, fs, f0_floor=70.0, f0_ceil=800.0,
                                      channels_in_octave=3.0,
                                      frame_period=args.frame_period,
                                      speed=args.speed)
        """
        f0, timeaxis = pw.harvest(beg, fs)
        f0_mask = pw.stonemask(beg, f0, timeaxis, fs)
        sp = pw.cheaptrick(beg, f0_mask, timeaxis, fs)
        ap = pw.d4c(beg, f0_mask, timeaxis, fs)
        y = pw.synthesize(f0_mask, sp, ap, fs, pw.default_frame_period)
        sf.write(f'zvucni_glasovi_after_sint/{letter}_beg_{name}_after_sint.wav',
                 y, fs)

        plt.figure()
        plt.title(f'Sound {letter} at the beginning of the word')
        plt.plot(timeaxis, f0, 'r',
                 label='f0 estimated with the harvest() function')
        # plt.plot(timeaxis_dio, f0_dio, 'y',
        #          label='f0 estimated with the dio() function')
        plt.plot(timeaxis, f0_mask, 'g--', label='f0 refined with stonemask')
        plt.ylabel('frequency (Hz)')
        plt.xlabel('time (s)')
        plt.legend()
        plt.savefig(f'slike_f0_usporedbe/f0_{letter}_{name}_beg.png')
        savefig(f'slike_before_after_sint_usporedbe/before_after_synt_{letter}_{name}_beg.png',
                [beg, y], letter + '_beg')
        savefig(f'slike_sp_usporedbe/sp_{letter}_{name}_beg.png', [sp], letter)
        savefig(f'slike_ap_usporedbe/ap_{letter}_{name}_beg.png', [ap], letter,
                log=False)
        plt.close()
        max_beg = np.max(f0)
        fb = 1

    if os.path.exists(f'zvucni_glasovi_wav/novi_{letter}_mid_{name}.wav'):
        mid, fs = sf.read(f'zvucni_glasovi_wav/novi_{letter}_mid_{name}.wav')
        """
        f0_dio, timeaxis_dio = pw.dio(mid, fs, f0_floor=70.0, f0_ceil=800.0,
                                      channels_in_octave=2.0,
                                      frame_period=args.frame_period,
                                      speed=args.speed)
        """
        f0, timeaxis = pw.harvest(mid, fs)
        f0_mask = pw.stonemask(mid, f0, timeaxis, fs)
        sp = pw.cheaptrick(mid, f0_mask, timeaxis, fs)
        ap = pw.d4c(mid, f0_mask, timeaxis, fs)
        y = pw.synthesize(f0_mask, sp, ap, fs, pw.default_frame_period)
        sf.write(f'zvucni_glasovi_after_sint/{letter}_mid_{name}_after_sint.wav',
                 y, fs)

        plt.figure()
        plt.title(f'Sound {letter} in the middle of the word')
        plt.plot(timeaxis, f0, 'r',
                 label='f0 estimated with the harvest() function')
        # plt.plot(timeaxis_dio, f0_dio, 'y',
        #          label='f0 estimated with the dio() function')
        plt.plot(timeaxis, f0_mask, 'g--', label='f0 refined with stonemask')
        plt.ylabel('frequency (Hz)')
        plt.xlabel('time (s)')
        plt.legend()
        plt.savefig(f'slike_f0_usporedbe/f0_{letter}_{name}_mid.png')
        savefig(f'slike_before_after_sint_usporedbe/before_after_synt_{letter}_{name}_mid.png',
                [mid, y], letter + '_mid')
        savefig(f'slike_sp_usporedbe/sp_{letter}_{name}_mid.png', [sp], letter)
        savefig(f'slike_ap_usporedbe/ap_{letter}_{name}_mid.png', [ap], letter,
                log=False)
        plt.close()
        max_mid = np.max(f0)
        fm = 1

    if os.path.exists(f'zvucni_glasovi_wav/novi_{letter}_end_{name}.wav'):
        end, fs = sf.read(f'zvucni_glasovi_wav/novi_{letter}_end_{name}.wav')
        """
        f0_dio, timeaxis_dio = pw.dio(end, fs, f0_floor=70.0, f0_ceil=800.0,
                                      channels_in_octave=2.0,
                                      frame_period=args.frame_period,
                                      speed=args.speed)
        """
        f0, timeaxis = pw.harvest(end, fs)
        f0_mask = pw.stonemask(end, f0, timeaxis, fs)
        sp = pw.cheaptrick(end, f0_mask, timeaxis, fs)
        ap = pw.d4c(end, f0_mask, timeaxis, fs)
        y = pw.synthesize(f0_mask, sp, ap, fs, pw.default_frame_period)
        sf.write(f'zvucni_glasovi_after_sint/{letter}_end_{name}_after_sint.wav',
                 y, fs)

        plt.figure()
        plt.title(f'Sound {letter} at the end of the word')
        plt.plot(timeaxis, f0, 'r',
                 label='f0 estimated with the harvest() function')
        # plt.plot(timeaxis_dio, f0_dio, 'y',
        #          label='f0 estimated with the dio() function')
        plt.plot(timeaxis, f0_mask, 'g--', label='f0 refined with stonemask')
        plt.ylabel('frequency (Hz)')
        plt.xlabel('time (s)')
        plt.legend()
        plt.savefig(f'slike_f0_usporedbe/f0_{letter}_{name}_end.png')
        savefig(f'slike_before_after_sint_usporedbe/before_after_synt_{letter}_{name}_end.png',
                [end, y], letter + '_end')
        savefig(f'slike_sp_usporedbe/sp_{letter}_{name}_end.png', [sp], letter)
        savefig(f'slike_ap_usporedbe/ap_{letter}_{name}_end.png', [ap], letter,
                log=False)
        plt.close()
        max_end = np.max(f0)
        fe = 1

    if fb and fm and fe:
        x = [5, 10, 15]
        max_f0 = [max_beg, max_mid, max_end]
        plt.bar(x, height=max_f0)
        plt.axhline(np.average(max_f0), color='lightblue', linestyle='--',
                    label='average')
        plt.xticks(x, [f'{letter}_beg', f'{letter}_mid', f'{letter}_end'])
        plt.xlabel('position')
        plt.ylabel('frequency (Hz)')
        plt.legend()
        plt.savefig(f'slike_f0_usporedbe/f0_{letter}_{name}_hist.png')
        plt.close()
mfcc = dp.load_mfcc(filename, config_mfcc_mcep)
ppg_sentence = converter.predict(mfcc)
result = transformer.predict(ppg_sentence).numpy()

# extract info from source file
x, _ = librosa.load(filename, sr=config_mfcc_mcep["sampling_frequency"])
x = x.astype(np.float64)
_f0, t = pw.dio(x, config_mfcc_mcep["sampling_frequency"])  # frame_period=10)
f0_try = pw.stonemask(x, _f0, t,
                      config_mfcc_mcep["sampling_frequency"])  # refinement of f0 using stone mask
ap_try = pw.d4c(x=x, f0=_f0, temporal_positions=t,
                fs=config_mfcc_mcep["sampling_frequency"],
                fft_size=config_mfcc_mcep["n_fft"])

# use transformed result
# duplicate every frame index (presumably to upsample the converted mceps to
# the WORLD frame rate)
indices = sorted(np.concatenate([np.arange(len(result))] * 2))
alpha = 0.35
spc = pysptk.mc2sp(result[indices], alpha,
                   config_mfcc_mcep["n_fft"]).astype(np.float64)[:len(ap_try)]
y2 = pw.synthesize(f0_try, spc, ap_try,
                   config_mfcc_mcep["sampling_frequency"])
endfile = final_directory + l
scipy.io.wavfile.write(endfile, config_mfcc_mcep["sampling_frequency"], y2)
def wav2world(
        wave, fs, mcep_order=25,
        f0_smoothing=0, ap_smoothing=0, mcep_smoothing=0,
        frame_period=None, f0_floor=None, f0_ceil=None,
        f0_mode="harvest"):
    # setup default values
    wave = wave.astype('float64')
    frame_period = pyworld.default_frame_period \
        if frame_period is None else frame_period
    f0_floor = pyworld.default_f0_floor if f0_floor is None else f0_floor
    f0_ceil = pyworld.default_f0_ceil if f0_ceil is None else f0_ceil
    alpha = pysptk.util.mcepalpha(fs)

    # f0
    if f0_mode == "harvest":
        f0, t = pyworld.harvest(
            wave, fs, f0_floor=f0_floor, f0_ceil=f0_ceil,
            frame_period=frame_period)
        threshold = 0.85
    elif f0_mode == "reaper":
        _, _, t, f0, _ = reaper(
            (wave * (2**15 - 1)).astype("int16"), fs,
            frame_period=frame_period / 1000,
            do_hilbert_transform=True)
        t, f0 = t.astype('float64'), f0.astype('float64')
        threshold = 0.1
    elif f0_mode == "dio":
        _f0, t = pyworld.dio(wave, fs)
        f0 = pyworld.stonemask(wave, _f0, t, fs)
        threshold = 0.0
    else:
        raise ValueError

    # world
    sp = pyworld.cheaptrick(wave, f0, t, fs)
    ap = pyworld.d4c(wave, f0, t, fs, threshold=threshold)

    # extract vuv from ap
    vuv_flag = (ap[:, 0] < 0.5) * (f0 > 1.0)
    vuv = vuv_flag.astype('int')

    # continuous log f0
    clf0 = np.zeros_like(f0)
    if vuv_flag.any():
        if not vuv_flag[0]:
            f0[0] = f0[vuv_flag][0]
            vuv_flag[0] = True
        if not vuv_flag[-1]:
            f0[-1] = f0[vuv_flag][-1]
            vuv_flag[-1] = True
        idx = np.arange(len(f0))
        clf0[idx[vuv_flag]] = np.log(
            np.clip(f0[idx[vuv_flag]], f0_floor / 2, f0_ceil * 2))
        clf0[idx[~vuv_flag]] = interp1d(
            idx[vuv_flag], clf0[idx[vuv_flag]])(idx[~vuv_flag])
        if f0_smoothing > 0:
            clf0 = modspec_smoothing(
                clf0, 1000 / frame_period, cut_off=f0_smoothing)
    else:
        clf0 = np.ones_like(f0) * f0_floor

    # continuous coded ap
    cap = pyworld.code_aperiodicity(ap, fs)
    if ap_smoothing > 0:
        cap = modspec_smoothing(cap, 1000 / frame_period, cut_off=ap_smoothing)

    # mcep
    mcep = pysptk.mcep(sp, order=mcep_order, alpha=alpha, itype=4)
    if mcep_smoothing > 0:
        mcep = modspec_smoothing(
            mcep, 1000 / frame_period, cut_off=mcep_smoothing)

    fbin = sp.shape[1]

    return mcep, clf0, vuv, cap, sp, fbin, t
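
# Usage sketch for wav2world(): an assumed round trip that rebuilds f0 and
# aperiodicity from the returned continuous log-f0, vuv, and coded ap, then
# resynthesizes via WORLD. 'sample.wav' is a placeholder; soundfile is
# assumed imported as sf.
wave, fs = sf.read('sample.wav')
mcep, clf0, vuv, cap, sp, fbin, t = wav2world(wave, fs)
f0 = np.exp(clf0) * vuv  # zero out unvoiced frames
ap = pyworld.decode_aperiodicity(
    np.ascontiguousarray(cap, dtype=np.float64), fs, (fbin - 1) * 2)
y = pyworld.synthesize(f0.astype(np.float64), sp.astype(np.float64),
                       ap, fs, pyworld.default_frame_period)
sf.write('sample_resyn.wav', y, fs)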
fs = sr
fft_len = 1024
hop_length = 256
frame_period = hop_length / sr * 1000  # hop length in ms
f0_floor = 71.   # default
f0_ceil = 800.   # default

f0, timeaxis = pyworld.dio(x, fs=sr, f0_floor=f0_floor, f0_ceil=f0_ceil,
                           frame_period=frame_period)
f0 = pyworld.stonemask(x, f0, timeaxis, fs)
sp = pyworld.cheaptrick(x, f0, timeaxis, fs, fft_size=fft_len)  # Spectrogram
ap = pyworld.d4c(x, f0, timeaxis, fs, fft_size=fft_len)  # Aperiodicity

# log-F0 for the second panel (assumed definition; unvoiced frames stay at 0)
lf0 = np.zeros_like(f0)
lf0[f0 > 0] = np.log(f0[f0 > 0])

plt.subplot(3, 1, 1)
plt.plot(f0)
plt.subplot(3, 1, 2)
plt.plot(lf0)
plt.subplot(3, 1, 3)
# librosa.display is a module; specshow() is the plotting call
librosa.display.specshow(sp.T, sr=sr, hop_length=hop_length, y_axis='linear')
plt.show()

y = pyworld.synthesize(f0, sp, ap, fs, frame_period)
play_audio(y)

bap = pyworld.code_aperiodicity(ap, fs)
mgc = pysptk.sp2mc(sp, order=25,  # order/alpha are assumed values; the
                   alpha=pysptk.util.mcepalpha(fs))  # original call was truncated
import peakutils
import soundfile as sf
import argparse
from audiolazy import *
from audiolazy import Stream

np.set_printoptions(threshold=np.inf)

styletext = ('<style>table {width:100%;}'
             'table, th, td {border: 1px solid black;'
             'border-collapse: collapse;}</style>')

path = "vaiueo2d.wav"
x, fs = sf.read(path)
print(x.shape)

f0, t = pw.dio(x, fs, f0_floor=50.0, f0_ceil=600.0)
sp = pw.cheaptrick(x, f0, t, fs)
ap = pw.d4c(x, f0, t, fs)
# _y = pw.synthesize(f0, sp, ap, fs)
# if it has fewer elements than temporal positions, pop off the last element

#################################
# DEFINE FORMANT DETECTION FUNCTION
def cam_formants(x, fs):
    ms10 = math.ceil(fs * 0.005)
    ms30 = math.floor(fs * 0.03)
    ncoeff = 2 + fs / 1000
    t = np.arange(0, len(x) - 1)
    t = t / fs
    pos = 1
    fm = []
    ft = []