def __getitem__(self, key):
    """Load one audio entry and optionally apply pitch/time augmentation.

    Args:
        key: A 3-item sequence ``(entry key, pitch_aug_factor,
            time_aug_factor)``. The first item indexes ``self.data``.
            A pitch factor of 0 and a time factor of 1 disable the
            corresponding augmentation.

    Returns:
        tuple: ``(rate, array)`` - the sample rate returned by
        ``soundfile.read`` and the (possibly augmented) samples.
    """
    entry_key, pitch_aug_factor, time_aug_factor = key
    wav_source = self.data[entry_key]

    read_options = {"always_2d": self.always_2d}
    if not self.normalize:
        # soundfile.read normalizes data to [-1, 1] when no dtype is given,
        # so the dtype is only forwarded when normalization is disabled.
        read_options["dtype"] = self.dtype
    array, rate = soundfile.read(wav_source, **read_options)

    if pitch_aug_factor != 0:
        # Pitch augmentation: scale F0 by the requested number of
        # equal-tempered semitone steps and resynthesize with WORLD.
        import pyworld as pw

        semitone = pow(2, 1 / 12)
        f0_pw, sp, ap = pw.wav2world(array, rate)  # default options
        shifted_f0 = f0_pw * (semitone**pitch_aug_factor)
        array = pw.synthesize(
            shifted_f0, sp, ap, rate, pw.default_frame_period
        )

    if time_aug_factor != 1:
        # Time augmentation via tsm.wsola.
        array = tsm.wsola(array, time_aug_factor)

    return rate, array
def get_world_feats(vocals):
    """Compute compressed WORLD features for a vocal signal.

    The signal is analysed with ``pw.wav2world`` at ``config.fs`` using a
    frame period of ``config.hoptime * 1000``. The spectral envelope and
    the aperiodicity are converted to dB, compressed with ``sp_to_mfsc``
    (60 and 4 coefficients, alpha 0.45) and concatenated with a two-column
    pitch block.

    Args:
        vocals: Audio samples; converted with ``np.float64`` before analysis.

    Returns:
        np.ndarray: ``[harmonic features, aperiodic features, pitch, flag]``
        concatenated along axis 1, one row per frame.
    """
    signal = np.float64(vocals)
    f0, sp, ap = pw.wav2world(
        signal, config.fs, frame_period=config.hoptime * 1000)

    # Aperiodicity in dB (as float32), reshaped to the envelope's shape.
    ap_db = ap.reshape([sp.shape[0], sp.shape[1]]).astype(np.float32)
    ap_db = 10. * np.log10(ap_db**2)

    # Spectral envelope in dB, shifted by the configured offset.
    harm_db = 10 * np.log10(sp.reshape([ap.shape[0], ap.shape[1]]))
    harm_db += config.world_offset

    # Pitch track; f0_to_hertz and utils.nan_helper are defined elsewhere.
    pitch = f0_to_hertz(f0)
    invalid, to_index = utils.nan_helper(pitch)
    # The flag column records entries that are infinite *before* the
    # flagged entries are filled in by linear interpolation.
    inf_flags = np.isinf(pitch)
    pitch[invalid] = np.interp(
        to_index(invalid), to_index(~invalid), pitch[~invalid])

    n_frames = len(pitch)
    pitch_col = np.array(pitch).reshape([n_frames, 1])
    flag_col = np.array(inf_flags).reshape([n_frames, 1])
    pitch_block = np.concatenate((pitch_col, flag_col), axis=-1)

    harm_feats = sp_to_mfsc(harm_db, 60, 0.45)
    ap_feats = sp_to_mfsc(ap_db, 4, 0.45)

    return np.concatenate(
        (harm_feats, ap_feats, pitch_block.reshape((-1, 2))), axis=1)
def get_mgc(audio, sample_rate, frame_period, fft_size=512, mcep_size=60,
            alpha=0.65):
    """Extract mel-generalized cepstral coefficients from a waveform.

    Args:
        audio: Samples as a NumPy array or a ``torch.Tensor``. For a tensor
            with more than one dimension only index 0 of the first
            dimension is used.
        sample_rate: Sampling rate passed to WORLD as ``fs``.
        frame_period: Frame period passed to WORLD.
        fft_size: FFT size passed to WORLD.
        mcep_size: Order passed to ``pysptk.sptk.mcep``.
        alpha: All-pass constant passed to ``pysptk.sptk.mcep``.

    Returns:
        The array returned by ``pysptk.sptk.mcep`` for the WORLD spectral
        envelope.
    """
    if isinstance(audio, torch.Tensor):
        first = audio[0] if audio.ndim > 1 else audio
        audio = first.numpy()

    # Only the spectral envelope (second WORLD output) is needed.
    world_outputs = pw.wav2world(
        audio.astype(np.double),
        fs=sample_rate,
        frame_period=frame_period,
        fft_size=fft_size,
    )
    spectral_envelope = world_outputs[1]

    return pysptk.sptk.mcep(
        spectral_envelope,
        order=mcep_size,
        alpha=alpha,
        maxiter=0,
        etype=1,
        eps=1.0E-8,
        min_det=0.0,
        itype=3,
    )
def input_to_feats(input_file, mode=config.comp_mode):
    """Read an audio file and compute compressed WORLD features.

    Channel index 1 of the file is analysed with ``pw.wav2world`` (frame
    period 5.80498866 ms). The envelope and aperiodicity are converted to
    dB and compressed with the function selected by ``mode``; the pitch
    track is expressed as ``69 + 12 * log2(f0 / 440)``.

    Args:
        input_file: Path (or file object) accepted by ``sf.read``. The
            audio must have at least two channels, since column 1 is used.
        mode: ``'mfsc'`` (uses ``sp_to_mfsc``) or ``'mgc'`` (uses
            ``sp_to_mgc``).

    Returns:
        np.ndarray: ``[harmonic features, aperiodic features, pitch, flag]``
        concatenated along axis 1, one row per frame.

    Raises:
        ValueError: If ``mode`` is neither ``'mfsc'`` nor ``'mgc'``.
    """
    # Validate before the expensive analysis; previously an unknown mode
    # only failed at the very end with an UnboundLocalError.
    if mode == 'mfsc':
        compress = sp_to_mfsc
    elif mode == 'mgc':
        compress = sp_to_mgc
    else:
        raise ValueError(
            "Unsupported mode {!r}; expected 'mfsc' or 'mgc'".format(mode))

    audio, fs = sf.read(input_file)
    vocals = np.array(audio[:, 1])

    feats = pw.wav2world(vocals, fs, frame_period=5.80498866)

    # Aperiodicity in dB (float32) and spectral envelope in dB.
    ap = feats[2].reshape(
        [feats[1].shape[0], feats[1].shape[1]]).astype(np.float32)
    ap = 10. * np.log10(ap**2)
    harm = 10 * np.log10(feats[1].reshape(
        [feats[2].shape[0], feats[2].shape[1]]))

    y = 69 + 12 * np.log2(feats[0] / 440)

    # nan_helper is defined elsewhere; the entries it flags are replaced by
    # linear interpolation. The flag column marks entries that were
    # infinite before that interpolation.
    nans, x = nan_helper(y)
    naners = np.isinf(y)
    y[nans] = np.interp(x(nans), x(~nans), y[~nans])

    y = np.array(y).reshape([len(y), 1])
    guy = np.array(naners).reshape([len(y), 1])
    y = np.concatenate((y, guy), axis=-1)

    harmy = compress(harm, 60, 0.45)
    apy = compress(ap, 4, 0.45)

    out_feats = np.concatenate((harmy, apy, y.reshape((-1, 2))), axis=1)
    return out_feats
def mcep_dir(srcroot, tgtroot, n_mcep=40, alpha=0.42):
    """Extract WORLD/mel-cepstrum features for every wav under a tree.

    For each ``*.wav`` found recursively under ``srcroot`` the function
    writes ``<stem>.mcep.npy``, ``<stem>.c0.npy``, ``<stem>.f0.npy`` and
    ``<stem>.ap.npy`` into the mirrored directory under ``tgtroot``. Files
    whose four outputs already exist are skipped.

    Args:
        srcroot: Root directory searched for wav files.
        tgtroot: Root directory receiving the ``.npy`` outputs.
        n_mcep: Order passed to ``pysptk.sptk.mcep``.
        alpha: All-pass constant passed to ``pysptk.sptk.mcep``.

    Raises:
        ValueError: If ``srcroot`` does not exist.
    """
    src_root = pathlib.Path(srcroot)
    tgt_root = pathlib.Path(tgtroot)
    if not src_root.exists():
        raise ValueError('src not exists: {}'.format(src_root))

    for wav_path in sorted(src_root.glob('**/*.wav')):
        print(wav_path)
        out_dir = tgt_root / wav_path.parent.relative_to(src_root)
        tgt_stem = (out_dir / wav_path.name).with_suffix('')
        out_dir.mkdir(parents=True, exist_ok=True)

        outputs = {
            kind: tgt_stem.with_suffix('.' + kind + '.npy')
            for kind in ('mcep', 'c0', 'f0', 'ap')
        }
        if all(target.exists() for target in outputs.values()):
            print('skip')
            continue

        sr, wav = wavfile.read(wav_path)
        # Samples are divided by 32768 (the int16 full scale).
        x = (wav/32768.0).astype(np.float64)
        f0, sp, ap = pyworld.wav2world(x.astype(np.float64), sr)
        mcep = pysptk.sptk.mcep(sp, order=n_mcep, alpha=alpha, itype=4)

        f0 = f0.astype(np.float32)
        mcep = mcep.T.astype(np.float32)
        ap = ap.T.astype(np.float32)

        # After the transpose, row 0 of the cepstrum is stored separately
        # as c0 and only row 192 of the aperiodicity is kept.
        c0 = mcep[0, :]
        mcep = np.ascontiguousarray(mcep[1:, :])
        ap = ap[192, :]

        np.save(outputs['mcep'], mcep)
        np.save(outputs['c0'], c0)
        np.save(outputs['f0'], f0)
        np.save(outputs['ap'], ap)
        print(tgt_stem, flush=True)
def extract_feats(file, feats_dir):
    """Run WORLD analysis on a wav file and dump the features as text.

    Writes ``<name>.f0_ascii``, ``<name>.sp_ascii`` and ``<name>.ap_ascii``
    into ``feats_dir``, where ``<name>`` is the part of the file's base
    name before the first ``.wav``.

    Args:
        file: Path of the audio file to analyse.
        feats_dir: Directory receiving the three text files.
    """
    fname = os.path.basename(file).split('.wav')[0]
    samples, fs = sf.read(file)
    f0, sp, ap = pw.wav2world(samples, fs, frame_period=20)

    named_feats = (('.f0_ascii', f0), ('.sp_ascii', sp), ('.ap_ascii', ap))
    for suffix, feat in named_feats:
        np.savetxt(feats_dir + '/' + fname + suffix, feat)
def wav2world(wavfile):
    """Read a wav file and return its normalized WORLD features as an array.

    The file is analysed with ``vocoder.wav2world`` using ``hp.n_fft`` and
    ``ap_depth=hp.num_bap``; each stream is normalized by its project helper
    and the three streams are merged by ``world_features_to_one_tensor``.

    Args:
        wavfile: Path (or file object) accepted by ``sf.read``.

    Returns:
        np.ndarray: ``np.array`` of the merged feature tensor.
    """
    samples, rate = sf.read(wavfile)
    f0, sp, ap = vocoder.wav2world(
        samples, rate, hp.n_fft, ap_depth=hp.num_bap)

    # Feature normalization, in the same f0 / sp / ap order.
    normalized = (f0_normalize(f0), sp_normalize(sp), ap_normalize(ap))
    return np.array(world_features_to_one_tensor(*normalized))
def basic_analysis(wav, sample_rate):
    """Run default WORLD analysis on integer PCM samples.

    The samples are scaled to floats by dividing by ``2 ** (nbits - 1)``,
    where ``nbits`` is the bit width of the array's item size.

    Args:
        wav: NumPy array of samples (its ``itemsize`` defines the scale).
        sample_rate: Sampling rate passed to ``pyworld.wav2world``.

    Returns:
        tuple: ``(f0, smoothed_spectrogram, aperiodicity)`` with ``f0``
        reshaped to a single column.
    """
    full_scale = 2**(wav.itemsize * 8 - 1)
    float_data = wav.astype(np.float64) / full_scale
    f0, sp, ap = pyworld.wav2world(float_data, sample_rate)
    return f0.reshape((-1, 1)), sp, ap
def entropy(filename):
    """Return the entropy of the F0 contour of an audio file.

    The file is loaded with ``librosa.core.load``, analysed with WORLD, and
    ``scipy.stats.entropy`` is applied to the F0 track after trimming
    leading and trailing zeros.

    Args:
        filename: Path of the audio file.

    Returns:
        The value returned by ``scipy.stats.entropy``.
    """
    samples, sr = librosa.core.load(filename)
    f0 = pw.wav2world(samples.astype(np.float64), sr)[0]
    return scipy.stats.entropy(np.trim_zeros(f0))
def world_spectrogram_default(wav, sr=_sr):
    """Convert speech into WORLD vocoder features using default options.

    Args:
        wav: Audio samples as a NumPy array (cast to ``np.double``).
        sr: Sampling rate; defaults to the module-level ``_sr``.

    Returns:
        tuple: ``(f0, sp, ap)`` where
            f0 is the F0 contour,
            sp is the spectral envelope,
            ap is the aperiodicity.
    """
    samples = wav.astype(np.double)
    f0, sp, ap = pw.wav2world(samples, sr)  # default analysis options
    return f0, sp, ap
def changeFreq(self, data, freq_target):
    """Resynthesize ``data`` so that its mean voiced F0 equals ``freq_target``.

    The signal is analysed with WORLD at ``self.fs``; the whole F0 contour
    is scaled by ``freq_target / mean(voiced F0)`` and the signal is
    resynthesized with the original envelope and aperiodicity.

    Args:
        data: NumPy array of audio samples.
        freq_target: Desired mean F0 of the voiced frames.

    Returns:
        The synthesized signal, or ``data`` converted to float64 when no
        frame has a positive F0.
    """
    # np.float64 replaces the np.float alias, which current NumPy releases
    # no longer provide.
    data = data.astype(np.float64)
    f0, sp, ap = pw.wav2world(data, self.fs)

    voiced = f0 > 0
    if not voiced.any():
        # Nothing to rescale: no frame carries a positive F0.
        return data

    f0_mean = np.mean(f0[voiced])
    f0_new = f0 * freq_target / f0_mean
    return pw.synthesize(f0_new, sp, ap, self.fs, pw.default_frame_period)
def extract_feats(file, feats_dir):
    """Run WORLD analysis (1 ms frame period) and dump the features as text.

    Writes ``<name>.f0_ascii``, ``<name>.sp_ascii`` and ``<name>.ap_ascii``
    into ``feats_dir``, where ``<name>`` is the part of the file's base
    name before the first ``.wav``.

    Args:
        file: Path of the audio file to analyse.
        feats_dir: Directory receiving the three text files.
    """
    fname = os.path.basename(file).split('.wav')[0]
    x, fs = sf.read(file)
    f0, sp, ap = pw.wav2world(x, fs, frame_period=1)

    out_prefix = feats_dir + '/' + fname
    np.savetxt(out_prefix + '.f0_ascii', f0)
    np.savetxt(out_prefix + '.sp_ascii', sp)
    np.savetxt(out_prefix + '.ap_ascii', ap)
def get_data():
    """Analyse every ``.raw`` file in ``./data/raw`` with WORLD.

    Each raw file is read as mono, little-endian, 16-bit PCM at 48 kHz.
    The F0, spectral envelope and aperiodicity are written as text files
    named ``f0``, ``sp`` and ``ap`` inside
    ``./data/raw/processed/<index>/``, where ``<index>`` is the position of
    the file in the ``os.listdir`` result.
    """
    import os

    import numpy as np
    import pyworld as pw
    import soundfile as sf

    raw_folder = os.path.join(os.getcwd(), 'data', 'raw')
    processed_folder = os.path.join(raw_folder, 'processed')
    # Create the output root; exist_ok avoids the check-then-create race.
    os.makedirs(processed_folder, exist_ok=True)

    for i, filename in enumerate(os.listdir(raw_folder)):
        if not filename.endswith(".raw"):
            continue

        print("hree", i)
        data, samplerate = sf.read(os.path.join(raw_folder, filename),
                                   channels=1, endian='LITTLE',
                                   dtype='float', subtype='PCM_16',
                                   samplerate=48000)
        print("here2, data", data, "rate", samplerate)

        f0, sp, ap = pw.wav2world(data, fs=48000)
        print("passed through vocoder successfully. f0", f0)
        print("")
        print("")
        print("")
        print("sp", sp)
        print("")
        print("")
        print("")
        print("ap", ap)

        new_file_folder_path = os.path.join(processed_folder, str(i))
        os.makedirs(new_file_folder_path, exist_ok=True)

        # The features are NumPy arrays, which a text-mode file.write()
        # rejects; np.savetxt serializes them as plain text instead.
        for name, feature in (('f0', f0), ('sp', sp), ('ap', ap)):
            np.savetxt(os.path.join(new_file_folder_path, name), feature)
def spectral_entropy(filename):
    """Return the entropy of the periodogram of a file's F0 contour.

    The file is loaded with ``librosa.core.load`` and analysed with WORLD.
    The power spectral density of the zero-trimmed F0 track is estimated
    with ``scipy.signal.periodogram`` and passed to ``scipy.stats.entropy``.

    Args:
        filename: Path of the audio file.

    Returns:
        The value returned by ``scipy.stats.entropy`` for the PSD.
    """
    samples, sr = librosa.core.load(filename)
    f0, _sp, _ap = pw.wav2world(samples.astype(np.float64), sr)

    # Power spectral density of the F0 contour; the frequency axis is
    # not needed.
    _freqs, psd = scipy.signal.periodogram(np.trim_zeros(f0))

    # No explicit normalization here: per the SciPy docs, entropy()
    # normalizes its input when it does not sum to 1.
    return scipy.stats.entropy(psd)
def _process_utterance(out_dir, index, wav_path, text, phone):
    '''Preprocesses a single utterance audio/text pair.

    This writes the linear spectrogram and an encoded representation to
    disk and returns a tuple to write to the train.txt file. The encoded
    representation is the WORLD feature matrix (F0, coded spectral
    envelope, coded aperiodicity) when ``hparams.vocoder == "world"``, and
    the mel-scale spectrogram otherwise.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames.
        wav_path: Path to the audio file containing the speech input
        text: The text spoken in the input audio file
        phone: Phone-level information for the utterance; returned
            unchanged as the last tuple element.

    Returns:
        A (spectrogram_filename, encoded_filename, n_frames, text, phone)
        tuple to write to train.txt
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # The linear-scale spectrogram is computed and saved in both modes.
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    spectrogram_filename = 'synpaflex-spec-%05d.npy' % index

    if hparams.vocoder == "world":
        f0, sp, ap = pw.wav2world(wav.astype(np.double), hparams.sample_rate)
        ap_coded = pw.code_aperiodicity(ap, hparams.sample_rate)
        sp_coded = pw.code_spectral_envelope(
            sp, hparams.sample_rate, hparams.coded_env_dim)
        # One row per frame: [f0, coded envelope, coded aperiodicity].
        encoded = np.hstack([f0[:, np.newaxis], sp_coded, ap_coded])
        n_frames = encoded.shape[0]
        encoded_filename = 'synpaflex-world-%05d.npy' % index
    else:
        # Frame count comes from the linear spectrogram's second axis.
        n_frames = spectrogram.shape[1]
        # Compute a mel-scale spectrogram from the wav:
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
        encoded = mel_spectrogram.T
        encoded_filename = 'synpaflex-mel-%05d.npy' % index

    # Write the arrays to disk:
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, encoded_filename), encoded,
            allow_pickle=False)

    # Return a tuple describing this training example. The tuple must sit
    # on the same line as `return`; a bare `return` yields None.
    return (spectrogram_filename, encoded_filename, n_frames, text, phone)
def main():
    """Shift the source speaker's F0 towards the target's mean log-F0.

    The source audio (``oripath``) is analysed with WORLD; every voiced
    frame of its Harvest F0 track is moved in the log domain by the
    difference between the target and source means returned by
    ``get_mean_f0``. The result is resynthesized with the source envelope
    and aperiodicity and written to ``./synthesized.wav``.
    """
    # Source speaker: samples, sample rate and WORLD features used for
    # the final synthesis.
    src_x, src_fs = sf.read(oripath)
    src_f0, src_sp, src_ap = pw.wav2world(src_x, src_fs)
    # Harvest is used as well because it also returns the frame time axis.
    ori_f0, ori_timeaxis = pw.harvest(src_x, src_fs)

    # "Target" analysis, only used for the shape printouts below.
    # NOTE(review): this reads `oripath`, not `aimpath` - confirm intent.
    tgt_x, tgt_fs = sf.read(oripath)
    tgt_f0, tgt_sp, tgt_ap = pw.wav2world(tgt_x, tgt_fs)
    aim_f0, aim_timeaxis = pw.harvest(tgt_x, tgt_fs)

    # Mean F0 values as returned by get_mean_f0 (defined elsewhere);
    # presumably log-domain means - verify against that helper.
    aim_mean_f0 = get_mean_f0(aimpath)
    ori_mean_f0 = get_mean_f0(oripath)

    # Convert only the voiced frames (f0 > 0): take the log, shift by the
    # difference of the means, and map back with exp.
    for frame in range(len(ori_timeaxis)):
        if not ori_f0[frame] > 0:
            continue
        shifted_log_f0 = np.log(ori_f0[frame]) - ori_mean_f0 + aim_mean_f0
        ori_f0[frame] = np.exp(shifted_log_f0)

    # Diagnostic shape printouts.
    print('原始 _x:wav的尺寸_x.shape = ' + str(src_x.shape))
    print('目标:sp.shape[0] = ' + str(tgt_sp.shape[0]) + ' sp.shape = ' + str(tgt_sp.shape))
    print('目标 ap.shape = ' + str(tgt_ap.shape))
    print('f0.shape = ' + str(tgt_f0.shape) + '_f0.shape = ' + str(src_f0.shape))
    print('原始f0.shape = ' + str(tgt_f0.shape))
    print('原始_sp.shape = ' + str(src_sp.shape) + ' 原始_ap.shape = ' + str(src_ap.shape))

    synthesized = pw.synthesize(
        ori_f0, src_sp, src_ap, src_fs, pw.default_frame_period)
    sf.write('./synthesized.wav', synthesized, src_fs)
def stft_to_feats(vocals, fs, mode=config.comp_mode):
    """Compute compressed WORLD features for a vocal signal.

    The signal is analysed once with ``pw.wav2world`` (frame period
    5.80498866 ms). The envelope and aperiodicity are converted to dB and
    compressed with the function selected by ``mode``; the pitch track is
    expressed as ``69 + 12 * log2(f0 / 440)``.

    Args:
        vocals: Audio samples passed to ``pw.wav2world``.
        fs: Sampling rate of ``vocals``.
        mode: ``'mfsc'`` (uses ``sp_to_mfsc``) or ``'mgc'`` (uses
            ``sp_to_mgc``).

    Returns:
        np.ndarray: ``[harmonic features, aperiodic features, pitch, flag]``
        concatenated along axis 1, one row per frame.

    Raises:
        ValueError: If ``mode`` is neither ``'mfsc'`` nor ``'mgc'``.
    """
    # Validate before the expensive analysis; previously an unknown mode
    # only failed at the very end with an UnboundLocalError.
    if mode == 'mfsc':
        compress = sp_to_mfsc
    elif mode == 'mgc':
        compress = sp_to_mgc
    else:
        raise ValueError(
            "Unsupported mode {!r}; expected 'mfsc' or 'mgc'".format(mode))

    # A single analysis pass provides f0, envelope and aperiodicity.
    feats = pw.wav2world(vocals, fs, frame_period=5.80498866)

    ap = feats[2].reshape(
        [feats[1].shape[0], feats[1].shape[1]]).astype(np.float32)
    ap = 10. * np.log10(ap**2)
    harm = 10 * np.log10(feats[1].reshape(
        [feats[2].shape[0], feats[2].shape[1]]))

    f0 = feats[0]
    y = 69 + 12 * np.log2(f0 / 440)

    # nan_helper is defined elsewhere; the entries it flags are replaced by
    # linear interpolation. The flag column marks entries that were
    # infinite before that interpolation.
    nans, x = nan_helper(y)
    naners = np.isinf(y)
    y[nans] = np.interp(x(nans), x(~nans), y[~nans])

    y = np.array(y).reshape([len(y), 1])
    guy = np.array(naners).reshape([len(y), 1])
    y = np.concatenate((y, guy), axis=-1)

    harmy = compress(harm, 60, 0.45)
    apy = compress(ap, 4, 0.45)

    out_feats = np.concatenate((harmy, apy, y.reshape((-1, 2))), axis=1)
    return out_feats
def main(args):
    """Run the WORLD analysis/synthesis demo on the edited song file.

    Recreates the ``test`` directory, resynthesizes the input with three
    F0 pipelines (DIO, DIO + StoneMask, Harvest + StoneMask), writes the
    resulting wav files and saves comparison figures.

    Args:
        args: Namespace providing ``frame_period`` and ``speed``.
    """
    # Start from an empty output directory.
    if os.path.isdir('test'):
        rmtree('test')
    os.mkdir('test')

    x, fs = sf.read("./{}.wav".format(edited_files["Thinking_Out_Loud"]))

    # 1. The convenient way: one call with default options.
    default_f0, default_sp, default_ap = pw.wav2world(x, fs)
    default_y = pw.synthesize(
        default_f0, default_sp, default_ap, fs, pw.default_frame_period)

    # 2. Step by step.
    # 2-1 DIO without F0 refinement.
    dio_options = dict(f0_floor=50.0, f0_ceil=600.0, channels_in_octave=2,
                       frame_period=args.frame_period, speed=args.speed)
    raw_f0, t = pw.dio(x, fs, **dio_options)
    raw_sp = pw.cheaptrick(x, raw_f0, t, fs)
    raw_ap = pw.d4c(x, raw_f0, t, fs)
    raw_y = pw.synthesize(raw_f0, raw_sp, raw_ap, fs, args.frame_period)
    sf.write('test/y_without_f0_refinement.wav', raw_y, fs)

    # 2-2 DIO with F0 refinement (StoneMask).
    f0 = pw.stonemask(x, raw_f0, t, fs)
    sp = pw.cheaptrick(x, f0, t, fs)
    ap = pw.d4c(x, f0, t, fs)
    y = pw.synthesize(f0, sp, ap, fs, args.frame_period)
    sf.write('test/y_with_f0_refinement.wav', y, fs)

    # 2-3 Harvest with F0 refinement (StoneMask).
    harvest_f0, harvest_t = pw.harvest(x, fs)
    refined_h = pw.stonemask(x, harvest_f0, harvest_t, fs)
    sp_h = pw.cheaptrick(x, refined_h, harvest_t, fs)
    ap_h = pw.d4c(x, refined_h, harvest_t, fs)
    y_h = pw.synthesize(refined_h, sp_h, ap_h, fs, pw.default_frame_period)
    sf.write('test/y_harvest_with_f0_refinement.wav', y_h, fs)

    # Comparison figures: unrefined vs. refined DIO pipeline.
    savefig('test/wavform.png', [x, raw_y, y])
    savefig('test/sp.png', [raw_sp, sp])
    savefig('test/ap.png', [raw_ap, ap], log=False)
    savefig('test/f0.png', [raw_f0, f0])

    print('Please check "test" directory for output files')
def world(y, sample_rate, fft_size, hop_size):
    """Run WORLD analysis and return the features as float tensors.

    Args:
        y: Audio as a ``torch.Tensor`` or NumPy array. A 2-D input has its
            first axis squeezed away before analysis.
        sample_rate: Sampling rate of ``y``.
        fft_size: FFT size passed to WORLD.
        hop_size: Hop length in samples; converted to a frame period of
            ``1000 * hop_size / sample_rate``.

    Returns:
        tuple: ``(f0, sp, ap)`` as float ``torch.Tensor`` objects.
    """
    samples = y.numpy() if isinstance(y, torch.Tensor) else y
    if samples.ndim == 2:
        samples = samples.squeeze(0)
    samples = samples.astype('float64')

    frame_period = 1000*hop_size/sample_rate
    world_feats = pw.wav2world(
        samples, sample_rate, fft_size=fft_size, frame_period=frame_period)
    f0, sp, ap = world_feats

    return tuple(torch.from_numpy(feat).float() for feat in (f0, sp, ap))
def compute_features_from_path(path):
    """Collect WORLD features for the validation utterances under ``path``.

    The utterance ids are the entries of ``transcript.index`` that contain
    ``hp.validpatt``. For each id, the first directory entry whose name
    contains the id is read and analysed with ``pw.wav2world``.

    Args:
        path: Directory containing the audio files.

    Returns:
        dict: ``{'sp_list': [...], 'f0_list': [...], 'ap_list': [...]}``
        with one array per utterance, in transcript order.

    Raises:
        IndexError: If no file name in ``path`` contains an utterance id.
    """
    from tqdm import tqdm

    d = {'sp_list': [], 'f0_list': [], 'ap_list': []}

    # List the directory once instead of once per utterance.
    filenames = os.listdir(path)
    utt_ids = transcript[transcript.index.str.contains(hp.validpatt)].index

    for utt_id in tqdm(utt_ids):
        matches = [name for name in filenames if utt_id in name]
        if not matches:
            raise IndexError(
                'no file in {!r} matches utterance id {!r}'.format(
                    path, utt_id))

        # os.path.join works whether or not `path` ends with a separator.
        wav, fs = sf.read(os.path.join(path, matches[0]))
        f0, sp, ap = pw.wav2world(wav, fs)

        d['sp_list'].append(sp)
        d['f0_list'].append(f0)
        d['ap_list'].append(ap)

    return d
def main(args):
    """Run the WORLD analysis/synthesis demo on ``utterance/vaiueo2d.wav``.

    Recreates the ``test`` directory, resynthesizes the input with three
    F0 pipelines (DIO, DIO + StoneMask, Harvest + StoneMask), writes the
    resulting wav files and saves comparison figures.

    Args:
        args: Namespace providing ``frame_period`` and ``speed``.
    """
    # Start from an empty output directory.
    if os.path.isdir('test'):
        rmtree('test')
    os.mkdir('test')

    x, fs = sf.read('utterance/vaiueo2d.wav')

    # 1. The convenient way: one call with default options.
    default_f0, default_sp, default_ap = pw.wav2world(x, fs)
    default_y = pw.synthesize(
        default_f0, default_sp, default_ap, fs, pw.default_frame_period)

    # 2. Step by step.
    # 2-1 DIO without F0 refinement.
    dio_options = dict(f0_floor=50.0, f0_ceil=600.0, channels_in_octave=2,
                       frame_period=args.frame_period, speed=args.speed)
    raw_f0, t = pw.dio(x, fs, **dio_options)
    raw_sp = pw.cheaptrick(x, raw_f0, t, fs)
    raw_ap = pw.d4c(x, raw_f0, t, fs)
    raw_y = pw.synthesize(raw_f0, raw_sp, raw_ap, fs, args.frame_period)
    sf.write('test/y_without_f0_refinement.wav', raw_y, fs)

    # 2-2 DIO with F0 refinement (StoneMask).
    f0 = pw.stonemask(x, raw_f0, t, fs)
    sp = pw.cheaptrick(x, f0, t, fs)
    ap = pw.d4c(x, f0, t, fs)
    y = pw.synthesize(f0, sp, ap, fs, args.frame_period)
    sf.write('test/y_with_f0_refinement.wav', y, fs)

    # 2-3 Harvest with F0 refinement (StoneMask).
    harvest_f0, harvest_t = pw.harvest(x, fs)
    refined_h = pw.stonemask(x, harvest_f0, harvest_t, fs)
    sp_h = pw.cheaptrick(x, refined_h, harvest_t, fs)
    ap_h = pw.d4c(x, refined_h, harvest_t, fs)
    y_h = pw.synthesize(refined_h, sp_h, ap_h, fs, pw.default_frame_period)
    sf.write('test/y_harvest_with_f0_refinement.wav', y_h, fs)

    # Comparison figures: unrefined vs. refined DIO pipeline.
    savefig('test/wavform.png', [x, raw_y, y])
    savefig('test/sp.png', [raw_sp, sp])
    savefig('test/ap.png', [raw_ap, ap], log=False)
    savefig('test/f0.png', [raw_f0, f0])

    print('Please check "test" directory for output files')
def world(self):
    """Extract vocoder features from ``self.data`` using WORLD.

    The integer samples are scaled to floats by dividing by
    ``2 ** (nbits - 1)``, where ``nbits`` is the bit width of the array's
    item size, and analysed at ``self.sample_rate`` with default options.

    Note:
        Unvoiced frames are represented by 0.0 in the F0 track.

    Returns:
        (np.ndarray[n_frames]): fundamental frequency,
        (np.ndarray[n_frames, sp_dim]): smoothed spectrogram,
        (np.ndarray[n_frames, ap_dim]): aperiodicity.
    """
    full_scale = 2**(self.data.itemsize * 8 - 1)
    float_data = self.data.astype(np.float64) / full_scale
    features = pyworld.wav2world(float_data, self.sample_rate)
    f0, smoothed_spectrogram, aperiodicity = features
    return f0, smoothed_spectrogram, aperiodicity
def actuar(self, text):
    """Speak ``text`` with a WORLD-modified espeak voice.

    espeak renders the text into ``ARCHIVO`` using voice ``VOZ``. The
    recording is analysed with WORLD and resynthesized with F0 divided by
    ``GRAVEDAD``, the spectral envelope divided by
    ``ATENUACION_DEL_VOLUMEN`` and the synthesis rate divided by
    ``VELOCIDAD_DEL_DISCURSO``. The result overwrites ``ARCHIVO``, is
    played through the pygame mixer, and the text is echoed to stdout.

    Args:
        text: The sentence to speak.
    """
    import subprocess

    # Pass the arguments as a list (no shell) so quotes or shell
    # metacharacters inside `text` cannot break or inject commands.
    subprocess.run(['espeak', '-v', VOZ, text, '-w', ARCHIVO], check=False)

    x, fs = sf.read(ARCHIVO)
    f0, sp, ap = pw.wav2world(x, fs)
    yy = pw.synthesize(f0 / GRAVEDAD,
                       sp / ATENUACION_DEL_VOLUMEN,
                       ap,
                       fs / VELOCIDAD_DEL_DISCURSO,
                       pw.default_frame_period)
    sf.write(ARCHIVO, yy, fs)

    # Play the file and block until playback has finished.
    mixer.init()
    mixer.music.load(ARCHIVO)
    mixer.music.play()
    while mixer.music.get_busy():
        pygame.time.Clock().tick(10)
    mixer.quit()

    print("robot:$ " + text)
def file_to_sac(input_file):
    """Return the pitch track of an audio file as a two-column array.

    Channel index 1 of the file is analysed with ``pw.wav2world`` (frame
    period 5.80498866 ms) and the F0 track is expressed as
    ``69 + 12 * log2(f0 / 440)``.

    Args:
        input_file: Path (or file object) accepted by ``sf.read``; the
            audio must have at least two channels.

    Returns:
        np.ndarray: One row per frame: ``[pitch, flag]``, where the flag
        marks entries that were infinite before interpolation.
    """
    audio, fs = sf.read(input_file)
    vocals = np.array(audio[:, 1])

    f0 = pw.wav2world(vocals, fs, frame_period=5.80498866)[0]
    pitch = 69 + 12 * np.log2(f0 / 440)

    # nan_helper is defined elsewhere; the entries it flags are replaced
    # by linear interpolation over the remaining entries.
    invalid, to_index = nan_helper(pitch)
    inf_flags = np.isinf(pitch)
    pitch[invalid] = np.interp(
        to_index(invalid), to_index(~invalid), pitch[~invalid])

    n_frames = len(pitch)
    pitch_col = np.array(pitch).reshape([n_frames, 1])
    flag_col = np.array(inf_flags).reshape([n_frames, 1])
    return np.concatenate((pitch_col, flag_col), axis=-1)
def main(args):
    """Run the WORLD analysis/synthesis demo on ``args.input``.

    Recreates the ``test`` directory, resynthesizes the input with three
    F0 pipelines (DIO, DIO + StoneMask, Harvest + StoneMask), writes the
    resulting wav files and saves comparison images.

    Args:
        args: Namespace providing ``input``, ``frame_period`` and ``speed``.
    """
    # Start from an empty output directory.
    if os.path.isdir('test'):
        rmtree('test')
    os.mkdir('test')

    # Read the speech sample.
    x, fs = sf.read(args.input)

    # 1. The convenient way: one call with default options.
    default_f0, default_sp, default_ap = pw.wav2world(x, fs)
    default_y = pw.synthesize(
        default_f0, default_sp, default_ap, fs, pw.default_frame_period)

    # 2. Step by step.
    # 2-1 DIO without F0 refinement.
    dio_options = dict(f0_floor=50.0, f0_ceil=600.0, channels_in_octave=2,
                       frame_period=args.frame_period, speed=args.speed)
    raw_f0, t = pw.dio(x, fs, **dio_options)
    raw_sp = pw.cheaptrick(x, raw_f0, t, fs)
    raw_ap = pw.d4c(x, raw_f0, t, fs)
    raw_y = pw.synthesize(raw_f0, raw_sp, raw_ap, fs, args.frame_period)
    sf.write('test/y_without_f0_refinement.wav', raw_y, fs)

    # 2-2 DIO with F0 refinement (StoneMask).
    f0 = pw.stonemask(x, raw_f0, t, fs)
    sp = pw.cheaptrick(x, f0, t, fs)
    ap = pw.d4c(x, f0, t, fs)
    y = pw.synthesize(f0, sp, ap, fs, args.frame_period)
    sf.write('test/y_with_f0_refinement.wav', y, fs)

    # 2-3 Harvest with F0 refinement (StoneMask).
    harvest_f0, harvest_t = pw.harvest(x, fs)
    refined_h = pw.stonemask(x, harvest_f0, harvest_t, fs)
    sp_h = pw.cheaptrick(x, refined_h, harvest_t, fs)
    ap_h = pw.d4c(x, refined_h, harvest_t, fs)
    y_h = pw.synthesize(refined_h, sp_h, ap_h, fs, pw.default_frame_period)
    sf.write('test/y_harvest_with_f0_refinement.wav', y_h, fs)

    # Comparison images: unrefined vs. refined DIO pipeline.
    save_image('test/wavform.png', [x, raw_y, y])
    save_image('test/sp.png', [raw_sp, sp])
    save_image('test/ap.png', [raw_ap, ap], log=False)
    save_image('test/f0.png', [raw_f0, f0])
def load(cls, wavfile: str) -> Frq:
    """Build an instance from cached or freshly computed WORLD features.

    The cache file is ``wavfile`` with its suffix replaced by
    ``EXTENSION``. When it exists its contents are loaded with ``np.load``;
    otherwise the wav file is analysed, the features are stored with
    ``np.savez`` and then used directly.

    Args:
        wavfile: Path of the audio file.

    Returns:
        ``cls`` constructed from the ``f0``, ``sp`` and ``ap`` entries.

    Raises:
        RuntimeError: If the estimated F0 track contains no non-zero value.
    """
    cache_path = pathlib.Path(wavfile).with_suffix(EXTENSION)
    if cache_path.is_file():
        return cls(**np.load(cache_path))

    # NOTE: WORLD analysis only works on mono-channel float64 samples.
    samples, rate = soundfile.read(wavfile, dtype="float64")
    f0, sp, ap = pyworld.wav2world(samples, rate)

    voiced_indices = f0.nonzero()[0]
    if not voiced_indices.size:
        raise RuntimeError(f"f0 estimation failed for {wavfile}!!!")

    data = {"f0": f0, "sp": sp, "ap": ap}
    np.savez(cache_path, **data)
    return cls(**data)
def main():
    """Resynthesize a sample with each WORLD feature doubled in turn.

    Reads ``Datas/vaiueo2d.wav``, extracts F0, spectral envelope and
    aperiodicity with default options, and writes four files under
    ``test/``: the plain resynthesis and one variant each with F0, the
    spectral envelope, or the aperiodicity multiplied by two.
    """
    # Read the input and extract features with default options.
    x, fs = sf.read('Datas/vaiueo2d.wav')
    f0, sp, ap = pw.wav2world(x, fs)

    # Baseline resynthesis.
    y_default = pw.synthesize(f0, sp, ap, fs, pw.default_frame_period)
    sf.write('test/default.wav', y_default, fs)

    y_f0_x2 = pw.synthesize(f0 * 2, sp, ap, fs, pw.default_frame_period)
    sf.write('test/f0_x2.wav', y_f0_x2, fs)

    y_sp_x2 = pw.synthesize(f0, sp * 2, ap, fs, pw.default_frame_period)
    sf.write('test/sp_x2.wav', y_sp_x2, fs)

    # Scale the aperiodicity so this output actually differs from the
    # baseline (it was previously synthesized from the unmodified ap).
    # NOTE(review): ap * 2 may exceed 1.0 - confirm how WORLD handles
    # out-of-range aperiodicity if the exact result matters.
    y_ap_x2 = pw.synthesize(f0, sp, ap * 2, fs, pw.default_frame_period)
    sf.write('test/ap_x2.wav', y_ap_x2, fs)
def main(args):
    """Run the WORLD demo using the option-object variant of the API.

    Recreates the ``test`` directory, resynthesizes
    ``utterance/vaiueo2d.wav`` with and without StoneMask F0 refinement,
    writes the resulting wav files and saves comparison figures.

    Args:
        args: Namespace providing ``frame_rate`` (used as the DIO frame
            period) and ``speed``.
    """
    # Start from an empty output directory.
    if os.path.isdir('test'):
        rmtree('test')
    os.mkdir('test')

    x, fs = sf.read('utterance/vaiueo2d.wav')

    # 1. The convenient way: defaults; the call also returns the option
    # object that carries the frame period it used.
    default_f0, default_sp, default_ap, default_opt = pw.wav2world(x, fs)
    default_y = pw.synthesize(default_f0, default_sp, default_ap, fs,
                              default_opt.option['frame_period'])

    # 2. Step by step, with explicit DIO options.
    dio_opt = pw.pyDioOption(f0_floor=50,
                             f0_ceil=600,
                             channels_in_octave=2,
                             frame_period=args.frame_rate,
                             speed=args.speed)

    # 2-1 Without F0 refinement.
    raw_f0, t = pw.dio(x, fs, dio_opt)
    raw_sp = pw.cheaptrick(x, raw_f0, t, fs)
    raw_ap = pw.d4c(x, raw_f0, t, fs)
    raw_y = pw.synthesize(raw_f0, raw_sp, raw_ap, fs,
                          dio_opt.option['frame_period'])
    sf.write('test/y_without_f0_refinement.wav', raw_y, fs)

    # 2-2 With F0 refinement (StoneMask).
    f0 = pw.stonemask(x, raw_f0, t, fs)
    sp = pw.cheaptrick(x, f0, t, fs)
    ap = pw.d4c(x, f0, t, fs)
    y = pw.synthesize(f0, sp, ap, fs, dio_opt.option['frame_period'])
    sf.write('test/y_with_f0_refinement.wav', y, fs)

    # Comparison figures: unrefined vs. refined pipeline.
    savefig('test/wavform.png', [x, raw_y, y])
    savefig('test/sp.png', [raw_sp, sp])
    savefig('test/ap.png', [raw_ap, ap], log=False)
    savefig('test/f0.png', [raw_f0, f0])

    print('Please check "test" directory for output files')
def save_wav_ceps(fake_B, input_path, sample_path):
    """Synthesize audio from generated cepstra and save it as a wav file.

    The input wav is cut into chunks of 14000 samples (first channel
    only). For each (cepstrum block, chunk) pair the chunk's F0 and
    aperiodicity are taken from WORLD analysis, the cepstra are rescaled
    and converted to a spectral envelope with ``pysptk.mc2sp``, and the
    chunk is resynthesized. All synthesized chunks are concatenated and
    written to ``sample_path + '_fake.wav'`` as int16.

    Args:
        fake_B: Iterable of 2-D cepstrum blocks (frames x coefficients).
            The blocks are rescaled **in place**.
        input_path: Path of the reference wav file (multi-channel, since
            column 0 is selected).
        sample_path: Output path prefix.
    """
    length = 14000
    # First value returned by wav.read; used as the sample rate below.
    bps, wav_data = wav.read(input_path)
    datas = [
        wav_data[i:i + length, 0] for i in range(0, len(wav_data), length)
    ]

    segments = []
    for (b, d) in zip(fake_B, datas):
        # A column slice of a 2-D array is not contiguous and keeps the
        # file's dtype; hand WORLD a contiguous float64 copy instead.
        d = np.ascontiguousarray(d, dtype=np.float64)
        f0, _, pitch = pw.wav2world(d, bps)

        # Undo the scaling of the generated cepstra: coefficient 0 uses
        # (x * 28) - 20, every other coefficient uses (x * 7) - 3.
        for cep in b:
            for i, Scep in enumerate(cep):
                if i == 0:
                    cep[i] = (Scep * 28) - 20
                else:
                    cep[i] = (Scep * 7) - 3

        sp = pysptk.mc2sp(b, 0.48, 2048)
        # Collect the chunk; np.append returns a new array, so its result
        # must not be discarded.
        segments.append(pw.synthesize(f0, sp, pitch, bps))

    wave = np.concatenate(segments) if segments else np.zeros(0)
    wave = np.reshape(wave, -1).astype('int16')
    wav.write(sample_path + '_fake.wav', bps, wave)
def main(args):
    """Write resyntheses of ``voice.wav`` with scaled F0 and envelope.

    The unmodified resynthesis is written to both output folders as
    ``y_10_semplice.wav``. Then, for every ``i`` in 1..19 except 10, the
    F0 (folder ``test_f0``) or both the F0 and the spectral envelope
    (folder ``test_f0+sp``) are scaled by ``i / 10`` before synthesis.

    Args:
        args: Namespace providing ``frame_period`` for the scaled variants.
    """
    x, fs = sf.read('voice.wav')
    f0, sp, ap = pw.wav2world(x, fs)

    # Reference resynthesis (scale factor 1.0) for both folders.
    y = pw.synthesize(f0, sp, ap, fs, pw.default_frame_period)
    sf.write('test_f0/y_10_semplice.wav', y, fs)
    sf.write('test_f0+sp/y_10_semplice.wav', y, fs)

    steps = [i for i in range(1, 20) if i != 10]

    # F0 scaled only.
    for i in steps:
        scaled_f0 = (i / 10) * np.array(f0)
        scaled_y = pw.synthesize(scaled_f0, sp, ap, fs, args.frame_period)
        sf.write('test_f0/y_' + str(i) + '.wav', scaled_y, fs)

    # F0 and spectral envelope scaled together.
    for i in steps:
        scaled_f0 = (i / 10) * np.array(f0)
        scaled_sp = (i / 10) * np.array(sp)
        scaled_y = pw.synthesize(
            scaled_f0, scaled_sp, ap, fs, args.frame_period)
        sf.write('test_f0+sp/y_' + str(i) + '.wav', scaled_y, fs)

    print('Please check "test" directory for output files')
def save_mcg_np(path):
    """Save MCEP features for every ``.wav`` file in ``path`` as ``.npy``.

    For each wav file the WORLD spectral envelope is converted to
    mel-cepstral coefficients and stored next to the audio as
    ``<stem>.npy``. The last three characters of every stem are collected
    as recording ids and written, sorted and one per line, to
    ``rec_ids.txt``. If ``rec_ids.txt`` already exists the folder is
    assumed to be processed and nothing is done.

    Args:
        path: Directory containing the wav files.
    """
    rec_ids_path = os.path.join(path, 'rec_ids.txt')

    # Check if recording ID list (and thereby numpy representations)
    # have already been created.
    if os.path.isfile(rec_ids_path):
        print('Recording ID list already exists. Assuming numpy arrays'
              ' exist as well. Skipping this folder.')
        return

    suffix = '.wav'
    # File endings, saved as a text file for later use.
    rec_id_list = []

    # Iterate through all .wav files and save as mcep feature arrays.
    for filename in os.listdir(path):
        if not filename.endswith(suffix):
            continue

        # Slice the suffix off: str.rstrip('.wav') strips any trailing
        # '.', 'w', 'a' or 'v' characters and can eat into the stem.
        stem = filename[:-len(suffix)]
        rec_id_list.append(stem[-3:])

        wav_path = os.path.join(path, filename)
        loaded_wav, _ = librosa.load(wav_path, sr=SAMPLING_RATE)

        # Use WORLD vocoder for spectral envelope.
        _, sp, _ = pyworld.wav2world(loaded_wav.astype(np.double),
                                     fs=SAMPLING_RATE,
                                     frame_period=FRAME_PERIOD,
                                     fft_size=fft_size)

        # Extract MCEP features.
        mgc = pysptk.sptk.mcep(sp, order=mcep_size, alpha=alpha, maxiter=0,
                               etype=1, eps=1.0E-8, min_det=0.0, itype=3)

        # Save as numpy.
        np.save(os.path.join(path, stem + '.npy'), mgc, allow_pickle=False)

    # Save list of file endings; the context manager closes the file.
    with open(rec_ids_path, 'w') as rec_id_file:
        rec_id_file.writelines(
            rec_id + '\n' for rec_id in sorted(rec_id_list))