def compute_spect(wav, ref_level_db=16, pad_len=192):
    """Turn a waveform into a normalised mel-spectrogram tensor on ``device``.

    The pipeline is: ``pySTFT`` -> projection onto ``mel_basis`` -> dB
    conversion -> affine rescaling ``(dB + 100) / 100`` -> leading axis
    added -> optional padding -> ``torch`` float32 tensor.

    Args:
        wav: waveform handed to ``pySTFT`` (expected layout is whatever that
            helper accepts).
        ref_level_db: value subtracted from the dB spectrogram. Defaults to
            16, the constant this function previously hard-coded.
        pad_len: when axis 1 of the result (presumably the frame axis --
            confirm against ``pySTFT``) is not longer than this, the array is
            passed through ``pad_seq_to_2`` with this length. Defaults to
            192, the previous hard-coded value.

    Returns:
        A float32 ``torch`` tensor with one extra leading axis, moved to the
        module-level ``device``.
    """
    # compute spectrogram
    magnitudes = pySTFT(wav).T
    mel = np.dot(magnitudes, mel_basis)
    # Clamp from below with min_level before taking the logarithm.
    mel_db = 20 * np.log10(np.maximum(min_level, mel)) - ref_level_db
    spect = (mel_db + 100) / 100

    # Add a leading axis of size 1 in front of the two spectrogram axes.
    spect = spect[np.newaxis, :, :]
    if spect.shape[1] <= pad_len:
        # pad_seq_to_2 returns a pair; only the first element is used here.
        spect, _ = pad_seq_to_2(spect, pad_len)

    return torch.from_numpy(spect.astype(np.float32)).to(device)
def _processing_data(hparams, full_path, spk_label, spk_emb, gender, npz_name,
                     pbar, i):
    """Extract mel-spectrogram and normalised f0 from one file and save them.

    The audio is loaded at ``hparams.sample_rate``, filtered with the
    module-level ``b``/``a`` coefficients via ``signal.filtfilt``, scaled and
    dithered, then converted to a mel-spectrogram and an f0 track (RAPT).
    The results are written with ``np.savez``.

    Args:
        hparams: object providing ``sample_rate``, ``hop_size`` and
            ``ref_level_db``.
        full_path: path of the audio file given to ``librosa.load``.
        spk_label: stored unchanged under the ``'spk_label'`` key.
        spk_emb: optional; stored under ``'spk_emb'`` when not ``None``.
        gender: ``'M'`` or ``'F'``; selects the f0 search range for RAPT
            (50-250 for ``'M'``, 100-600 for ``'F'``).
        npz_name: destination passed to ``np.savez``.
        pbar: object with an ``update`` method, called with ``i`` at the end.
        i: value forwarded to ``pbar.update``.
            NOTE(review): if ``pbar`` is a tqdm bar, ``update`` takes an
            increment rather than an absolute position -- confirm which
            progress-bar library the caller uses.

    Raises:
        ValueError: if ``gender`` is neither ``'M'`` nor ``'F'``.
    """
    if gender == 'M':
        lo, hi = 50, 250
    elif gender == 'F':
        lo, hi = 100, 600
    else:
        raise ValueError(
            "gender must be 'M' or 'F', got {!r}".format(gender))

    # The previous seed expression, int(random.random()), always evaluated to
    # 0 because random.random() lies in [0, 1). The fixed seed is now spelled
    # out so the dither below stays exactly as reproducible as before.
    prng = RandomState(0)

    x, fs = librosa.load(full_path, sr=hparams.sample_rate)
    assert fs == hparams.sample_rate, (
        "librosa returned sample rate {} instead of {}".format(
            fs, hparams.sample_rate))

    # When the length is an exact multiple of the hop size, append one tiny
    # sample (presumably so the STFT and RAPT frame counts agree -- see the
    # length check further down).
    if x.shape[0] % hparams.hop_size == 0:
        x = np.concatenate((x, np.array([1e-06])), axis=0)

    y = signal.filtfilt(b, a, x)
    # Scale to 0.96 and add uniform dither of amplitude 1e-06.
    wav = y * 0.96 + (prng.rand(y.shape[0]) - 0.5) * 1e-06

    # compute spectrogram
    D = pySTFT(wav).T
    D_mel = np.dot(D, mel_basis)
    D_db = 20 * np.log10(np.maximum(min_level, D_mel)) - hparams.ref_level_db
    S = (D_db + 100) / 100

    # extract f0
    f0_rapt = sptk.rapt(wav.astype(np.float32) * 32768, fs, hparams.hop_size,
                        min=lo, max=hi, otype=2)
    # Frames equal to -1e10 are excluded from the statistics (this looks like
    # the unvoiced marker of the log-f0 output type -- confirm in pysptk).
    index_nonzero = (f0_rapt != -1e10)
    # NOTE(review): if every frame is excluded, mean/std are taken over an
    # empty selection and come out as NaN.
    voiced_f0 = f0_rapt[index_nonzero]
    mean_f0, std_f0 = np.mean(voiced_f0), np.std(voiced_f0)
    f0_norm = speaker_normalization(f0_rapt, index_nonzero, mean_f0, std_f0)

    assert len(S) == len(f0_rapt), (
        "spectrogram has {} frames but f0 track has {}".format(
            len(S), len(f0_rapt)))

    data = {
        'mel': S.astype(np.float32),
        'f0': f0_norm.astype(np.float32),
        'spk_label': spk_label,
    }
    if spk_emb is not None:
        data['spk_emb'] = spk_emb
    np.savez(npz_name, **data)
    pbar.update(i)
try: x, fs = sf.read(wav_path) except Exception as e: # print("Error on {}".format(basename)) print(e) if 'System error' not in str( e ) else None # preprocessed dir can have no wav file due to the lenght constraint. continue # assert fs == 16000 if x.shape[0] % 256 == 0: x = np.concatenate((x, np.array([1e-06])), axis=0) y = signal.filtfilt(b, a, x) wav = y * 0.96 + (prng.rand(y.shape[0]) - 0.5) * 1e-06 # compute spectrogram D = pySTFT(wav).T D_mel = np.dot(D, mel_basis) D_db = 20 * np.log10(np.maximum(min_level, D_mel)) - 16 S = (D_db + 100) / 100 # extract f0 f0_rapt = sptk.rapt(wav.astype(np.float32) * 32768, fs, 256, min=lo, max=hi, otype=2) index_nonzero = (f0_rapt != -1e10) mean_f0, std_f0 = np.mean(f0_rapt[index_nonzero]), np.std( f0_rapt[index_nonzero]) f0_norm = speaker_normalization(f0_rapt, index_nonzero, mean_f0,