def _process_utterance(in_path, out_path, speaker, text):
    """Resample one wav file to the configured rate, trim it, and save it.

    Args:
        in_path: Path of the source wav file, read with ``read_wav_np``.
        out_path: Path the processed wav is written to.
        speaker: Speaker identifier, passed through unchanged.
        text: Transcript for the utterance; trailing newlines are stripped.

    Returns:
        ``(out_path, text_without_trailing_newlines, speaker)``, or ``None``
        when the input file could not be read.
    """
    # Unreadable inputs are skipped on purpose (best effort), but only
    # ordinary errors are swallowed: a bare ``except`` would also eat
    # KeyboardInterrupt / SystemExit and make a batch job hard to abort.
    try:
        old_samplerate, old_audio = read_wav_np(in_path)
    except Exception:
        return None

    # Change sampling rate
    new_samplerate = hparams.sampling_rate
    if old_samplerate != new_samplerate:
        # Linear interpolation over a shared time axis; axis 0 of the audio
        # array is treated as the sample (time) axis.
        num_old = old_audio.shape[0]
        duration = num_old / old_samplerate
        num_new = int(num_old * new_samplerate / old_samplerate)
        time_old = np.linspace(0, duration, num_old)
        time_new = np.linspace(0, duration, num_new)
        interpolator = interpolate.interp1d(time_old, old_audio.T)
        new_audio = interpolator(time_new).T.astype(np.float32)
    else:
        new_audio = old_audio

    # Trim
    wav, _ = librosa.effects.trim(
        new_audio, top_db=25, frame_length=2048, hop_length=512)

    # Write with the rate the audio was actually resampled to; a hard-coded
    # 22050 mislabels the file whenever hparams.sampling_rate differs.
    write(out_path, new_samplerate, wav)

    return (out_path, text.rstrip('\n'), speaker)
def _process_utterance(in_path, out_path, spk_name_idx):
    """Convert one raw PCM recording to a trimmed 22050 Hz wav file.

    The input is converted with the external ``sox`` program, read back,
    trimmed with ``librosa.effects.trim`` and written over ``out_path``.
    The transcript is the first line of the file whose path is derived from
    ``in_path`` by replacing ``'wav'`` with ``'script'`` and ``'.pcm'`` with
    ``'.pron'``.

    Args:
        in_path: Path of the raw input; passed to sox as little-endian,
            mono, signed 16-bit, 44100 Hz raw data.
        out_path: Path of the wav file to produce.
        spk_name_idx: Index into ``in_path.split('/')`` selecting the
            path component used as the speaker name.

    Returns:
        ``(out_path, transcript_without_trailing_newlines, speaker)``.

    Raises:
        subprocess.CalledProcessError: If sox exits with a non-zero status.
        FileNotFoundError: If sox or the transcript file does not exist.
    """
    import subprocess

    target_rate = 22050

    # wav is saved as int 16
    # The command is passed as an argument list (no shell) so paths that
    # contain spaces or shell metacharacters are handled literally, and
    # check=True stops a failed conversion from silently reading a stale
    # or missing output file afterwards.
    subprocess.run(
        [
            'sox',
            '-L', '-c', '1', '-e', 'signed', '-b', '16',
            '-t', 'raw', '-r', '44100', in_path,
            '-c', '1', '-e', 'signed', '-b', '16',
            '-t', 'wav', '-r', str(target_rate), out_path,
        ],
        check=True,
    )

    # int16 is converted into float32 here (per the original note on
    # read_wav_np; its implementation is not visible from this block).
    _, audio = read_wav_np(out_path)
    wav, _ = librosa.effects.trim(
        audio, top_db=25, frame_length=2048, hop_length=512)

    txt_file = in_path.replace('wav', 'script').replace('.pcm', '.pron')
    # utf-8-sig drops a leading byte-order mark if the file has one.
    with open(txt_file, 'r', encoding='utf-8-sig') as f:
        line = f.readline()

    speaker = in_path.split('/')[spk_name_idx]

    # Overwrite the sox output with the trimmed audio.
    write(out_path, target_rate, wav)
    return (out_path, line.rstrip('\n'), speaker)
def _process_utterance(in_path):
    """Check whether a wav file can be read.

    Args:
        in_path: Path of the wav file to probe with ``read_wav_np``.

    Returns:
        An empty string when the file is read without error, otherwise
        ``in_path`` followed by a newline.
    """
    # Catch ordinary errors only: a bare ``except`` would also swallow
    # KeyboardInterrupt / SystemExit and report them as a bad file.
    try:
        read_wav_np(in_path)
    except Exception:
        return in_path + '\n'
    return ''