import os

import numpy as np
from nnmnkwii import preprocessing as P

import audio  # repo-local audio utilities (assumed module path)
from hparams import hparams  # repo-local hyperparameters (assumed module path)


def _process_utterance(out_dir, index, wav_path, text, silence_threshold, fft_size):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    if hparams.input_type != "raw":
        # Mu-law quantize
        out = P.mulaw_quantize(wav)

        # Trim silences
        start, end = audio.start_and_end_indices(out, silence_threshold)
        out = out[start:end]
        wav = wav[start:end]
        constant_value = P.mulaw_quantize(0, 256)
        out_dtype = np.int16
    else:
        out = wav
        constant_value = 0.
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft;
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_value)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment:
    # ensure length of raw audio is a multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    timesteps = len(out)

    wav_id = os.path.basename(wav_path).split('.')[0]

    # Write the spectrograms to disk:
    audio_filename = '{}-audio.npy'.format(wav_id)
    mel_filename = '{}-mel.npy'.format(wav_id)
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return audio_filename, mel_filename, timesteps, text
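# `audio.lws_pad_lr` above computes the padding that lws adds internally
# around the STFT, so audio length and mel frame count line up exactly.
# A minimal sketch of the two helpers assumed here; the `num_frames` name
# and the exact rounding behavior are assumptions, not confirmed by this
# section:
def num_frames(length, fsize, fshift):
    """Number of STFT frames for a signal of `length` samples (assumed)."""
    pad = fsize - fshift
    if length % fshift == 0:
        M = (length + pad * 2 - fsize) // fshift + 1
    else:
        M = (length + pad * 2 - fsize) // fshift + 2
    return M


def lws_pad_lr(x, fsize, fshift):
    """Left/right zero-padding that aligns audio with mel frames (assumed)."""
    M = num_frames(len(x), fsize, fshift)
    pad = fsize - fshift
    T = len(x) + 2 * pad
    # extra right padding so (M - 1) * fshift + fsize samples are covered
    r = (M - 1) * fshift + fsize - T
    return pad, pad + r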
from os.path import exists

import librosa
from nnmnkwii.io import hts


def _process_utterance(out_dir, index, speaker_id, wav_path, text,
                       silence_threshold, fft_size):
    sr = hparams.sample_rate

    # Load the audio to a numpy array. Resampled if needed.
    wav = audio.load_wav(wav_path)
    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available.
    # TODO: the label-based branch is intentionally disabled (`and False`).
    if exists(lab_path) and False:
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)  # label times are in 100ns units
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=20)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=20)

    # Mu-law quantize
    quantized = P.mulaw_quantize(wav)

    # Trim silences
    start, end = audio.start_and_end_indices(quantized, silence_threshold)
    quantized = quantized[start:end]
    wav = wav[start:end]

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft;
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    quantized = np.pad(quantized, (l, r), mode="constant",
                       constant_values=P.mulaw_quantize(0))
    N = mel_spectrogram.shape[0]
    assert len(quantized) >= N * audio.get_hop_size()

    # time resolution adjustment:
    # ensure length of raw audio is a multiple of hop_size so that we can use
    # transposed convolution to upsample
    quantized = quantized[:N * audio.get_hop_size()]
    assert len(quantized) % audio.get_hop_size() == 0
    timesteps = len(quantized)

    wav_id = os.path.basename(wav_path).split('.')[0]

    # Write the spectrograms to disk:
    audio_filename = '{}-audio.npy'.format(wav_id)
    mel_filename = '{}-mel.npy'.format(wav_id)
    np.save(os.path.join(out_dir, audio_filename),
            quantized.astype(np.int16), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text, speaker_id)
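# `start_at` / `end_at` used above are not defined in this section. A minimal
# sketch, assuming HTS labels index as (start_time, end_time, context) tuples
# in 100ns units and that silence segments are labeled "pau" (both
# assumptions for illustration):
def start_at(labels):
    # Start time of the first non-silence segment.
    if labels[0][-1] != "pau":
        return labels[0][0]
    for i in range(1, len(labels)):
        if labels[i][-1] != "pau":
            return labels[i][0]
    assert False, "all-silence label file"


def end_at(labels):
    # End time of the last non-silence segment.
    if labels[-1][-1] != "pau":
        return labels[-1][1]
    for i in range(len(labels) - 2, 0, -1):
        if labels[i][-1] != "pau":
            return labels[i][1]
    assert False, "all-silence label file"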
def assert_ready_for_upsampling(x, c):
    # Audio must be exactly hop_size times longer than the conditional
    # features for transposed-convolution upsampling to line up.
    assert len(x) % len(c) == 0 and len(x) // len(c) == audio.get_hop_size()
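# `thread_main` below calls an `ensure_divisible` helper that this section
# does not define. A minimal sketch, assuming it rounds `length` to a
# multiple of `divisible_by` (down when `lower` is True, otherwise up):
def ensure_divisible(length, divisible_by=256, lower=True):
    if length % divisible_by == 0:
        return length
    if lower:
        return length - length % divisible_by
    return length + (divisible_by - length % divisible_by)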
def thread_main(self, sess):
    stop = False
    while not stop:
        iterator = load_npy_data(self.metadata_filename, self.npy_dataroot,
                                 self.speaker_id)
        for wav, local_condition, global_condition in iterator:
            if self.coord.should_stop():
                stop = True
                break

            if hparams.upsample_conditional_features:
                wav = wav.reshape(-1, 1)
                assert_ready_for_upsampling(wav, local_condition)
                if self.sample_size is not None:
                    # Round the sample size down to a multiple of hop_size
                    # so audio and conditional features stay aligned.
                    sample_size = ensure_divisible(
                        self.sample_size, audio.get_hop_size(), True)
                    if wav.shape[0] > sample_size:
                        # Pick a random window of whole frames.
                        max_frames = sample_size // audio.get_hop_size()
                        s = np.random.randint(0, len(local_condition) - max_frames)
                        ts = s * audio.get_hop_size()
                        wav = wav[ts:ts + audio.get_hop_size() * max_frames, :]
                        local_condition = local_condition[s:s + max_frames, :]
                if self.gc_enable:
                    sess.run(self.enqueue, feed_dict=dict(zip(
                        self._placeholders,
                        (wav, local_condition, global_condition))))
                else:
                    sess.run(self.enqueue, feed_dict=dict(zip(
                        self._placeholders, (wav, local_condition))))
            else:
                # Bring conditional features up to per-sample time resolution
                # so both sequences can be sliced sample by sample.
                wav, local_condition = audio.adjust_time_resolution(
                    wav, local_condition)
                wav = wav.reshape(-1, 1)

                if self.sample_size is not None:
                    while wav.shape[0] > self.sample_size:
                        wav_piece = wav[:(self.receptive_field + self.sample_size), :]
                        local_condition_piece = local_condition[:(
                            self.receptive_field + self.sample_size), :]
                        # Advance by sample_size, keeping receptive_field
                        # samples of overlap between consecutive pieces.
                        wav = wav[self.sample_size:, :]
                        local_condition = local_condition[self.sample_size:, :]
                        assert len(wav_piece) == len(local_condition_piece)

                        if self.gc_enable:
                            sess.run(self.enqueue, feed_dict=dict(zip(
                                self._placeholders,
                                (wav_piece, local_condition_piece, global_condition))))
                        else:
                            sess.run(self.enqueue, feed_dict=dict(zip(
                                self._placeholders,
                                (wav_piece, local_condition_piece))))
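# A hedged sketch of how `thread_main` is typically driven: one or more
# Python threads feed the TensorFlow queue while the training loop dequeues.
# The reader construction and argument names here are assumptions for
# illustration, not confirmed by this section.
import threading

import tensorflow as tf


def start_threads(reader, sess, n_threads=1):
    threads = []
    for _ in range(n_threads):
        t = threading.Thread(target=reader.thread_main, args=(sess,))
        t.daemon = True  # terminate with the main program
        t.start()
        threads.append(t)
    return threads

# Assumed usage:
#   coord = tf.train.Coordinator()
#   reader = AudioReader(..., coord=coord)  # hypothetical reader class
#   threads = start_threads(reader, sess)
#   ... training loop running the dequeue ops ...
#   coord.request_stop()
#   coord.join(threads)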