def _adjust_time_resolution(self, batch, local_condition, max_time_steps):
    '''Adjust time resolution between audio and local condition.'''
    if local_condition:
        new_batch = []
        for b in batch:
            x, c, g, l = b
            self._assert_ready_for_upsample(x, c)
            if max_time_steps is not None:
                max_steps = _ensure_divisible(max_time_steps, audio.get_hop_size(self._hparams), True)
                if len(x) > max_time_steps:
                    # Pick a random window of condition frames and the matching audio samples
                    max_time_frames = max_steps // audio.get_hop_size(self._hparams)
                    start = np.random.randint(0, len(c) - max_time_frames)
                    time_start = start * audio.get_hop_size(self._hparams)
                    x = x[time_start:time_start + max_time_frames * audio.get_hop_size(self._hparams)]
                    c = c[start:start + max_time_frames, :]
                    self._assert_ready_for_upsample(x, c)
            new_batch.append((x, c, g, l))
        return new_batch
    else:
        new_batch = []
        for b in batch:
            x, c, g, l = b
            x = audio.trim_silence(x, self._hparams)
            if max_time_steps is not None and len(x) > max_time_steps:
                # No local condition: crop a random audio window directly
                start = np.random.randint(0, len(x) - max_time_steps)
                x = x[start:start + max_time_steps]
            new_batch.append((x, c, g, l))
        return new_batch
def _assert_ready_for_upsample(self, x, c):
    # Debug variant: print the condition length, audio length and hop size instead of asserting.
    print(len(c))
    print("\n")
    print(len(x))
    print("\n")
    print(audio.get_hop_size(self._hparams))
    return
def _assert_ready_for_upsample(self, x, c):
    # Audio length must be an exact multiple of the number of condition frames,
    # with exactly one hop of samples per frame.
    assert len(x) % len(c) == 0 and len(x) // len(c) == audio.get_hop_size(self._hparams)
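# Sketch of the `_ensure_divisible` helper referenced in _adjust_time_resolution above.
# This is an assumption about its behaviour (round `length` to a multiple of
# `divisible_by`, downwards when `lower` is True), not necessarily the repo's exact code.
def _ensure_divisible(length, divisible_by=256, lower=True):
    if length % divisible_by == 0:
        return length
    if lower:
        return length - length % divisible_by
    return length + (divisible_by - length % divisible_by)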
def _process_utterance(out_dir, index, wav_path, text, hparams):
    wav = _trim_wav(audio.load_wav(wav_path))

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    name = os.path.splitext(os.path.basename(wav_path))[0]
    speaker_id = _speaker_re.match(name).group(1)  # extracted for potential global conditioning; not returned below
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
    l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

    # Zero pad for quantized signal
    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    # print(len(out), mel_frames, audio.get_hop_size(hparams))
    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    # time_steps = len(out)

    # Write the spectrogram and audio to disk
    # Note: the quantized audio save is commented out here; the linear spectrogram is
    # written under the audio filename instead.
    audio_filename = 'vctk-audio-{:05d}.npy'.format(index)
    mel_filename = 'vctk-mel-{:05d}.npy'.format(index)
    # np.save(os.path.join(out_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, audio_filename), linear_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    return (audio_filename, mel_filename, mel_frames, text)
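# Assumed definition of the `_speaker_re` pattern used above. VCTK basenames look like
# 'p225_001', so the speaker id would be the digits after the leading 'p'. This is an
# illustrative guess, not the repo's exact regex.
import re
_speaker_re = re.compile(r'p(\d+)_.*')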
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text):
    """
    Preprocesses a single utterance wav/text pair.

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
    l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size())

    # Zero pad for quantized signal
    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'speech-audio-{:05d}.npy'.format(index)
    mel_filename = 'speech-mel-{:05d}.npy'.format(index)
    linear_filename = 'speech-linear-{:05d}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
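# Illustrative driver-side helper (an assumption, not part of this file): the tuples
# returned by _process_utterance are typically gathered by the preprocessing script and
# written out as pipe-separated lines in train.txt.
def write_metadata(metadata, out_dir):
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        for m in metadata:
            if m is not None:
                f.write('|'.join([str(x) for x in m]) + '\n')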
def initialize(self, y, c, g, input_lengths, x=None, synthesis_length=None):
    '''Initialize the WaveNet graph for train, eval and test cases.'''
    hparams = self._hparams
    self.is_training = x is not None
    self.is_evaluating = not self.is_training and y is not None
    #Set all convolutions to corresponding mode
    self.set_mode(self.is_training)

    log('Initializing Wavenet model. Dimensions (? = dynamic shape): ')
    log('  Train mode: {}'.format(self.is_training))
    log('  Eval mode: {}'.format(self.is_evaluating))
    log('  Synthesis mode: {}'.format(not (self.is_training or self.is_evaluating)))

    with tf.variable_scope('inference') as scope:
        #Training
        if self.is_training:
            batch_size = tf.shape(x)[0]
            #[batch_size, time_length, 1]
            self.mask = self.get_mask(input_lengths, maxlen=tf.shape(x)[-1])  #To be used in loss computation
            #[batch_size, channels, time_length]
            y_hat = self.step(x, c, g, softmax=False)  #softmax is automatically computed inside softmax_cross_entropy if needed

            if is_mulaw_quantize(hparams.input_type):
                #[batch_size, time_length, channels]
                self.y_hat_q = tf.transpose(y_hat, [0, 2, 1])

            self.y_hat = y_hat
            self.y = y
            self.input_lengths = input_lengths

            #Graph extension for log saving
            #[batch_size, time_length, 1]
            shape_control = (batch_size, tf.shape(x)[-1], 1)
            with tf.control_dependencies([tf.assert_equal(tf.shape(y), shape_control)]):
                y_log = tf.squeeze(y, [-1])
                if is_mulaw_quantize(hparams.input_type):
                    self.y = y_log

            y_hat_log = tf.cond(tf.equal(tf.rank(y_hat), 4),
                lambda: tf.squeeze(y_hat, [-1]),
                lambda: y_hat)
            y_hat_log = tf.reshape(y_hat_log, [batch_size, hparams.out_channels, -1])

            if is_mulaw_quantize(hparams.input_type):
                #[batch_size, time_length]
                y_hat_log = tf.reduce_max(tf.nn.softmax(y_hat_log, axis=1), 1)

                y_hat_log = util.inv_mulaw_quantize(y_hat_log, hparams.quantize_channels)
                y_log = util.inv_mulaw_quantize(y_log, hparams.quantize_channels)
            else:
                #[batch_size, time_length]
                y_hat_log = sample_from_discretized_mix_logistic(
                    y_hat_log, log_scale_min=hparams.log_scale_min)

                if is_mulaw(hparams.input_type):
                    y_hat_log = util.inv_mulaw(y_hat_log, hparams.quantize_channels)
                    y_log = util.inv_mulaw(y_log, hparams.quantize_channels)

            self.y_hat_log = y_hat_log
            self.y_log = y_log

            log('  inputs: {}'.format(x.shape))
            if self.local_conditioning_enabled():
                log('  local_condition: {}'.format(c.shape))
            if self.has_speaker_embedding():
                log('  global_condition: {}'.format(g.shape))
            log('  targets: {}'.format(y_log.shape))
            log('  outputs: {}'.format(y_hat_log.shape))

        #evaluating
        elif self.is_evaluating:
            #[time_length, ]
            idx = 0
            length = input_lengths[idx]
            y_target = tf.reshape(y[idx], [-1])[:length]

            if c is not None:
                c = tf.expand_dims(c[idx, :, :length], axis=0)
                with tf.control_dependencies([tf.assert_equal(tf.rank(c), 3)]):
                    c = tf.identity(c, name='eval_assert_c_rank_op')
            if g is not None:
                g = g[idx]

            #Start silence frame
            if is_mulaw_quantize(hparams.input_type):
                initial_value = mulaw_quantize(0, hparams.quantize_channels)
            elif is_mulaw(hparams.input_type):
                initial_value = mulaw(0.0, hparams.quantize_channels)
            else:
                initial_value = 0.0

            #[channels, ]
            if is_mulaw_quantize(hparams.input_type):
                initial_input = tf.one_hot(indices=initial_value, depth=hparams.quantize_channels, dtype=tf.float32)
                initial_input = tf.reshape(initial_input, [1, 1, hparams.quantize_channels])
            else:
                initial_input = tf.ones([1, 1, 1], tf.float32) * initial_value

            #Fast eval
            y_hat = self.incremental(initial_input, c=c, g=g, time_length=length,
                softmax=True, quantize=True, log_scale_min=hparams.log_scale_min)

            #Save targets and length for eval loss computation
            if is_mulaw_quantize(hparams.input_type):
                self.y_eval = tf.reshape(y[idx], [1, -1])[:, :length]
            else:
                self.y_eval = tf.expand_dims(y[idx], axis=0)[:, :length, :]
            self.eval_length = length

            if is_mulaw_quantize(hparams.input_type):
                y_hat = tf.reshape(tf.reduce_max(y_hat, axis=1), [-1])
                y_hat = inv_mulaw_quantize(y_hat, hparams.quantize_channels)
                y_target = inv_mulaw_quantize(y_target, hparams.quantize_channels)
            elif is_mulaw(hparams.input_type):
                y_hat = inv_mulaw(tf.reshape(y_hat, [-1]), hparams.quantize_channels)
                y_target = inv_mulaw(y_target, hparams.quantize_channels)
            else:
                y_hat = tf.reshape(y_hat, [-1])

            self.y_hat = y_hat
            self.y_target = y_target

            if self.local_conditioning_enabled():
                log('  local_condition: {}'.format(c.shape))
            if self.has_speaker_embedding():
                log('  global_condition: {}'.format(g.shape))
            log('  targets: {}'.format(y_target.shape))
            log('  outputs: {}'.format(y_hat.shape))

        #synthesizing
        else:
            if c is None:
                assert synthesis_length is not None
            else:
                #[batch_size, local_condition_time, local_condition_dimension(num_mels)]
                message = ('Expected 3 dimension shape [batch_size(1), time_length, {}] for local condition features '
                    'but found {}'.format(hparams.cin_channels, c.shape))
                with tf.control_dependencies([tf.assert_equal(tf.rank(c), 3, message=message)]):
                    c = tf.identity(c, name='synthesis_assert_c_rank_op')

                Tc = tf.shape(c)[1]
                upsample_factor = audio.get_hop_size(self._hparams)

                #Overwrite length with respect to local condition features
                synthesis_length = Tc * upsample_factor

                #[batch_size, local_condition_dimension, local_condition_time]
                #time_length will be corrected using the upsample network
                c = tf.transpose(c, [0, 2, 1])

            #Start silence frame
            if is_mulaw_quantize(hparams.input_type):
                initial_value = mulaw_quantize(0, hparams.quantize_channels)
            elif is_mulaw(hparams.input_type):
                initial_value = mulaw(0.0, hparams.quantize_channels)
            else:
                initial_value = 0.0

            if is_mulaw_quantize(hparams.input_type):
                assert initial_value >= 0 and initial_value < hparams.quantize_channels
                initial_input = tf.one_hot(indices=initial_value, depth=hparams.quantize_channels, dtype=tf.float32)
                initial_input = tf.reshape(initial_input, [1, 1, hparams.quantize_channels])
            else:
                initial_input = tf.ones([1, 1, 1], tf.float32) * initial_value

            y_hat = self.incremental(initial_input, c=c, g=g, time_length=synthesis_length,
                softmax=True, quantize=True, log_scale_min=hparams.log_scale_min)

            if is_mulaw_quantize(hparams.input_type):
                y_hat = tf.reshape(tf.reduce_max(y_hat, axis=1), [-1])
                y_hat = util.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
            elif is_mulaw(hparams.input_type):
                y_hat = util.inv_mulaw(tf.reshape(y_hat, [-1]), hparams.quantize_channels)
            else:
                y_hat = tf.reshape(y_hat, [-1])

            self.y_hat = y_hat

            if self.local_conditioning_enabled():
                log('  local_condition: {}'.format(c.shape))
            if self.has_speaker_embedding():
                log('  global_condition: {}'.format(g.shape))
            log('  outputs: {}'.format(y_hat.shape))

    self.variables = tf.trainable_variables()
    self.ema = tf.train.ExponentialMovingAverage(decay=hparams.wavenet_ema_decay)
def synthesize(self, mel_spectrograms, speaker_ids, basenames, out_dir, log_dir):
    hparams = self._hparams
    local_cond, global_cond = self._check_conditions()

    #Switch mels in case of debug
    if self.synth_debug:
        assert len(hparams.wavenet_debug_mels) == len(hparams.wavenet_debug_wavs)
        mel_spectrograms = [np.load(mel_file) for mel_file in hparams.wavenet_debug_mels]

    #Get true length of audio to be synthesized: audio_len = mel_len * hop_size
    audio_lengths = [len(x) * get_hop_size(self._hparams) for x in mel_spectrograms]

    #Prepare local condition batch
    maxlen = max([len(x) for x in mel_spectrograms])
    #[-max, max] or [0, max]
    T2_output_range = (-self._hparams.max_abs_value, self._hparams.max_abs_value) if self._hparams.symmetric_mels else (0, self._hparams.max_abs_value)

    if self._hparams.clip_for_wavenet:
        mel_spectrograms = [np.clip(x, T2_output_range[0], T2_output_range[1]) for x in mel_spectrograms]

    c_batch = np.stack([_pad_inputs(x, maxlen, _pad=T2_output_range[0]) for x in mel_spectrograms]).astype(np.float32)

    if self._hparams.normalize_for_wavenet:
        #rerange to [0, 1]
        c_batch = _interp(c_batch, T2_output_range).astype(np.float32)

    g = None if speaker_ids is None else np.asarray(speaker_ids, dtype=np.int32).reshape(len(c_batch), 1)

    feed_dict = {}

    if local_cond:
        feed_dict[self.local_conditions] = c_batch
    else:
        feed_dict[self.synthesis_length] = 100

    if global_cond:
        feed_dict[self.global_conditions] = g

    if self.synth_debug:
        debug_wavs = hparams.wavenet_debug_wavs
        assert len(debug_wavs) % hparams.wavenet_num_gpus == 0
        test_wavs = [np.load(debug_wav).reshape(-1, 1) for debug_wav in debug_wavs]

        #pad wavs to same length
        max_test_len = max([len(x) for x in test_wavs])
        test_wavs = np.stack([_pad_inputs(x, max_test_len) for x in test_wavs]).astype(np.float32)

        assert len(test_wavs) == len(debug_wavs)
        feed_dict[self.targets] = test_wavs.reshape(len(test_wavs), max_test_len, 1)
        feed_dict[self.input_lengths] = np.asarray([test_wavs.shape[1]])

    #Generate wavs and clip extra padding to select real speech parts
    generated_wavs, upsampled_features = self.session.run(
        [self.model.tower_y_hat, self.model.tower_synth_upsampled_local_features],
        feed_dict=feed_dict)

    #Linearize outputs (n_gpus -> 1D)
    generated_wavs = [wav for gpu_wavs in generated_wavs for wav in gpu_wavs]
    upsampled_features = [feat for gpu_feats in upsampled_features for feat in gpu_feats]

    generated_wavs = [generated_wav[:length] for generated_wav, length in zip(generated_wavs, audio_lengths)]
    upsampled_features = [upsampled_feature[:, :length] for upsampled_feature, length in zip(upsampled_features, audio_lengths)]

    audio_filenames = []
    for i, (generated_wav, input_mel, upsampled_feature) in enumerate(zip(generated_wavs, mel_spectrograms, upsampled_features)):
        #Save wav to disk
        audio_filename = os.path.join(out_dir, 'wavenet-audio-{}.wav'.format(basenames[i]))
        save_wavenet_wav(generated_wav, audio_filename, sr=hparams.sample_rate,
            inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis)
        audio_filenames.append(audio_filename)

        #Compare the generated wav's mel with the original input mel to evaluate WaveNet audio
        #reconstruction performance. Both mels should match on low frequency information; the
        #WaveNet mel should contain more high frequency detail than the Tacotron mels.
        generated_mel = melspectrogram(generated_wav, hparams).T
        util.plot_spectrogram(
            generated_mel,
            os.path.join(log_dir, 'wavenet-mel-spectrogram-{}.png'.format(basenames[i])),
            title='Local Condition vs Reconstructed Audio Mel-Spectrogram analysis',
            target_spectrogram=input_mel)

        #Save upsampled features to visualize checkerboard artifacts.
        util.plot_spectrogram(
            upsampled_feature.T,
            os.path.join(log_dir, 'wavenet-upsampled_features-{}.png'.format(basenames[i])),
            title='Upsampled Local Condition features',
            auto_aspect=True)

        #Save waveplot to disk
        if log_dir is not None:
            plot_filename = os.path.join(log_dir, 'wavenet-waveplot-{}.png'.format(basenames[i]))
            util.waveplot(plot_filename, generated_wav, None, hparams, title='WaveNet generated Waveform.')

    return audio_filenames
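# Sketches of the `_pad_inputs` and `_interp` helpers referenced in synthesize() above.
# They describe the assumed behaviour (pad the time axis to a fixed length, then rescale
# mels into [0, 1]) rather than the repo's exact implementation.
def _pad_inputs(x, maxlen, _pad=0):
    # Pad a [time, channels] feature matrix along the time axis up to maxlen frames.
    return np.pad(x, [(0, maxlen - len(x)), (0, 0)], mode='constant', constant_values=_pad)

def _interp(feats, in_range):
    # Linearly map features from [in_range[0], in_range[1]] to [0, 1].
    return (feats - in_range[0]) / (in_range[1] - in_range[0])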
def _process_utterance(pml_dir, wav_dir, index, wav_path, pml_path, hparams):
    """
    Preprocesses a single utterance wav/PML feature pair.

    This writes the PML vocoder features and the preprocessed audio to disk and
    returns a tuple to write to the train.txt file.

    Args:
        - pml_dir: the directory to write the PML vocoder features into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the filenames
        - wav_path: path to the audio file containing the speech input
        - pml_path: path to the cmp file containing the PML vocoder features
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, pml_path, pml_filename, speaker_id, time_steps, pml_frames)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Assert all audio is in [-1, 1]
    if (wav > 1.).any() or (wav < -1.).any():
        raise RuntimeError('wav has invalid value: {}'.format(wav_path))

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)
        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Get the PML features from the cmp file
    pml_cmp = np.fromfile(pml_path, dtype=np.float32)
    pml_features = pml_cmp.reshape((-1, hparams.pml_dimension))
    pml_frames = pml_features.shape[0]

    if pml_frames > hparams.max_pml_frames and hparams.clip_pmls_length:
        return None

    # Find parameters
    n_fft = (hparams.num_freq - 1) * 2

    if hparams.use_lws:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        l, r = audio.pad_lr(wav, n_fft, audio.get_hop_size(hparams))

        # Zero pad audio signal
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        l_pad, r_pad = audio.librosa_pad_lr(wav, n_fft, audio.get_hop_size(hparams))

        # Reflect pad audio signal (just like it's done in librosa to avoid frame inconsistency)
        out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values)

    # print(len(out), pml_frames, audio.get_hop_size(hparams), pml_frames * audio.get_hop_size(hparams))
    assert len(out) >= pml_frames * audio.get_hop_size(hparams)

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:pml_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the PML features and audio to disk
    audio_filename = os.path.join(wav_dir, 'audio-{}.npy'.format(index))
    pml_filename = os.path.join(pml_dir, 'pml-{}.npy'.format(index))
    np.save(audio_filename, out.astype(out_dtype), allow_pickle=False)
    np.save(pml_filename, pml_features, allow_pickle=False)

    # global condition features
    if hparams.gin_channels > 0:
        raise RuntimeError(
            'When activating global conditions, please set your speaker_id rules in line 129 of '
            'datasets/wavenet_preprocessor.py to use them during training')
    else:
        speaker_id = '<no_g>'

    # Return a tuple describing this training example
    return audio_filename, pml_path, pml_filename, speaker_id, time_steps, pml_frames