Example #1
    def _adjust_time_resolution(self, batch, local_condition, max_time_steps):
        '''Adjust time resolution between audio and local condition
        '''
        if local_condition:
            new_batch = []
            for b in batch:
                x, c, g, l = b
                self._assert_ready_for_upsample(x, c)
                if max_time_steps is not None:
                    # Cap the crop length at max_time_steps, rounded down to a
                    # multiple of the hop size so audio and frames stay aligned.
                    max_steps = _ensure_divisible(
                        max_time_steps, audio.get_hop_size(self._hparams),
                        True)
                    if len(x) > max_time_steps:
                        max_time_frames = max_steps // audio.get_hop_size(
                            self._hparams)
                        # Pick a random frame-aligned crop shared by the audio
                        # and its local conditioning features.
                        start = np.random.randint(0, len(c) - max_time_frames)
                        time_start = start * audio.get_hop_size(self._hparams)
                        x = x[time_start:time_start + max_time_frames *
                              audio.get_hop_size(self._hparams)]
                        c = c[start:start + max_time_frames, :]
                        self._assert_ready_for_upsample(x, c)

                new_batch.append((x, c, g, l))
            return new_batch

        else:
            new_batch = []
            for b in batch:
                x, c, g, l = b
                x = audio.trim_silence(x, self._hparams)
                if max_time_steps is not None and len(x) > max_time_steps:
                    # No local condition here, so crop against the audio length.
                    start = np.random.randint(0, len(x) - max_time_steps)
                    x = x[start:start + max_time_steps]
                new_batch.append((x, c, g, l))
            return new_batch
    def _assert_ready_for_upsample(self, x, c):
        # Debug variant: print the lengths instead of asserting
        # (compare with the assert-based version in Example #3).
        print(len(c))
        print("\n")
        print(len(x))
        print("\n")
        print(audio.get_hop_size(self._hparams))
        return
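
The crop above relies on an _ensure_divisible helper that is not shown in this listing. A minimal sketch of what it might look like, assuming the third argument selects rounding down (True) or up (False) to a multiple of the divisor:

def _ensure_divisible(length, divisible_by=256, lower=True):
    # Hypothetical helper: round length to a multiple of divisible_by.
    if length % divisible_by == 0:
        return length
    if lower:
        return length - length % divisible_by
    return length + (divisible_by - length % divisible_by)
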
Example #3
    def _assert_ready_for_upsample(self, x, c):
        assert len(x) % len(c) == 0 and len(x) // len(c) == audio.get_hop_size(
            self._hparams)
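
The assert encodes the invariant that the raw audio holds exactly hop_size samples per conditioning frame. A small self-contained check of the same relationship (hop size, frame count and mel dimension below are illustrative values, not the repo's settings):

import numpy as np

hop_size = 256                         # illustrative hop size
num_frames = 40                        # illustrative number of conditioning frames
x = np.zeros(num_frames * hop_size)    # raw audio samples
c = np.zeros((num_frames, 80))         # e.g. 80-band mel frames

assert len(x) % len(c) == 0 and len(x) // len(c) == hop_size
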
Example #4
def _process_utterance(out_dir, index, wav_path, text, hparams):
    wav = _trim_wav(audio.load_wav(wav_path))
    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    name = os.path.splitext(os.path.basename(wav_path))[0]
    speaker_id = _speaker_re.match(name).group(1)

    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav,
                                                 hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
    l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

    # Zero pad for quantized signal
    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    # print(len(out), mel_frames, audio.get_hop_size(hparams))
    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    # time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'vctk-audio-{:05d}.npy'.format(index)
    mel_filename = 'vctk-mel-{:05d}.npy'.format(index)
    # NOTE: in this variant the raw-audio save is disabled; the linear
    # spectrogram is written under the audio filename instead.
    # np.save(os.path.join(out_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, audio_filename),
            linear_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    return (audio_filename, mel_filename, mel_frames, text)
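
mulaw and mulaw_quantize are imported from elsewhere in the codebase. A minimal NumPy sketch of standard mu-law companding and quantization, assuming mu = quantize_channels - 1 (the helpers actually used above may differ in the exact rounding):

import numpy as np

def mulaw(x, quantize_channels=256):
    # Standard mu-law companding of x in [-1, 1] to [-1, 1].
    mu = quantize_channels - 1
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)

def mulaw_quantize(x, quantize_channels=256):
    # Compand, then map [-1, 1] onto integer classes [0, quantize_channels).
    mu = quantize_channels - 1
    y = mulaw(x, quantize_channels)
    return ((y + 1) / 2 * mu + 0.5).astype(np.int64)

With this sketch, silence maps to the middle of the scale: mulaw_quantize(0.0, 256) == 128, which is exactly the constant_values used for padding the quantized signal above.
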
Example #5
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text):
    """
	Preprocesses a single utterance wav/text pair
	this writes the mel scale spectogram to disk and return a tuple to write
	to the train.txt file
	Args:
		- mel_dir: the directory to write the mel spectograms into
		- linear_dir: the directory to write the linear spectrograms into
		- wav_dir: the directory to write the preprocessed wav into
		- index: the numeric index to use in the spectogram filename
		- wav_path: path to the audio file containing the speech input
		- text: text spoken in the input audio file
		- hparams: hyper parameters
	Returns:
		- A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
	"""
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path)
    except FileNotFoundError:  #catch missing wav exception
        print(
            'file {} present in csv metadata is not present in wav folder. skipping!'
            .format(wav_path))
        return None

    #rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    #M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav)

    #Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        #[0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        #Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif is_mulaw(hparams.input_type):
        #[-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        #[-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    #Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    #sanity check
    assert linear_frames == mel_frames

    #Ensure time resolution adjustment between audio and mel-spectrogram
    fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
    l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size())

    #Zero pad for quantized signal
    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * audio.get_hop_size()

    #time resolution adjustment
    #ensure length of raw audio is multiple of hop size so that we can use
    #transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'speech-audio-{:05d}.npy'.format(index)
    mel_filename = 'speech-mel-{:05d}.npy'.format(index)
    linear_filename = 'speech-linear-{:05d}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename),
            linear_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps,
            mel_frames, text)
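
audio.pad_lr computes how much to pad the raw signal so that, after trimming, its length covers mel_frames * hop_size. A rough sketch modelled on lws-style centered framing, with illustrative helper names; the audio module's actual implementation may differ:

def _num_frames(length, fsize, fshift):
    # Number of analysis frames for a centered window (sketch).
    pad = fsize - fshift
    if length % fshift == 0:
        return (length + 2 * pad - fsize) // fshift + 1
    return (length + 2 * pad - fsize) // fshift + 2

def pad_lr(x, fsize, fshift):
    # Left/right padding so the padded signal covers every analysis frame.
    M = _num_frames(len(x), fsize, fshift)
    pad = fsize - fshift
    T = len(x) + 2 * pad
    r = (M - 1) * fshift + fsize - T
    return pad, pad + r
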
Example #6
	def initialize(self, y, c, g, input_lengths, x=None, synthesis_length=None):
		'''Initialize wavenet graph for train, eval and test cases.
		'''
		hparams = self._hparams
		self.is_training = x is not None
		self.is_evaluating = not self.is_training and y is not None
		#Set all convolutions to corresponding mode
		self.set_mode(self.is_training)

		log('Initializing Wavenet model.  Dimensions (? = dynamic shape): ')
		log('  Train mode:                {}'.format(self.is_training))
		log('  Eval mode:                 {}'.format(self.is_evaluating))
		log('  Synthesis mode:            {}'.format(not (self.is_training or self.is_evaluating)))
		with tf.variable_scope('inference') as scope:
			#Training
			if self.is_training:
				batch_size = tf.shape(x)[0]
				#[batch_size, time_length, 1]
				self.mask = self.get_mask(input_lengths, maxlen=tf.shape(x)[-1]) #To be used in loss computation
				#[batch_size, channels, time_length]
				y_hat = self.step(x, c, g, softmax=False) #softmax is automatically computed inside softmax_cross_entropy if needed

				if is_mulaw_quantize(hparams.input_type):
					#[batch_size, time_length, channels]
					self.y_hat_q = tf.transpose(y_hat, [0, 2, 1])

				self.y_hat = y_hat
				self.y = y
				self.input_lengths = input_lengths

				#Graph extension for log saving
				#[batch_size, time_length]
				shape_control = (batch_size, tf.shape(x)[-1], 1)
				with tf.control_dependencies([tf.assert_equal(tf.shape(y), shape_control)]):
					y_log = tf.squeeze(y, [-1])
					if is_mulaw_quantize(hparams.input_type):
						self.y = y_log

				y_hat_log = tf.cond(tf.equal(tf.rank(y_hat), 4),
					lambda: tf.squeeze(y_hat, [-1]),
					lambda: y_hat)
				y_hat_log = tf.reshape(y_hat_log, [batch_size, hparams.out_channels, -1])

				if is_mulaw_quantize(hparams.input_type):
					#[batch_size, time_length]
					#take the most likely class per timestep before inverting the mu-law quantization
					y_hat_log = tf.argmax(tf.nn.softmax(y_hat_log, axis=1), 1)

					y_hat_log = util.inv_mulaw_quantize(y_hat_log, hparams.quantize_channels)
					y_log = util.inv_mulaw_quantize(y_log, hparams.quantize_channels)

				else:
					#[batch_size, time_length]
					y_hat_log = sample_from_discretized_mix_logistic(
						y_hat_log, log_scale_min=hparams.log_scale_min)

					if is_mulaw(hparams.input_type):
						y_hat_log = util.inv_mulaw(y_hat_log, hparams.quantize_channels)
						y_log = util.inv_mulaw(y_log, hparams.quantize_channels)

				self.y_hat_log = y_hat_log
				self.y_log = y_log
				
				log('  inputs:                    {}'.format(x.shape))
				if self.local_conditioning_enabled():
					log('  local_condition:           {}'.format(c.shape))
				if self.has_speaker_embedding():
					log('  global_condition:          {}'.format(g.shape))
				log('  targets:                   {}'.format(y_log.shape))
				log('  outputs:                   {}'.format(y_hat_log.shape))


			#evaluating
			elif self.is_evaluating: 
				#[time_length, ]
				idx = 0
				length = input_lengths[idx]
				y_target = tf.reshape(y[idx], [-1])[:length]

				if c is not None:
					c = tf.expand_dims(c[idx, :, :length], axis=0)
					with tf.control_dependencies([tf.assert_equal(tf.rank(c), 3)]):
						c = tf.identity(c, name='eval_assert_c_rank_op')
				if g is not None:
					g = g[idx]

				#Start silence frame
				if is_mulaw_quantize(hparams.input_type):
					initial_value = mulaw_quantize(0, hparams.quantize_channels)
				elif is_mulaw(hparams.input_type):
					initial_value = mulaw(0.0, hparams.quantize_channels)
				else:
					initial_value = 0.0

				#[channels, ]
				if is_mulaw_quantize(hparams.input_type):
					initial_input = tf.one_hot(indices=initial_value, depth=hparams.quantize_channels, dtype=tf.float32)
					initial_input = tf.reshape(initial_input, [1, 1, hparams.quantize_channels])
				else:
					initial_input = tf.ones([1, 1, 1], tf.float32) * initial_value

				#Fast eval
				y_hat = self.incremental(initial_input, c=c, g=g, time_length=length,
					softmax=True, quantize=True, log_scale_min=hparams.log_scale_min)

				#Save targets and length for eval loss computation
				if is_mulaw_quantize(hparams.input_type):
					self.y_eval = tf.reshape(y[idx], [1, -1])[:, :length]
				else:
					self.y_eval = tf.expand_dims(y[idx], axis=0)[:, :length, :]
				self.eval_length = length

				if is_mulaw_quantize(hparams.input_type):
					y_hat = tf.reshape(tf.argmax(y_hat, axis=1), [-1])
					y_hat = inv_mulaw_quantize(y_hat, hparams.quantize_channels)
					y_target = inv_mulaw_quantize(y_target, hparams.quantize_channels)
				elif is_mulaw(hparams.input_type):
					y_hat = inv_mulaw(tf.reshape(y_hat, [-1]), hparams.quantize_channels)
					y_target = inv_mulaw(y_target, hparams.quantize_channels)
				else:
					y_hat = tf.reshape(y_hat, [-1])

				self.y_hat = y_hat
				self.y_target = y_target

				if self.local_conditioning_enabled():
					log('  local_condition:           {}'.format(c.shape))
				if self.has_speaker_embedding():
					log('  global_condition:          {}'.format(g.shape))
				log('  targets:                   {}'.format(y_target.shape))
				log('  outputs:                   {}'.format(y_hat.shape))

			#synthesizing
			else:
				if c is None:
					assert synthesis_length is not None
				else:
					#[batch_size, local_condition_time, local_condition_dimension(num_mels)]
					message = ('Expected 3 dimension shape [batch_size(1), time_length, {}] for local condition features but found {}'.format(
							hparams.cin_channels, c.shape))
					with tf.control_dependencies([tf.assert_equal(tf.rank(c), 3, message=message)]):
						c = tf.identity(c, name='synthesis_assert_c_rank_op')

					Tc = tf.shape(c)[1]
					upsample_factor = audio.get_hop_size(self._hparams)

					#Overwrite length with respect to local condition features
					synthesis_length = Tc * upsample_factor

					#[batch_size, local_condition_dimension, local_condition_time]
					#time_length will be corrected using the upsample network
					c = tf.transpose(c, [0, 2, 1])

				#Start silence frame
				if is_mulaw_quantize(hparams.input_type):
					initial_value = mulaw_quantize(0, hparams.quantize_channels)
				elif is_mulaw(hparams.input_type):
					initial_value = mulaw(0.0, hparams.quantize_channels)
				else:
					initial_value = 0.0

				if is_mulaw_quantize(hparams.input_type):
					assert initial_value >= 0 and initial_value < hparams.quantize_channels
					initial_input = tf.one_hot(indices=initial_value, depth=hparams.quantize_channels, dtype=tf.float32)
					initial_input = tf.reshape(initial_input, [1, 1, hparams.quantize_channels])
				else:
					initial_input = tf.ones([1, 1, 1], tf.float32) * initial_value

				y_hat = self.incremental(initial_input, c=c, g=g, time_length=synthesis_length,
					softmax=True, quantize=True, log_scale_min=hparams.log_scale_min)

				if is_mulaw_quantize(hparams.input_type):
					y_hat = tf.reshape(tf.argmax(y_hat, axis=1), [-1])
					y_hat = util.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
				elif is_mulaw(hparams.input_type):
					y_hat = util.inv_mulaw(tf.reshape(y_hat, [-1]), hparams.quantize_channels)
				else:
					y_hat = tf.reshape(y_hat, [-1])

				self.y_hat = y_hat

				if self.local_conditioning_enabled():
					log('  local_condition:            {}'.format(c.shape))
				if self.has_speaker_embedding():
					log('  global_condition:           {}'.format(g.shape))
				log('  outputs:                    {}'.format(y_hat.shape))

		self.variables = tf.trainable_variables()
		self.ema = tf.train.ExponentialMovingAverage(decay=hparams.wavenet_ema_decay)
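
initialize() only constructs the ExponentialMovingAverage object; applying it after each optimizer step and reading the shadow variables happens elsewhere in the model. A minimal TF1-style sketch of the usual wiring, with an illustrative variable and train op standing in for the real model:

import tensorflow as tf

# Illustrative stand-in for a trainable variable updated by the optimizer.
w = tf.Variable(1.0, name='w')
train_op = tf.assign_add(w, 0.1)

ema = tf.train.ExponentialMovingAverage(decay=0.9999)

# Creating the apply op under a control dependency makes each step also
# refresh the shadow (averaged) copy of the variable.
with tf.control_dependencies([train_op]):
    train_and_ema_op = ema.apply([w])

shadow_w = ema.average(w)  # tensor holding the moving-averaged value

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(5):
        sess.run(train_and_ema_op)
    print(sess.run([w, shadow_w]))
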
Example #7
    def synthesize(self, mel_spectrograms, speaker_ids, basenames, out_dir,
                   log_dir):
        hparams = self._hparams
        local_cond, global_cond = self._check_conditions()

        #Switch mels in case of debug
        if self.synth_debug:
            assert len(hparams.wavenet_debug_mels) == len(
                hparams.wavenet_debug_wavs)
            mel_spectrograms = [
                np.load(mel_file) for mel_file in hparams.wavenet_debug_mels
            ]

        #Get True length of audio to be synthesized: audio_len = mel_len * hop_size
        audio_lengths = [
            len(x) * get_hop_size(self._hparams) for x in mel_spectrograms
        ]

        #Prepare local condition batch
        maxlen = max([len(x) for x in mel_spectrograms])
        #[-max, max] or [0,max]
        T2_output_range = (
            -self._hparams.max_abs_value,
            self._hparams.max_abs_value) if self._hparams.symmetric_mels else (
                0, self._hparams.max_abs_value)

        if self._hparams.clip_for_wavenet:
            mel_spectrograms = [
                np.clip(x, T2_output_range[0], T2_output_range[1])
                for x in mel_spectrograms
            ]

        c_batch = np.stack([
            _pad_inputs(x, maxlen, _pad=T2_output_range[0])
            for x in mel_spectrograms
        ]).astype(np.float32)

        if self._hparams.normalize_for_wavenet:
            #rerange to [0, 1]
            c_batch = _interp(c_batch, T2_output_range).astype(np.float32)

        g = None if speaker_ids is None else np.asarray(
            speaker_ids, dtype=np.int32).reshape(len(c_batch), 1)
        feed_dict = {}

        if local_cond:
            feed_dict[self.local_conditions] = c_batch
        else:
            feed_dict[self.synthesis_length] = 100

        if global_cond:
            feed_dict[self.global_conditions] = g

        if self.synth_debug:
            debug_wavs = hparams.wavenet_debug_wavs
            assert len(debug_wavs) % hparams.wavenet_num_gpus == 0
            test_wavs = [
                np.load(debug_wav).reshape(-1, 1) for debug_wav in debug_wavs
            ]

            #pad wavs to same length
            max_test_len = max([len(x) for x in test_wavs])
            test_wavs = np.stack([
                _pad_inputs(x, max_test_len) for x in test_wavs
            ]).astype(np.float32)

            assert len(test_wavs) == len(debug_wavs)
            feed_dict[self.targets] = test_wavs.reshape(
                len(test_wavs), max_test_len, 1)
            feed_dict[self.input_lengths] = np.asarray([test_wavs.shape[1]])

        #Generate wavs and clip extra padding to select Real speech parts
        generated_wavs, upsampled_features = self.session.run(
            [
                self.model.tower_y_hat,
                self.model.tower_synth_upsampled_local_features
            ],
            feed_dict=feed_dict)

        #Linearize outputs (n_gpus -> 1D)
        generated_wavs = [
            wav for gpu_wavs in generated_wavs for wav in gpu_wavs
        ]
        upsampled_features = [
            feat for gpu_feats in upsampled_features for feat in gpu_feats
        ]

        generated_wavs = [
            generated_wav[:length]
            for generated_wav, length in zip(generated_wavs, audio_lengths)
        ]
        upsampled_features = [
            upsampled_feature[:, :length] for upsampled_feature, length in zip(
                upsampled_features, audio_lengths)
        ]

        audio_filenames = []
        for i, (generated_wav, input_mel, upsampled_feature) in enumerate(
                zip(generated_wavs, mel_spectrograms, upsampled_features)):
            #Save wav to disk
            audio_filename = os.path.join(
                out_dir, 'wavenet-audio-{}.wav'.format(basenames[i]))
            save_wavenet_wav(generated_wav,
                             audio_filename,
                             sr=hparams.sample_rate,
                             inv_preemphasize=hparams.preemphasize,
                             k=hparams.preemphasis)
            audio_filenames.append(audio_filename)

            #Compare generated wav mel with original input mel to evaluate wavenet audio reconstruction performance
            #Both mels should match on low frequency information, wavenet mel should contain more high frequency detail when compared to Tacotron mels.
            generated_mel = melspectrogram(generated_wav, hparams).T
            util.plot_spectrogram(
                generated_mel,
                os.path.join(
                    log_dir,
                    'wavenet-mel-spectrogram-{}.png'.format(basenames[i])),
                title=
                'Local Condition vs Reconstructed Audio Mel-Spectrogram analysis',
                target_spectrogram=input_mel)
            #Save upsampled features to visualize checkerboard artifacts.
            util.plot_spectrogram(
                upsampled_feature.T,
                os.path.join(
                    log_dir,
                    'wavenet-upsampled_features-{}.png'.format(basenames[i])),
                title='Upsampled Local Condition features',
                auto_aspect=True)

            #Save waveplot to disk
            if log_dir is not None:
                plot_filename = os.path.join(
                    log_dir, 'wavenet-waveplot-{}.png'.format(basenames[i]))
                util.waveplot(plot_filename,
                              generated_wav,
                              None,
                              hparams,
                              title='WaveNet generated Waveform.')

        return audio_filenames
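
_pad_inputs and _interp are small helpers defined elsewhere in the synthesizer module. A plausible sketch of both, assuming _pad_inputs pads the mel time axis up to maxlen frames and _interp linearly rescales features from T2_output_range to [0, 1]:

import numpy as np

def _pad_inputs(x, maxlen, _pad=0):
    # Pad the time axis of a [frames, num_mels] array up to maxlen frames.
    return np.pad(x, [(0, maxlen - len(x)), (0, 0)],
                  mode='constant', constant_values=_pad)

def _interp(feats, in_range):
    # Linearly rescale features from in_range (e.g. [-4, 4]) to [0, 1].
    return (feats - in_range[0]) / (in_range[1] - in_range[0])
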
Example #8
def _process_utterance(pml_dir, wav_dir, index, wav_path, pml_path, hparams):
    """
    Preprocesses a single utterance wav/text pair

    this writes the mel scale spectogram to disk and return a tuple to write
    to the train.txt file

    Args:
        - mel_dir: the directory to write the mel spectograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - pml_path: path to the cmp file containing the pml vocoder features
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path)
    except FileNotFoundError:  # catch missing wav exception
        print(
            'file {} present in csv metadata is not present in wav folder. skipping!'
            .format(wav_path))
        return None

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

        # Assert all audio is in [-1, 1]
        if (wav > 1.).any() or (wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Get the PML features from the cmp file
    pml_cmp = np.fromfile(pml_path, dtype=np.float32)
    pml_features = pml_cmp.reshape((-1, hparams.pml_dimension))
    pml_frames = pml_features.shape[0]

    if pml_frames > hparams.max_pml_frames and hparams.clip_pmls_length:
        return None

    # Find parameters
    n_fft = (hparams.num_freq - 1) * 2

    if hparams.use_lws:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        l, r = audio.pad_lr(wav, n_fft, audio.get_hop_size(hparams))

        # Zero pad audio signal
        out = np.pad(out, (l, r),
                     mode='constant',
                     constant_values=constant_values)
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        l_pad, r_pad = audio.librosa_pad_lr(wav, n_fft,
                                            audio.get_hop_size(hparams))

        # Constant-pad the audio signal with the librosa-style left/right amounts to avoid frame inconsistency
        out = np.pad(out, (l_pad, r_pad),
                     mode='constant',
                     constant_values=constant_values)

    # print(len(out), pml_frames, audio.get_hop_size(hparams), pml_frames * audio.get_hop_size(hparams))
    assert len(out) >= pml_frames * audio.get_hop_size(hparams)

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:pml_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = os.path.join(wav_dir, 'audio-{}.npy'.format(index))
    pml_filename = os.path.join(pml_dir, 'pml-{}.npy'.format(index))
    np.save(audio_filename, out.astype(out_dtype), allow_pickle=False)
    np.save(pml_filename, pml_features, allow_pickle=False)

    # global condition features
    if hparams.gin_channels > 0:
        raise RuntimeError(
            'When activating global conditions, please set your speaker_id rules in line 129 of '
            'datasets/wavenet_preprocessor.py to use them during training')
    else:
        speaker_id = '<no_g>'

    # Return a tuple describing this training example
    return audio_filename, pml_path, pml_filename, speaker_id, time_steps, pml_frames
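
A small, hypothetical consumer of the files written above, checking the hop-size alignment that the preprocessing guarantees. The paths and hop size are placeholders, not values from the repo:

import numpy as np

hop_size = 275  # placeholder: must match audio.get_hop_size(hparams)

audio_samples = np.load('training_data/audio/audio-p225_001.npy')
pml_features = np.load('training_data/pml/pml-p225_001.npy')

# The preprocessing trims the audio to exactly hop_size samples per PML frame,
# which is what allows transposed-convolution upsampling of the conditioning.
assert len(audio_samples) == pml_features.shape[0] * hop_size
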