Example #1
    def _adjust_time_resolution(self, batch, local_condition, max_time_steps):
        '''Adjust time resolution between audio and local condition.'''
        if local_condition:
            new_batch = []
            for b in batch:
                x, c, g, l = b
                self._assert_ready_for_upsample(x, c)
                if max_time_steps is not None:
                    max_steps = _ensure_divisible(
                        max_time_steps, audio.get_hop_size(self._hparams),
                        True)
                    if len(x) > max_time_steps:
                        max_time_frames = max_steps // audio.get_hop_size(
                            self._hparams)
                        start = np.random.randint(0, len(c) - max_time_frames)
                        time_start = start * audio.get_hop_size(self._hparams)
                        x = x[time_start:time_start + max_time_frames *
                              audio.get_hop_size(self._hparams)]
                        c = c[start:start + max_time_frames, :]
                        self._assert_ready_for_upsample(x, c)

                new_batch.append((x, c, g, l))
            return new_batch

        else:
            new_batch = []
            for b in batch:
                x, c, g, l = b
                x = audio.trim_silence(x, self._hparams)  # use the instance hparams, as in the branch above
                if max_time_steps is not None and len(x) > max_time_steps:
                    start = np.random.randint(0, len(x) - max_time_steps)  # sample within the audio itself; c may be None in this branch
                    x = x[start:start + max_time_steps]
                new_batch.append((x, c, g, l))
            return new_batch
Example #2

def ensure_divisible_mel(length, divisible_by=256, lower=True):
    if length % divisible_by == 0:
        max_steps = length

        return max_steps // audio.get_hop_size()
    if lower:
        max_steps = length - length % divisible_by
    else:
        max_steps = length + (divisible_by - length % divisible_by)

    return max_steps // audio.get_hop_size()
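Several examples on this page call an `ensure_divisible` / `_ensure_divisible` helper that is not shown. A minimal sketch of the assumed behaviour, rounding a length down (or up) to the nearest multiple of `divisible_by`:

def ensure_divisible(length, divisible_by=256, lower=True):
    # Round length to a multiple of divisible_by: down if lower=True, else up.
    if length % divisible_by == 0:
        return length
    if lower:
        return length - length % divisible_by
    return length + (divisible_by - length % divisible_by)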
Example #3
    def __init__(self,
                 coord,
                 data_dirs,
                 batch_size,
                 receptive_field,
                 gc_enable=False,
                 queue_size=8):
        super(DataFeederWavenet, self).__init__()
        self.data_dirs = data_dirs
        self.coord = coord
        self.batch_size = batch_size
        self.receptive_field = receptive_field
        self.hop_size = audio.get_hop_size(hparams)
        self.sample_size = ensure_divisible(hparams.sample_size, self.hop_size,
                                            True)
        self.max_frames = self.sample_size // self.hop_size  # number of frames needed to cover sample_size samples
        self.queue_size = queue_size
        self.gc_enable = gc_enable
        self.skip_path_filter = hparams.skip_path_filter

        self.rng = np.random.RandomState(123)
        self._offset = defaultdict(lambda: 2)  # keys not yet seen default to 2

        self.data_dir_to_id = {
            data_dir: idx
            for idx, data_dir in enumerate(self.data_dirs)
        }  # data_dir <---> speaker_id mapping
        self.path_dict = get_path_dict(
            self.data_dirs,
            np.max([self.sample_size, receptive_field
                    ]))  # drop clips shorter than the receptive field and return the rest

        self._placeholders = [
            tf.placeholder(tf.float32, shape=[None, None, 1],
                           name='input_wav'),
            tf.placeholder(tf.float32,
                           shape=[None, None, hparams.num_mels],
                           name='local_condition')
        ]
        dtypes = [tf.float32, tf.float32]

        if self.gc_enable:
            self._placeholders.append(
                tf.placeholder(tf.int32, shape=[None], name='speaker_id'))
            dtypes.append(tf.int32)

        queue = tf.FIFOQueue(self.queue_size, dtypes, name='input_queue')
        self.enqueue = queue.enqueue(self._placeholders)

        if self.gc_enable:
            self.inputs_wav, self.local_condition, self.speaker_id = queue.dequeue(
            )
        else:
            self.inputs_wav, self.local_condition = queue.dequeue()

        self.inputs_wav.set_shape(self._placeholders[0].shape)
        self.local_condition.set_shape(self._placeholders[1].shape)
        if self.gc_enable:
            self.speaker_id.set_shape(self._placeholders[2].shape)
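The constructor above only builds the FIFO queue and its enqueue/dequeue ops; data is expected to be pushed through `self.enqueue` from a background thread. A hypothetical sketch of such a driver (the `load_next_example` loader and the session handling are assumptions, not part of the original class):

    def start_in_session(self, session):
        # Assumed driver (requires `import threading`): feed the placeholders and
        # run the enqueue op until the coordinator asks to stop. With gc_enable,
        # the speaker_id placeholder would have to be fed as well.
        def _enqueue_loop():
            while not self.coord.should_stop():
                wav, mel = load_next_example(self.path_dict, self.rng)  # hypothetical loader
                session.run(self.enqueue,
                            feed_dict={self._placeholders[0]: wav,
                                       self._placeholders[1]: mel})

        thread = threading.Thread(target=_enqueue_loop, daemon=True)
        thread.start()
        return thread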
Example #4
def synthesis(checkpoint_path, local_path, global_id, output_dir, hp):
    checkpoint_name = checkpoint_path.split('/')[-1]
    audio_dir = os.path.join(output_dir, checkpoint_name, 'wavs')
    plot_dir = os.path.join(output_dir, checkpoint_name, 'plots')
    os.makedirs(audio_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)

    ph = create_placeholders()

    model = create_model(ph, hp)
    # apply ema to variable
    ema = tf.train.ExponentialMovingAverage(decay=hp.ema_decay)

    local_condition = np.load(local_path)
    local_condition = local_condition.reshape([1, -1, hp.num_mels])

    if not hp.upsample_conditional_features:
        local_condition = np.repeat(local_condition,
                                    audio.get_hop_size(),
                                    axis=1)

    index = local_path.split('-')[-1].split('.')[0]

    saver = tf.train.Saver(ema.variables_to_restore())

    config = tf.ConfigProto(
        gpu_options=tf.GPUOptions(allow_growth=True),
        log_device_placement=False,
    )
    with tf.Session(config=config) as sess:
        saver.restore(sess, checkpoint_path)
        start_time = time.time()
        outputs = sess.run(model.eval_outputs,
                           feed_dict={ph['local_condition']: local_condition})
        duration = time.time() - start_time
        print(
            'Time Evaluation: Generation of {} audio samples took {:.3f} sec ({:.3f} frames/sec)'
            .format(len(outputs), duration,
                    len(outputs) / duration))

        waveform = np.reshape(outputs, [-1])

        audio_path = os.path.join(audio_dir, '{}.wav'.format(index))
        plot_path = os.path.join(plot_dir, '{}.png'.format(index))
        waveplot(plot_path, waveform, None, hp)
        librosa.output.write_wav(audio_path, waveform, sr=hp.sample_rate)
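A hypothetical call site for the function above; the checkpoint and mel paths are placeholders, and an `hparams` object with the referenced fields (ema_decay, num_mels, sample_rate, ...) is assumed to be in scope:

if __name__ == '__main__':
    synthesis(checkpoint_path='logdir-wavenet/model.ckpt-100000',
              local_path='training_data/mel-000001.npy',
              global_id=None,
              output_dir='output',
              hp=hparams)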
Example #5
    def _adjust_time_step(self, audio_data, local_feature, max_time_steps):
        """Adjust time resolution for local condition."""
        hop_size = audio.get_hop_size()
        if local_feature is not None:
            if self._hparams.upsample_conditional_features:
                self._assert_ready_for_upsample(audio_data, local_feature)
                if max_time_steps is not None:
                    max_steps = _ensure_divisible(max_time_steps, hop_size, True)
                    if len(audio_data) > max_time_steps:
                        max_time_frames = max_steps // hop_size
                        start = np.random.randint(0, len(local_feature) - max_time_frames)
                        time_start = start * hop_size
                        audio_data = audio_data[time_start:time_start + max_time_frames * hop_size]
                        local_feature = local_feature[start:start + max_time_frames, :]
                        self._assert_ready_for_upsample(audio_data, local_feature)
            else:
                audio_data, local_feature = audio.adjust_time_resolution(audio_data, local_feature)
                if max_time_steps is not None and len(audio_data) > max_time_steps:
                    s = np.random.randint(0, len(audio_data) - max_time_steps)
                    audio_data, local_feature = audio_data[s:s + max_time_steps], local_feature[s:s + max_time_steps, :]
                assert len(audio_data) == len(local_feature)
        return audio_data, local_feature
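The non-upsampling branch above relies on `audio.adjust_time_resolution`, which is not shown here. A rough sketch of the assumed behaviour: each conditioning frame is repeated so that the feature sequence has one row per audio sample, then padded/trimmed to the length of the waveform.

def adjust_time_resolution(quantized, mel):
    # Assumed helper: upsample (T_mel, D) features by repetition so they align
    # sample-by-sample with the (T,) waveform.
    upsample_factor = len(quantized) // len(mel)
    upsampled = np.repeat(mel, upsample_factor, axis=0)
    n_pad = len(quantized) - len(upsampled)
    if n_pad > 0:
        upsampled = np.pad(upsampled, [(0, n_pad), (0, 0)], mode='constant')
    return quantized, upsampled[:len(quantized)]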
Example #6
def _process_utterance(out_dir, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair

    this writes the mel scale spectogram to disk and return a tuple to write
    to the train.txt file

    Args:
        - mel_dir: the directory to write the mel spectograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  #catch missing wav exception
        print(
            'file {} present in csv metadata is not present in wav folder. skipping!'
            .format(wav_path))
        return None

    #rescale wav
    if hparams.rescaling:  # hparams.rescale = True
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    #M-AILABS extra silence specific
    if hparams.trim_silence:  # hparams.trim_silence = True
        wav = audio.trim_silence(wav,
                                 hparams)  # Trim leading and trailing silence

    # Mu-law quantize; the default input_type is 'raw'
    if hparams.input_type == 'mulaw-quantize':
        #[0, quantize_channels)
        out = audio.mulaw_quantize(wav, hparams.quantize_channels)

        #Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = audio.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif hparams.input_type == 'mulaw':
        #[-1, 1]
        out = audio.mulaw(wav, hparams.quantize_channels)
        constant_values = audio.mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:  # raw
        #[-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:  # hparams.max_mel_frames = 1000, hparams.clip_mels_length = True
        return None

    #Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav,
                                                 hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    #sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:  # hparams.use_lws = False
        #Ensure time resolution adjustement between audio and mel-spectrogram
        fft_size = hparams.fft_size if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        #Zero pad audio signal
        out = np.pad(out, (l, r),
                     mode='constant',
                     constant_values=constant_values)
    else:
        #Ensure time resolution adjustement between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.fft_size,
                                   audio.get_hop_size(hparams))

        #Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency)
        out = np.pad(out, pad, mode='reflect')

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    #time resolution adjustement
    #ensure length of raw audio is multiple of hop size so that we can use
    #transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    wav_id = os.path.splitext(os.path.basename(wav_path))[0]

    # Write the spectrograms to disk:
    audio_filename = '{}-audio.npy'.format(wav_id)
    mel_filename = '{}-mel.npy'.format(wav_id)
    linear_filename = '{}-linear.npy'.format(wav_id)
    npz_filename = '{}.npz'.format(wav_id)
    npz_flag = True
    if npz_flag:
        # Use the same keys as the Tacotron code for compatibility.
        data = {
            'audio': out.astype(out_dtype),
            'mel': mel_spectrogram.T,
            'linear': linear_spectrogram.T,
            'time_steps': time_steps,
            'mel_frames': mel_frames,
            'text': text,
            'tokens': text_to_sequence(text),  # a trailing "1" (the eos token "~") is appended
            'loss_coeff': 1  # For Tacotron
        }

        np.savez(os.path.join(out_dir, npz_filename),
                 **data,
                 allow_pickle=False)
    else:
        np.save(os.path.join(out_dir, audio_filename),
                out.astype(out_dtype),
                allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename),
                mel_spectrogram.T,
                allow_pickle=False)
        np.save(os.path.join(out_dir, linear_filename),
                linear_spectrogram.T,
                allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps,
            mel_frames, text, npz_filename)
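For context, the tuple returned above is typically gathered by the preprocessing driver and written out as one pipe-separated line per utterance. A hypothetical sketch (the `metadata` list of (wav_path, text) pairs is an assumption):

def write_metadata(metadata, out_dir, hparams):
    # Run _process_utterance over every (wav_path, text) pair and dump the
    # surviving rows to train.txt.
    rows = (_process_utterance(out_dir, wav_path, text, hparams)
            for wav_path, text in metadata)
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        for row in rows:
            if row is not None:
                f.write('|'.join(map(str, row)) + '\n')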
Example #7
def _process_utterance(out_dir, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair

    this writes the mel scale spectogram to disk and return a tuple to write
    to the train.txt file

    Args:
        - mel_dir: the directory to write the mel spectograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)  # 1-D waveform array, e.g. shape (240001,)
        # Loads an audio file as a floating-point time series.
        # Audio is automatically resampled to the given rate (default sr=22050).
        # To preserve the native sampling rate of the file, use sr=None.
    except FileNotFoundError: #catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(
            wav_path))
        return None

    #rescale wav
    if hparams.rescaling:   # hparams.rescale = True
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
    #We rescale because it is assumed in Wavenet training that wavs are in [-1, 1] when computing the mixture loss. This is mainly coming from PixelCNN implementation.
    #https://github.com/Rayhane-mamah/Tacotron-2/issues/69

    #M-AILABS extra silence specific
    if hparams.trim_silence:  # hparams.trim_silence = True
        wav = audio.trim_silence(wav, hparams)   # Trim leading and trailing silence

    # Mu-law quantize; the default input_type is 'raw'.
    # The quantization noise comes from the analog-to-digital conversion; mu-law compression reduces that noise and increases the dynamic range.
    # Elsewhere in the code the input is always mu-law encoded; scalar_input only determines whether the model uses
    # a one-hot encoding for every sample of the input waveform or a single floating-point value per sample.
    if hparams.input_type == 'mulaw-quantize':
        #[0, quantize_channels)
        out = audio.mulaw_quantize(wav, hparams.quantize_channels)

        #Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start: end]
        out = out[start: end]

        constant_values = audio.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif hparams.input_type == 'mulaw':
        #[-1, 1]
        out = audio.mulaw(wav, hparams.quantize_channels)
        constant_values = audio.mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:  # raw
        #[-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)  # e.g. shape (80, 801)
    mel_frames = mel_spectrogram.shape[1]  # e.g. 801
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:   # hparams.max_mel_frames = 1000, hparams.clip_mels_length = True
        return None

    #Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)  # e.g. shape (1025, 801)
    linear_frames = linear_spectrogram.shape[1]

    #sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:    # hparams.use_lws = False
        #Ensure time resolution adjustement between audio and mel-spectrogram
        fft_size = hparams.fft_size if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        #Zero pad audio signal
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        #Ensure time resolution adjustement between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.fft_size, audio.get_hop_size(hparams))  # 1024 == 2048 // 2 == fft_size // 2
        # Reflect pad audio signal (just like it's done in librosa, to avoid frame inconsistency)
        out = np.pad(out, pad, mode='reflect')  # e.g. (240001,) -> (242049,)

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    #time resolution adjustement
    #ensure length of raw audio is multiple of hop size so that we can use
    #transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]  # trimmed, e.g. to 240300 = 801 * 300 samples
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    wav_id = os.path.splitext(os.path.basename(wav_path))[0]  # file name without extension
    # Write the spectrograms to disk:
    audio_filename = '{}-audio.npy'.format(wav_id)
    mel_filename = '{}-mel.npy'.format(wav_id)
    linear_filename = '{}-linear.npy'.format(wav_id)
    npz_filename = '{}.npz'.format(wav_id)
    npz_flag = True
    if npz_flag:
        # Use the same keys as the Tacotron code for compatibility.
        data = {
            'audio': out.astype(out_dtype),
            'mel': mel_spectrogram.T,
            'linear': linear_spectrogram.T,
            'time_steps': time_steps,
            'mel_frames': mel_frames,
            'text': text,
            'tokens': text_to_sequence(text),   # a trailing "1" (the eos token "~") is appended
            'loss_coeff': 1  # For Tacotron
        }
        # Save several arrays into a single uncompressed .npz file.
        np.savez(os.path.join(out_dir, npz_filename), **data, allow_pickle=False)
    else:
        np.save(os.path.join(out_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
        np.save(os.path.join(out_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text, npz_filename)
Example #8
def _process_utterance(mfcc_dir, wav_dir, index, wav_path, hparams, mode):
	"""
	Preprocesses a single utterance wav/text pair

	this writes the mfcc to disk and return a tuple to write
	to the train.txt file

	Args:
		- mfcc_dir: the directory to write the mfcc into
		- linear_dir: the directory to write the linear spectrograms into
		- wav_dir: the directory to write the preprocessed wav into
		- index: the numeric index to use in the spectrogram filename
		- wav_path: path to the audio file containing the speech input
		- text: text spoken in the input audio file
		- hparams: hyper parameters

	Returns:
		- A tuple: (audio_filename, mfcc_filename, linear_filename, time_steps, mfcc_frames, linear_frames, text)
	"""

	try:
		# Load the audio as numpy array
		wav_full = audio.load_wav(wav_path, sr=hparams.sample_rate)
	except FileNotFoundError: #catch missing wav exception
		print('file {} present in csv metadata is not present in wav folder. skipping!'.format(
			wav_path))
		return None

	#M-AILABS extra silence specific
	if hparams.trim_silence:
		wav_full = audio.trim_silence(wav_full, hparams)

	# Preprocess Audio & Extract MFCC (mfcc + d + a)
	sample_idx = 0
	sample_metadata = []

	if (mode == "train") or (mode == "post_train"):
		# Add the same size slice from the end
		if wav_full.shape[0] >= hparams.sample_size:
			n_slice = int(np.floor(wav_full.shape[0]/hparams.sample_size))
			samples = wav_full[:n_slice * hparams.sample_size].reshape((n_slice, hparams.sample_size))
			if wav_full.shape[0] % hparams.sample_size != 0:
				## FOR UNIT SEARCH : slice each audio by sample_size
				last_slice = wav_full[-hparams.sample_size:]  # take the last sample_size samples (without reversing)
				samples = np.vstack((samples, last_slice))
		else:
			samples = [wav_full]
	else:
		samples = [wav_full]


	for wav in samples:

		#Pre-emphasize
		preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize)

		#rescale wav
		if hparams.rescale:
			wav = wav / np.abs(wav).max() * hparams.rescaling_max
			preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max

			#Assert all audio is in [-1, 1]
			if (wav > 1.).any() or (wav < -1.).any():
				raise RuntimeError('wav has invalid value: {}'.format(wav_path))
			if (preem_wav > 1.).any() or (preem_wav < -1.).any():
				raise RuntimeError('wav has invalid value: {}'.format(wav_path))

		#Mu-law quantize
		if is_mulaw_quantize(hparams.input_type):
			#[0, quantize_channels)
			out = mulaw_quantize(wav, hparams.quantize_channels)

			# #Trim silences
			# start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
			# wav = wav[start: end]
			# preem_wav = preem_wav[start: end]
			# out = out[start: end]

			constant_values = mulaw_quantize(0, hparams.quantize_channels)
			out_dtype = np.int16

		elif is_mulaw(hparams.input_type):
			#[-1, 1]
			out = mulaw(wav, hparams.quantize_channels)
			constant_values = mulaw(0., hparams.quantize_channels)
			out_dtype = np.float32

		else:
			#[-1, 1]
			out = wav
			constant_values = 0.
			out_dtype = np.float32

		# Compute mfcc
		mfcc = audio.mfcc(wav, hparams)
		mfcc_frames = mfcc.shape[0]

		# # Compute the mel scale spectrogram from the wav
		# mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32)
		# mel_frames = mel_spectrogram.shape[1]

		if mfcc_frames > hparams.max_mel_frames and hparams.clip_mels_length:
			return None

		#Ensure time resolution adjustement between audio and mel-spectrogram
		l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))
		#Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency)
		out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values)

		assert len(out) >= mfcc_frames * audio.get_hop_size(hparams)

		#time resolution adjustement
		#ensure length of raw audio is multiple of hop size so that we can use
		out = out[:int(np.ceil(mfcc_frames/hparams.vqvae_down_freq) * hparams.vqvae_down_freq * audio.get_hop_size(hparams))]
		assert len(out) % audio.get_hop_size(hparams) == 0
		time_steps = len(out)

		# Write the spectrogram and audio to disk
		audio_filename = os.path.join(wav_dir, 'audio-{}-{}.npy'.format(index, sample_idx))
		mfcc_filename = os.path.join(mfcc_dir, 'mfcc-{}-{}.npy'.format(index, sample_idx))
		np.save(audio_filename, out.astype(out_dtype), allow_pickle=False)
		np.save(mfcc_filename, mfcc, allow_pickle=False)

		#global condition features
		if hparams.gin_channels > 0:
			if (mode == "train") or (mode == "post_train"):
				speaker_id = hparams.speakers.index(index[:4])
			elif mode == "synth":
				speaker_id = 0
			else:
				speaker_id = '<no_g>'
		else:
			speaker_id = '<no_g>'  # keep speaker_id defined when global conditioning is disabled

		sample_metadata.append((audio_filename, mfcc_filename, mfcc_filename, speaker_id, time_steps, mfcc_frames))
		sample_idx += 1


	return sample_metadata
Example #9
def _process_utterance(out_dir, index, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    # Write the spectrograms to disk:
    audio_filename = 'bznsyp-audio-%05d.npy' % index
    mel_filename = 'bznsyp-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32),
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text)
Example #10

def assert_ready_for_upsampling(x, c):
    assert len(x) % len(c) == 0 and len(x) // len(c) == audio.get_hop_size()
Example #11
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array. Resampled if needed
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    # TODO
    if exists(lab_path) and False:
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=20)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=20)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    # Write the spectrograms to disk:
    audio_filename = 'cmu_arctic-audio-%05d.npy' % index
    mel_filename = 'cmu_arctic-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text, speaker_id)
Example #12
def eval_model(global_step,
               writer,
               device,
               model,
               y,
               c,
               g,
               input_lengths,
               eval_dir,
               ema=None):
    if ema is not None:
        print("Using averaged model for evaluation")
        model = clone_as_averaged_model(device, model, ema)
        model.make_generation_fast_()

    model.eval()
    idx = np.random.randint(0, len(y))
    length = input_lengths[idx].data.cpu().item()

    # (T,)
    y_target = y[idx].view(-1).data.cpu().numpy()[:length]

    if c is not None:
        if hparams.upsample_conditional_features:
            c = c[idx, :, :length // audio.get_hop_size()].unsqueeze(0)
        else:
            c = c[idx, :, :length].unsqueeze(0)
        assert c.dim() == 3
        print("Shape of local conditioning features: {}".format(c.size()))
    if g is not None:
        # TODO: test
        g = g[idx]
        print("Shape of global conditioning features: {}".format(g.size()))

    # Dummy silence
    if is_mulaw_quantize(hparams.input_type):
        initial_value = P.mulaw_quantize(0, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        initial_value = P.mulaw(0.0, hparams.quantize_channels)
    else:
        initial_value = 0.0
    print("Intial value:", initial_value)

    # (C,)
    if is_mulaw_quantize(hparams.input_type):
        initial_input = np_utils.to_categorical(
            initial_value,
            num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = torch.from_numpy(initial_input).view(
            1, 1, hparams.quantize_channels)
    else:
        initial_input = torch.zeros(1, 1, 1).fill_(initial_value)
    initial_input = initial_input.to(device)

    # Run the model in fast eval mode
    with torch.no_grad():
        y_hat = model.incremental_forward(initial_input,
                                          c=c,
                                          g=g,
                                          T=length,
                                          softmax=True,
                                          quantize=True,
                                          tqdm=tqdm,
                                          log_scale_min=hparams.log_scale_min)

    if is_mulaw_quantize(hparams.input_type):
        y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
        y_target = P.inv_mulaw_quantize(y_target, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        y_hat = P.inv_mulaw(
            y_hat.view(-1).cpu().data.numpy(), hparams.quantize_channels)
        y_target = P.inv_mulaw(y_target, hparams.quantize_channels)
    else:
        y_hat = y_hat.view(-1).cpu().data.numpy()

    # Save audio
    os.makedirs(eval_dir, exist_ok=True)
    path = join(eval_dir, "step{:09d}_predicted.wav".format(global_step))
    librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)
    path = join(eval_dir, "step{:09d}_target.wav".format(global_step))
    librosa.output.write_wav(path, y_target, sr=hparams.sample_rate)

    # save figure
    path = join(eval_dir, "step{:09d}_waveplots.png".format(global_step))
    save_waveplot(path, y_hat, y_target)
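`clone_as_averaged_model` is referenced but not defined in this snippet. A hedged sketch of what it is assumed to do, given an EMA object that keeps a `shadow` dict mapping parameter names to averaged tensors (the `build_model()` factory is also an assumption):

def clone_as_averaged_model(device, model, ema):
    # Copy the current model, then overwrite its parameters with the
    # exponentially averaged ("shadow") values tracked by the EMA object.
    averaged_model = build_model().to(device)  # hypothetical model factory
    averaged_model.load_state_dict(model.state_dict())
    for name, param in averaged_model.named_parameters():
        if name in ema.shadow:
            param.data = ema.shadow[name].clone()
    return averaged_model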
Example #13
def collate_fn(batch):
    """Create batch

    Args:
        batch(tuple): List of tuples
            - x[0] (ndarray,int) : list of (T,)
            - x[1] (ndarray,int) : list of (T, D)
            - x[2] (ndarray,int) : list of (1,), speaker id
    Returns:
        tuple: Tuple of batch
            - x (FloatTensor) : Network inputs (B, C, T)
            - y (LongTensor)  : Network targets (B, T, 1)
    """

    local_conditioning = len(batch[0]) >= 2 and hparams.cin_channels > 0
    global_conditioning = len(batch[0]) >= 3 and hparams.gin_channels > 0

    if hparams.max_time_sec is not None:
        max_time_steps = int(hparams.max_time_sec * hparams.sample_rate)
    elif hparams.max_time_steps is not None:
        max_time_steps = hparams.max_time_steps
    else:
        max_time_steps = None

    # Time resolution adjustment
    if local_conditioning:
        new_batch = []
        for idx in range(len(batch)):
            x, c, g = batch[idx]
            if hparams.upsample_conditional_features:
                assert_ready_for_upsampling(x, c)
                if max_time_steps is not None:
                    max_steps = ensure_divisible(max_time_steps,
                                                 audio.get_hop_size(), True)
                    if len(x) > max_steps:
                        max_time_frames = max_steps // audio.get_hop_size()
                        s = np.random.randint(0, len(c) - max_time_frames)
                        ts = s * audio.get_hop_size()
                        x = x[ts:ts + audio.get_hop_size() * max_time_frames]
                        c = c[s:s + max_time_frames, :]
                        assert_ready_for_upsampling(x, c)
            else:
                x, c = audio.adjust_time_resolution(x, c)
                if max_time_steps is not None and len(x) > max_time_steps:
                    s = np.random.randint(0, len(x) - max_time_steps)
                    x, c = x[s:s + max_time_steps], c[s:s + max_time_steps, :]
                assert len(x) == len(c)
            new_batch.append((x, c, g))
        batch = new_batch
    else:
        new_batch = []
        for idx in range(len(batch)):
            x, c, g = batch[idx]
            x = audio.trim(x)
            if max_time_steps is not None and len(x) > max_time_steps:
                s = np.random.randint(0, len(x) - max_time_steps)
                if local_conditioning:
                    x, c = x[s:s + max_time_steps], c[s:s + max_time_steps, :]
                else:
                    x = x[s:s + max_time_steps]
            new_batch.append((x, c, g))
        batch = new_batch

    # Lengths
    input_lengths = [len(x[0]) for x in batch]
    max_input_len = max(input_lengths)

    # (B, T, C)
    # pad for time-axis
    if is_mulaw_quantize(hparams.input_type):
        x_batch = np.array([
            _pad_2d(
                np_utils.to_categorical(x[0],
                                        num_classes=hparams.quantize_channels),
                max_input_len) for x in batch
        ],
                           dtype=np.float32)
    else:
        x_batch = np.array(
            [_pad_2d(x[0].reshape(-1, 1), max_input_len) for x in batch],
            dtype=np.float32)
    assert len(x_batch.shape) == 3

    # (B, T)
    if is_mulaw_quantize(hparams.input_type):
        y_batch = np.array([_pad(x[0], max_input_len) for x in batch],
                           dtype=np.int64)  # np.int is removed in recent NumPy; use a fixed-width int
    else:
        y_batch = np.array([_pad(x[0], max_input_len) for x in batch],
                           dtype=np.float32)
    assert len(y_batch.shape) == 2

    # (B, T, D)
    if local_conditioning:
        max_len = max([len(x[1]) for x in batch])
        c_batch = np.array([_pad_2d(x[1], max_len) for x in batch],
                           dtype=np.float32)
        assert len(c_batch.shape) == 3
        # (B x C x T)
        c_batch = torch.FloatTensor(c_batch).transpose(1, 2).contiguous()
    else:
        c_batch = None

    if global_conditioning:
        g_batch = torch.LongTensor([x[2] for x in batch])
    else:
        g_batch = None

    # Convert to channel first i.e., (B, C, T)
    x_batch = torch.FloatTensor(x_batch).transpose(1, 2).contiguous()
    # Add extra axis
    if is_mulaw_quantize(hparams.input_type):
        y_batch = torch.LongTensor(y_batch).unsqueeze(-1).contiguous()
    else:
        y_batch = torch.FloatTensor(y_batch).unsqueeze(-1).contiguous()

    input_lengths = torch.LongTensor(input_lengths)

    return x_batch, y_batch, c_batch, g_batch, input_lengths
Example #14

def collate_fn(batch):
    """Create batch

    Args:
        batch(tuple): List of tuples
            - x[0] (ndarray,int) : list of (T,)
            - x[1] (ndarray,int) : list of (T, D)
            - x[2] (ndarray,int) : list of (1,), speaker id
    Returns:
        tuple: Tuple of batch
            - x (FloatTensor) : Network inputs (B, C, T)
            - y (LongTensor)  : Network targets (B, T, 1)
    """

    local_conditioning = len(batch[0]) >= 2 and hparams.cin_channels > 0
    global_conditioning = len(batch[0]) >= 3 and hparams.file_channel > 0

    if hparams.max_time_sec is not None:
        max_time_steps = int(hparams.max_time_sec * hparams.sample_rate)
    elif hparams.max_time_steps is not None:
        max_time_steps = hparams.max_time_steps
    else:
        max_time_steps = None
    max_time_second = max_time_steps / hparams.sample_rate

    use_image_num = int(
        np.floor(max_time_second / (0.04 * hparams.image_hope_size)))
    # Time resolution adjustment
    video_block = []
    flow_block = []
    if local_conditioning:
        new_batch = []
        for idx in range(len(batch)):
            x, c, video, flow, start, g, path = batch[idx]
            if hparams.upsample_conditional_features:
                assert_ready_for_upsampling(x, c)
                if max_time_steps is not None:
                    max_steps = ensure_divisible(max_time_steps,
                                                 audio.get_hop_size(), True)
                    if len(x) > max_steps:

                        for ln in range(hparams.load_num):
                            mel_start = 3 + 4 * start[ln]
                            c1 = c[mel_start:mel_start + use_image_num * 4]
                            x1 = x[mel_start *
                                   hparams.hop_size:(mel_start +
                                                     use_image_num * 4) *
                                   hparams.hop_size]
                            new_batch.append(
                                (x1, c1, g, os.path.join(path,
                                                         str(start[ln]))))
                        video_block.append(torch.FloatTensor(video))
                        flow_block.append(torch.FloatTensor(flow))
        batch = new_batch

    # Lengths
    input_lengths = [len(x[0]) for x in batch]
    max_input_len = max(input_lengths)

    # (B, T, C)
    # pad for time-axis
    if is_mulaw_quantize(hparams.input_type):
        x_batch = np.array([
            _pad_2d(
                np_utils.to_categorical(x[0],
                                        num_classes=hparams.quantize_channels),
                max_input_len) for x in batch
        ],
                           dtype=np.float32)
    else:
        x_batch = np.array(
            [_pad_2d(x[0].reshape(-1, 1), max_input_len) for x in batch],
            dtype=np.float32)
    assert len(x_batch.shape) == 3

    # (B, T)
    if is_mulaw_quantize(hparams.input_type):
        y_batch = np.array([_pad(x[0], max_input_len) for x in batch],
                           dtype=np.int64)  # np.int is removed in recent NumPy; use a fixed-width int
    else:
        y_batch = np.array([_pad(x[0], max_input_len) for x in batch],
                           dtype=np.float32)
    assert len(y_batch.shape) == 2

    # (B, T, D)
    if local_conditioning:
        max_len = max([len(x[1]) for x in batch])
        c_batch = np.array([_pad_2d(x[1], max_len) for x in batch],
                           dtype=np.float32)
        assert len(c_batch.shape) == 3
        # (B x C x T)
        c_batch = torch.FloatTensor(c_batch).transpose(1, 2).contiguous()
    else:
        c_batch = None

    if global_conditioning:
        g_batch = torch.LongTensor([x[2] for x in batch])
    else:
        g_batch = None

    path_batch = list(x[3] for x in batch)

    video_batch = torch.cat(video_block, 0)
    flow_batch = torch.cat(flow_block, 0)

    # Convert to channel first i.e., (B, C, T)
    x_batch = torch.FloatTensor(x_batch).transpose(1, 2).contiguous()
    # Add extra axis
    if is_mulaw_quantize(hparams.input_type):
        y_batch = torch.LongTensor(y_batch).unsqueeze(-1).contiguous()
    else:
        y_batch = torch.FloatTensor(y_batch).unsqueeze(-1).contiguous()

    input_lengths = torch.LongTensor(input_lengths)

    return video_batch, flow_batch, c_batch, x_batch, y_batch, g_batch, input_lengths, path_batch
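Both collate_fn variants rely on `_pad` and `_pad_2d` helpers that are not shown. A minimal sketch of the assumed behaviour: pad 1-D targets and 2-D feature sequences up to the batch maximum length along the time axis.

def _pad(x, max_len, constant_values=0):
    # Pad a 1-D sequence (T,) with constant_values up to max_len.
    return np.pad(x, (0, max_len - len(x)),
                  mode='constant', constant_values=constant_values)

def _pad_2d(x, max_len, constant_values=0):
    # Pad a 2-D sequence (T, D) along the time axis up to max_len.
    return np.pad(x, [(0, max_len - len(x)), (0, 0)],
                  mode='constant', constant_values=constant_values)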
Example #15
    def _assert_ready_for_upsample(self, x, c):
        assert len(x) % len(c) == 0 and len(x) // len(c) == audio.get_hop_size()
Example #16
def _process_utterance(out_dir, index, audio_filepath, text):
    # Load the audio to a numpy array:
    wav_whole = audio.load_wav(audio_filepath)

    if hparams.rescaling:
        wav_whole = wav_whole / np.abs(wav_whole).max() * hparams.rescaling_max

    # This is a librivox source, so the audio files are going to be v. long
    # compared to a typical 'utterance' : So split the wav into chunks

    tup_results = []

    n_samples = int(8.0 * hparams.sample_rate)  # All 8 second utterances
    n_chunks = wav_whole.shape[0] // n_samples

    for chunk_idx in range(n_chunks):
        chunk_start, chunk_end = chunk_idx * n_samples, (chunk_idx + 1) * n_samples
        if chunk_idx == n_chunks - 1:  # This is the last chunk - allow it to extend to the end of the file
            chunk_end = None
        wav = wav_whole[chunk_start: chunk_end]

        # Mu-law quantize
        if is_mulaw_quantize(hparams.input_type):
            # [0, quantize_channels)
            out = P.mulaw_quantize(wav, hparams.quantize_channels)

            # Trim silences
            start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
            wav = wav[start:end]
            out = out[start:end]
            constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
            out_dtype = np.int16
        elif is_mulaw(hparams.input_type):
            # [-1, 1]
            out = P.mulaw(wav, hparams.quantize_channels)
            constant_values = P.mulaw(0.0, hparams.quantize_channels)
            out_dtype = np.float32
        else:
            # [-1, 1]
            out = wav
            constant_values = 0.0
            out_dtype = np.float32

        # Compute a mel-scale spectrogram from the trimmed wav:
        # (N, D)
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
        # lws pads zeros internally before performing stft
        # this is needed to adjust time resolution between audio and mel-spectrogram
        l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

        # zero pad for quantized signal
        out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
        N = mel_spectrogram.shape[0]
        assert len(out) >= N * audio.get_hop_size()

        # time resolution adjustment
        # ensure length of raw audio is multiple of hop_size so that we can use
        # transposed convolution to upsample
        out = out[:N * audio.get_hop_size()]
        assert len(out) % audio.get_hop_size() == 0

        timesteps = len(out)

        # Write the spectrograms to disk:
        audio_filename = 'librivox-audio-%04d-%05d.npy' % (index, chunk_idx,)
        mel_filename = 'librivox-mel-%04d-%05d.npy' % (index, chunk_idx,)
        text_idx = '%s - %05d' % (text, chunk_idx,)
        np.save(os.path.join(out_dir, audio_filename),
                out.astype(out_dtype), allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename),
                mel_spectrogram.astype(np.float32), allow_pickle=False)

        # Add results tuple describing this training example:
        tup_results.append((audio_filename, mel_filename, timesteps, text_idx))

    # Return all the audio results tuples (unpack in caller)
    return tup_results
Example #17
def _process_utterance(out_dir, index, wav_path, text, silence_threshold,
                       fft_size):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file

    Returns:
      A (spectrogram_filename, mel_filename, text, mel_len) tuple to write to train.txt
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hp.rescaling:
        wav = wav / np.abs(wav).max() * hp.rescaling_max

    if hp.input_type != "raw":
        # Mu-law quantize
        out = P.mulaw_quantize(wav)

        # Trim silences
        start, end = audio.start_and_end_indices(out, silence_threshold)
        out = out[start:end]
        wav = wav[start:end]
        constant_value = P.mulaw_quantize(0, 256)
        out_dtype = np.int16
    else:
        out = wav
        constant_value = 0.
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_value)
    mel_len = mel_spectrogram.shape[0]
    assert len(out) >= mel_len * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:mel_len * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    wav_id = wav_path.split('/')[-1].split('.')[0]
    # Write the spectrograms to disk:
    audio_path = os.path.join(out_dir, '{}-audio.npy'.format(wav_id))
    mel_path = os.path.join(out_dir, '{}-mel.npy'.format(wav_id))
    np.save(audio_path, out.astype(out_dtype), allow_pickle=False)
    np.save(mel_path, mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return os.path.abspath(audio_path), os.path.abspath(
        mel_path), text, timesteps