Example #1
    def _adjust_time_resolution(self, batch, local_condition, max_time_steps):
        '''Adjust time resolution between audio and local condition.'''
        if local_condition:
            new_batch = []
            for b in batch:
                x, c, g, l = b
                self._assert_ready_for_upsample(x, c)
                if max_time_steps is not None:
                    max_steps = _ensure_divisible(
                        max_time_steps, audio.get_hop_size(self._hparams),
                        True)
                    if len(x) > max_time_steps:
                        max_time_frames = max_steps // audio.get_hop_size(
                            self._hparams)
                        start = np.random.randint(0, len(c) - max_time_frames)
                        time_start = start * audio.get_hop_size(self._hparams)
                        x = x[time_start:time_start + max_time_frames *
                              audio.get_hop_size(self._hparams)]
                        c = c[start:start + max_time_frames, :]
                        self._assert_ready_for_upsample(x, c)

                new_batch.append((x, c, g, l))
            return new_batch

        else:
            new_batch = []
            for b in batch:
                x, c, g, l = b
                x = audio.trim_silence(x, self._hparams)
                if max_time_steps is not None and len(x) > max_time_steps:
                    start = np.random.randint(0, len(x) - max_time_steps)
                    x = x[start:start + max_time_steps]
                new_batch.append((x, c, g, l))
            return new_batch
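For context, here is a minimal sketch of how a feeder method like this might be exercised. The batch layout of (audio, local_condition, global_condition, length) tuples, the hop size, and the `_hparams` attribute are assumptions inferred from the snippet above, not a confirmed API.

import numpy as np

def demo_adjust(feeder, hop_size=256, max_time_steps=8192):
    # Hypothetical illustration only: `feeder` is assumed to be an instance of the
    # class defining _adjust_time_resolution, and hop_size must match
    # audio.get_hop_size(feeder._hparams) for the internal asserts to pass.
    frames = 100
    x = np.random.uniform(-1.0, 1.0, size=frames * hop_size).astype(np.float32)
    c = np.random.uniform(size=(frames, 80)).astype(np.float32)  # e.g. an 80-band mel
    batch = [(x, c, None, len(x))]
    return feeder._adjust_time_resolution(batch, local_condition=True,
                                          max_time_steps=max_time_steps)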
Example #2
def extract_mel(wav_filename, out_wav_path, out_dir, key, hparams, args):
    if not os.path.exists(wav_filename):
        print("Wav file {} doesn't exists.".format(wav_filename))
        return None

    wav = audio.load_wav(wav_filename, sr=hparams.sample_rate)
    # Process wav samples
    wav = audio.trim_silence(wav, hparams)
    n_samples = len(wav)

    # Extract mel spectrogram
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    n_frames = mel_spectrogram.shape[1]
    if n_frames > hparams.max_acoustic_length:
        print(
            "Ignoring wav {} because its frame count {} exceeds the maximum of {} frames set in hparams.yaml."
            .format(wav_filename, n_frames, hparams.max_acoustic_length))
        return None

    # Align features
    desired_frames = int(min(n_samples / hparams.hop_size, n_frames))
    wav = wav[:desired_frames * hparams.hop_size]
    mel_spectrogram = mel_spectrogram[:, :desired_frames]
    n_samples = wav.shape[0]
    n_frames = mel_spectrogram.shape[1]
    assert (n_samples / hparams.hop_size == n_frames)

    # Save intermediate acoustic features
    mel_filename = os.path.join(out_dir, key + '.npy')
    np.save(mel_filename, mel_spectrogram.T, allow_pickle=False)
    audio.save_wav(wav, out_wav_path, hparams)

    return (wav_filename, mel_filename, n_samples, n_frames)
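A possible driver loop for this helper (not part of the source repo; the directory layout and the `hparams`/`args` objects are assumptions) could look like:

import glob
import os

def run_extraction(wav_root, out_dir, hparams, args):
    # Extract mel features for every wav under wav_root, skipping entries for
    # which extract_mel returns None (missing file or clip longer than allowed).
    os.makedirs(out_dir, exist_ok=True)
    metadata = []
    for wav_filename in sorted(glob.glob(os.path.join(wav_root, '*.wav'))):
        key = os.path.splitext(os.path.basename(wav_filename))[0]
        out_wav_path = os.path.join(out_dir, key + '.wav')
        result = extract_mel(wav_filename, out_wav_path, out_dir, key, hparams, args)
        if result is not None:
            metadata.append(result)
    return metadata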
Example #3
def create_seed(filename, sample_rate, quantization_channels, window_size, scalar_input):
    # Use only the leading part of the seed audio.
    seed_audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
    seed_audio = audio.trim_silence(seed_audio, default_hparams)
    if scalar_input:
        if len(seed_audio) < window_size:
            return seed_audio
        else:
            return seed_audio[:window_size]
    else:
        quantized = mu_law_encode(seed_audio, quantization_channels)

        # If the seed is shorter than window_size it is returned as-is;
        # shouldn't it at least be padded???
        cut_index = tf.cond(tf.size(quantized) < tf.constant(window_size),
                            lambda: tf.size(quantized),
                            lambda: tf.constant(window_size))

        return quantized[:cut_index]
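A hedged usage sketch (the file name and parameter values are purely illustrative):

# Produce a seed waveform for priming WaveNet generation.
seed = create_seed('p225_001.wav',
                   sample_rate=16000,
                   quantization_channels=256,
                   window_size=8000,
                   scalar_input=False)
# With scalar_input=False the result is a tf.Tensor of mu-law encoded samples,
# so it has to be evaluated inside a TF1 session (or adapted for eager mode).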
Example #4
def _process_utterance(out_dir, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair

    this writes the mel scale spectogram to disk and return a tuple to write
    to the train.txt file

    Args:
        - mel_dir: the directory to write the mel spectograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  #catch missing wav exception
        print(
            'file {} present in csv metadata is not present in wav folder. skipping!'
            .format(wav_path))
        return None

    #rescale wav
    if hparams.rescaling:  # hparams.rescaling = True
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    #M-AILABS extra silence specific
    if hparams.trim_silence:  # hparams.trim_silence = True
        wav = audio.trim_silence(wav,
                                 hparams)  # Trim leading and trailing silence

    #Mu-law quantize (the default input_type is 'raw')
    if hparams.input_type == 'mulaw-quantize':
        #[0, quantize_channels)
        out = audio.mulaw_quantize(wav, hparams.quantize_channels)

        #Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = audio.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif hparams.input_type == 'mulaw':
        #[-1, 1]
        out = audio.mulaw(wav, hparams.quantize_channels)
        constant_values = audio.mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:  # raw
        #[-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:  # hparams.max_mel_frames = 1000, hparams.clip_mels_length = True
        return None

    #Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav,
                                                 hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    #sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:  # hparams.use_lws = False
        #Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.fft_size if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        #Zero pad audio signal
        out = np.pad(out, (l, r),
                     mode='constant',
                     constant_values=constant_values)
    else:
        #Ensure time resolution adjustment between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.fft_size,
                                   audio.get_hop_size(hparams))

        #Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency)
        out = np.pad(out, pad, mode='reflect')

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    #time resolution adjustment
    #ensure length of raw audio is multiple of hop size so that we can use
    #transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    wav_id = os.path.splitext(os.path.basename(wav_path))[0]

    # Write the spectrograms to disk:
    audio_filename = '{}-audio.npy'.format(wav_id)
    mel_filename = '{}-mel.npy'.format(wav_id)
    linear_filename = '{}-linear.npy'.format(wav_id)
    npz_filename = '{}.npz'.format(wav_id)
    npz_flag = True
    if npz_flag:
        # Use the same keys as the Tacotron code.
        data = {
            'audio': out.astype(out_dtype),
            'mel': mel_spectrogram.T,
            'linear': linear_spectrogram.T,
            'time_steps': time_steps,
            'mel_frames': mel_frames,
            'text': text,
            'tokens': text_to_sequence(text),  # a "1" corresponding to eos(~) is appended at the end
            'loss_coeff': 1  # For Tacotron
        }

        np.savez(os.path.join(out_dir, npz_filename),
                 **data,
                 allow_pickle=False)
    else:
        np.save(os.path.join(out_dir, audio_filename),
                out.astype(out_dtype),
                allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename),
                mel_spectrogram.T,
                allow_pickle=False)
        np.save(os.path.join(out_dir, linear_filename),
                linear_spectrogram.T,
                allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps,
            mel_frames, text, npz_filename)
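For reference, a preprocessing script might call this function over a metadata file roughly as follows; the "id|text" CSV layout and directory names are assumptions, not taken from the source repo.

import os

def build_dataset(csv_path, wav_dir, out_dir, hparams):
    # Preprocess every utterance listed in an LJSpeech-style metadata file.
    os.makedirs(out_dir, exist_ok=True)
    examples = []
    with open(csv_path, encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('|')
            wav_path = os.path.join(wav_dir, parts[0] + '.wav')
            text = parts[-1]
            item = _process_utterance(out_dir, wav_path, text, hparams)
            if item is not None:  # None: missing wav or clip exceeded max_mel_frames
                examples.append(item)
    return examples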
Example #5
def _process_utterance(out_dir, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair

    this writes the mel scale spectogram to disk and return a tuple to write
    to the train.txt file

    Args:
        - mel_dir: the directory to write the mel spectograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
    """
    try:
        # Load the audio as a 1-D numpy array, resampled to hparams.sample_rate
        # (use sr=None to preserve the file's native sampling rate).
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)  # e.g. shape (240001,)
    except FileNotFoundError: #catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(
            wav_path))
        return None

    #rescale wav
    if hparams.rescaling:   # hparams.rescaling = True
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
    #We rescale because it is assumed in Wavenet training that wavs are in [-1, 1] when computing the mixture loss. This is mainly coming from PixelCNN implementation.
    #https://github.com/Rayhane-mamah/Tacotron-2/issues/69

    #M-AILABS extra silence specific
    if hparams.trim_silence:  # hparams.trim_silence = True
        wav = audio.trim_silence(wav, hparams)   # Trim leading and trailing silence

    #Mu-law quantize (the default input_type is 'raw')
    #The quantization noise is from the analog to digital conversion. The mu-law compression actually reduces the noise and increases the dynamic range.
    #If you search a little bit in the code you will find that the input is always mu-law encoded here.
    #scalar_input only determines if the model uses a one-hot encoding for every data point of the input waveform, or just uses floating point values for each sample.
    if hparams.input_type == 'mulaw-quantize':
        #[0, quantize_channels)
        out = audio.mulaw_quantize(wav, hparams.quantize_channels)

        #Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start: end]
        out = out[start: end]

        constant_values = audio.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif hparams.input_type == 'mulaw':
        #[-1, 1]
        out = audio.mulaw(wav, hparams.quantize_channels)
        constant_values = audio.mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:  # raw
        #[-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]  # e.g. 801 frames for an (80, 801) spectrogram
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:   # hparams.max_mel_frames = 1000, hparams.clip_mels_length = True
        return None

    #Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]  # e.g. (1025, 801) -> 801 frames

    #sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:    # hparams.use_lws = False
        #Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.fft_size if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        #Zero pad audio signal
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        #Ensure time resolution adjustment between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.fft_size, audio.get_hop_size(hparams))  # fft_size//2 == 1024
        #Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency)
        out = np.pad(out, pad, mode='reflect')  # e.g. (240001,) -> (242049,) after padding

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # Time resolution adjustment: ensure the length of the raw audio is a
    # multiple of the hop size so that transposed convolution can upsample it.
    out = out[:mel_frames * audio.get_hop_size(hparams)]  # e.g. trimmed to 801 * 300 = 240300 samples
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)
    # Write the spectrogram and audio to disk
    wav_id = os.path.splitext(os.path.basename(wav_path))[0]  # file name without the extension
    # Write the spectrograms to disk:
    audio_filename = '{}-audio.npy'.format(wav_id)
    mel_filename = '{}-mel.npy'.format(wav_id)
    linear_filename = '{}-linear.npy'.format(wav_id)
    npz_filename = '{}.npz'.format(wav_id)
    npz_flag = True
    if npz_flag:
        # Use the same keys as the Tacotron code.
        data = {
            'audio': out.astype(out_dtype),
            'mel': mel_spectrogram.T,  
            'linear': linear_spectrogram.T,
            'time_steps': time_steps,
            'mel_frames': mel_frames,
            'text': text,
            'tokens': text_to_sequence(text),   # a "1" corresponding to eos(~) is appended at the end
            'loss_coeff': 1  # For Tacotron
        }
        # Save several arrays into a single uncompressed .npz file.
        np.savez(os.path.join(out_dir, npz_filename), **data, allow_pickle=False)
    else:
        np.save(os.path.join(out_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
        np.save(os.path.join(out_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text, npz_filename)
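The shape figures kept in the comments above can be verified with a little arithmetic; this check assumes hop_size=300 and fft_size=2048, as those comments imply.

n_samples = 240001
hop_size, fft_size = 300, 2048
pad = fft_size // 2                      # 1024 samples of reflect padding per side
padded_len = n_samples + 2 * pad         # 242049
mel_frames = n_samples // hop_size + 1   # 801 frames for a librosa-style centered STFT
out_len = mel_frames * hop_size          # 240300, an exact multiple of hop_size
print(padded_len, mel_frames, out_len)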
Example #6
def _process_utterance(mfcc_dir, wav_dir, index, wav_path, hparams, mode):
	"""
	Preprocesses a single utterance wav/text pair

	this writes the mfcc to disk and return a tuple to write
	to the train.txt file

	Args:
		- mfcc_dir: the directory to write the mfcc into
		- linear_dir: the directory to write the linear spectrograms into
		- wav_dir: the directory to write the preprocessed wav into
		- index: the numeric index to use in the spectrogram filename
		- wav_path: path to the audio file containing the speech input
		- text: text spoken in the input audio file
		- hparams: hyper parameters

	Returns:
		- A tuple: (audio_filename, mfcc_filename, linear_filename, time_steps, mfcc_frames, linear_frames, text)
	"""

	try:
		# Load the audio as numpy array
		wav_full = audio.load_wav(wav_path, sr=hparams.sample_rate)
	except FileNotFoundError: #catch missing wav exception
		print('file {} present in csv metadata is not present in wav folder. skipping!'.format(
			wav_path))
		return None

	#M-AILABS extra silence specific
	if hparams.trim_silence:
		wav_full = audio.trim_silence(wav_full, hparams)

	# Preprocess Audio & Extract MFCC (mfcc + d + a)
	sample_idx = 0
	sample_metadata = []

	if (mode == "train") or (mode == "post_train"):
		# Add the same size slice from the end
		if wav_full.shape[0] >= hparams.sample_size:
			n_slice = int(np.floor(wav_full.shape[0]/hparams.sample_size))
			samples = wav_full[:n_slice * hparams.sample_size].reshape((n_slice, hparams.sample_size))
			if wav_full.shape[0] % hparams.sample_size != 0:
				## FOR UNIT SEARCH : slice each audio by sample_size
				last_slice = wav_full[-hparams.sample_size:]  # take the final sample_size samples
				samples = np.vstack((samples, last_slice))
		else:
			samples = [wav_full]
	else:
		samples = [wav_full]


	for wav in samples:

		#Pre-emphasize
		preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize)

		#rescale wav
		if hparams.rescale:
			wav = wav / np.abs(wav).max() * hparams.rescaling_max
			preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max

			#Assert all audio is in [-1, 1]
			if (wav > 1.).any() or (wav < -1.).any():
				raise RuntimeError('wav has invalid value: {}'.format(wav_path))
			if (preem_wav > 1.).any() or (preem_wav < -1.).any():
				raise RuntimeError('wav has invalid value: {}'.format(wav_path))

		#Mu-law quantize
		if is_mulaw_quantize(hparams.input_type):
			#[0, quantize_channels)
			out = mulaw_quantize(wav, hparams.quantize_channels)

			# #Trim silences
			# start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
			# wav = wav[start: end]
			# preem_wav = preem_wav[start: end]
			# out = out[start: end]

			constant_values = mulaw_quantize(0, hparams.quantize_channels)
			out_dtype = np.int16

		elif is_mulaw(hparams.input_type):
			#[-1, 1]
			out = mulaw(wav, hparams.quantize_channels)
			constant_values = mulaw(0., hparams.quantize_channels)
			out_dtype = np.float32

		else:
			#[-1, 1]
			out = wav
			constant_values = 0.
			out_dtype = np.float32

		# Compute mfcc
		mfcc = audio.mfcc(wav, hparams)
		mfcc_frames = mfcc.shape[0]

		# # Compute the mel scale spectrogram from the wav
		# mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32)
		# mel_frames = mel_spectrogram.shape[1]

		if mfcc_frames > hparams.max_mel_frames and hparams.clip_mels_length:
			return None

		#Ensure time resolution adjustment between audio and mel-spectrogram
		l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))
		#Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency)
		out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values)

		assert len(out) >= mfcc_frames * audio.get_hop_size(hparams)

		# Time resolution adjustment: ensure the length of the raw audio is a multiple
		# of the hop size (rounded up to a whole number of vqvae_down_freq frame groups).
		out = out[:int(np.ceil(mfcc_frames/hparams.vqvae_down_freq) * hparams.vqvae_down_freq * audio.get_hop_size(hparams))]
		assert len(out) % audio.get_hop_size(hparams) == 0
		time_steps = len(out)

		# Write the spectrogram and audio to disk
		audio_filename = os.path.join(wav_dir, 'audio-{}-{}.npy'.format(index, sample_idx))
		mfcc_filename = os.path.join(mfcc_dir, 'mfcc-{}-{}.npy'.format(index, sample_idx))
		np.save(audio_filename, out.astype(out_dtype), allow_pickle=False)
		np.save(mfcc_filename, mfcc, allow_pickle=False)

		#global condition features
		if hparams.gin_channels > 0:
			if (mode == "train") or (mode == "post_train"):
				speaker_id = hparams.speakers.index(index[:4])
			elif mode == "synth":
				speaker_id = 0
			else:
				speaker_id = '<no_g>'
		else:
			speaker_id = '<no_g>'  # avoid an unbound speaker_id when no global conditioning is used

		sample_metadata.append((audio_filename, mfcc_filename, mfcc_filename, speaker_id, time_steps, mfcc_frames))
		sample_idx += 1


	return sample_metadata
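A hedged example call (directory names, the index format and the loaded `hparams` object are assumptions):

# hparams is assumed to be the repo's loaded hyper-parameter object.
metadata = _process_utterance(mfcc_dir='training_data/mfccs',
                              wav_dir='training_data/audio',
                              index='p225_001',
                              wav_path='wavs/p225_001.wav',
                              hparams=hparams,
                              mode='train')
# Each entry of the returned list is
# (audio_filename, mfcc_filename, mfcc_filename, speaker_id, time_steps, mfcc_frames).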
Example #7
def _process_utterance(out_dir, index, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair

    this writes the mel scale spectogram to disk and return a tuple to write
    to the train.txt file

    Args:
        - mel_dir: the directory to write the mel spectograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print(
            'file {} present in csv metadata is not present in wav folder. skipping!'
            .format(wav_path))
        return None

    # Trim lead/trail silences
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Pre-emphasize
    preem_wav = audio.preemphasis(wav, hparams.preemphasis,
                                  hparams.preemphasize)

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
        preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max

        # Assert all audio is in [-1, 1]
        if (wav > 1.).any() or (wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))
        if (preem_wav > 1.).any() or (preem_wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))

    # [-1, 1]
    out = wav
    constant_values = 0.

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(preem_wav,
                                           hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if (mel_frames > hparams.max_mel_frames and hparams.clip_mels_length) or (
            hparams.min_text_tokens > len(text)
            or hparams.min_mel_frames > mel_frames):
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(preem_wav,
                                                 hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.hop_size,
                                        hparams.pad_sides)

    # Pad the audio signal with constant values so it stays frame-aligned with the spectrograms
    out = np.pad(out, (l_pad, r_pad),
                 mode='constant',
                 constant_values=constant_values)

    assert len(out) >= mel_frames * hparams.hop_size

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * hparams.hop_size]
    assert len(out) % hparams.hop_size == 0
    time_steps = len(out)
    npz_filename = '{}.npz'.format(index)
    mel_spectrogram = mel_spectrogram.T
    linear_spectrogram = linear_spectrogram.T

    r = hparams.reduction_factor
    if hparams.symmetric_mels:
        _pad_value = -hparams.max_abs_value
    else:
        _pad_value = 0.
    target_length = len(linear_spectrogram)
    mel_spectrogram = np.pad(mel_spectrogram, [[r, r], [0, 0]],
                             "constant",
                             constant_values=_pad_value)
    linear_spectrogram = np.pad(linear_spectrogram, [[r, r], [0, 0]],
                                "constant",
                                constant_values=_pad_value)
    target_length = target_length + 2 * r
    padded_target_length = (target_length // r + 1) * r
    num_pad = padded_target_length - target_length
    stop_token_target = np.pad(np.zeros(padded_target_length - 1,
                                        dtype=np.float32), (0, 1),
                               "constant",
                               constant_values=1)
    mel_spectrogram = np.pad(mel_spectrogram, ((0, num_pad), (0, 0)),
                             "constant",
                             constant_values=_pad_value)
    linear_spectrogram = np.pad(linear_spectrogram, ((0, num_pad), (0, 0)),
                                "constant",
                                constant_values=_pad_value)

    data = {
        'mel': mel_spectrogram,
        'linear': linear_spectrogram,
        'input_data': text_to_sequence(text),  # token sequence with eos(~) appended
        'time_steps': time_steps,
        'stop_token_target': stop_token_target,
        'mel_frames': padded_target_length,
        'text': text,
    }
    np.savez(os.path.join(out_dir, npz_filename), **data, allow_pickle=False)
    # Return a tuple describing this training example
    return npz_filename, time_steps, padded_target_length, text
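To see what ends up on disk, one can reload one of the .npz files written above; the file name below is illustrative and follows the '{index}.npz' pattern used by the function.

import numpy as np

example = np.load('000001.npz')
mel = example['mel']                        # (padded_frames, num_mels)
stop_targets = example['stop_token_target']
assert len(stop_targets) == mel.shape[0]    # one stop flag per padded frame

Example #8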
# -*- coding: utf-8 -*-
import numpy as np
from utils import audio
from hparams import hparams as hps

path = r'./data/000001.wav'

# Step 1: load the audio; the data is already in [-1, 1], so no normalization is needed
wav = audio.load_wav(path, hps.sample_rate)

# Step 2: trim leading and trailing silence
if hps.trim_silence:
    wav = audio.trim_silence(wav, hps)

# Step 3: compute the mel spectrogram
mel_spectrogram = audio.melspectrogram(wav, hps).astype(np.float32)

# Step 4: compute the linear (magnitude) spectrogram
linear_spectrogram = audio.linearspectrogram(wav, hps).astype(np.float32)

savename = path.split('/')[-1].split('.')[0]
mel_filename = './data/mel-{}.npy'.format(savename)
linear_filename = './data/linear-{}.npy'.format(savename)

np.save(mel_filename, mel_spectrogram.T, allow_pickle=False)
np.save(linear_filename, linear_spectrogram.T, allow_pickle=False)
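As an optional sanity check (purely illustrative, reusing the names defined above), the amount of removed silence can be inspected by comparing lengths before and after audio.trim_silence:

raw = audio.load_wav(path, hps.sample_rate)
trimmed = audio.trim_silence(raw, hps)
print('samples before/after trimming: {} -> {}'.format(len(raw), len(trimmed)))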