def _process_utterance(out_dir, index, wav_path, text):
    """Preprocesses a single utterance wav/text pair.

    This writes the mel-scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - out_dir: the directory to write the spectrograms into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file

    Returns:
        - A tuple: (mel_filename, n_frames, text)
    """
    # Load the audio as a numpy array
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav to calculate n_frames
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute the mel-scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrogram to disk
    mel_filename = 'ljspeech-mel-{:05d}.npy'.format(index)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (mel_filename, n_frames, text)
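# A minimal driver sketch for the function above, assuming an LJSpeech-style
# metadata.csv with "id|transcript" lines and a wavs/ folder next to it. The
# function name, file layout, and column order are assumptions for illustration,
# not part of the original code.
import os


def preprocess_ljspeech(in_dir, out_dir):
    """Hypothetical driver: run _process_utterance over metadata.csv and write train.txt."""
    os.makedirs(out_dir, exist_ok=True)
    metadata = []
    with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f:
        for index, line in enumerate(f, start=1):
            parts = line.strip().split('|')
            wav_path = os.path.join(in_dir, 'wavs', parts[0] + '.wav')
            text = parts[-1]
            metadata.append(_process_utterance(out_dir, index, wav_path, text))
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        for mel_filename, n_frames, text in metadata:
            f.write('|'.join([mel_filename, str(n_frames), text]) + '\n')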
def extract_mel(wav_filename, out_wav_path, out_dir, key, hparams, args):
    if not os.path.exists(wav_filename):
        print("Wav file {} doesn't exist.".format(wav_filename))
        return None

    wav = audio.load_wav(wav_filename, sr=hparams.sample_rate)

    # Process wav samples
    wav = audio.trim_silence(wav, hparams)
    n_samples = len(wav)

    # Extract mel spectrogram
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    n_frames = mel_spectrogram.shape[1]
    if n_frames > hparams.max_acoustic_length:
        print("Ignore wav {} because the frame number {} is too long (Max {} frames in hparams.yaml)."
              .format(wav_filename, n_frames, hparams.max_acoustic_length))
        return None

    # Align features
    desired_frames = int(min(n_samples / hparams.hop_size, n_frames))
    wav = wav[:desired_frames * hparams.hop_size]
    mel_spectrogram = mel_spectrogram[:, :desired_frames]
    n_samples = wav.shape[0]
    n_frames = mel_spectrogram.shape[1]
    assert n_samples / hparams.hop_size == n_frames

    # Save intermediate acoustic features
    mel_filename = os.path.join(out_dir, key + '.npy')
    np.save(mel_filename, mel_spectrogram.T, allow_pickle=False)
    audio.save_wav(wav, out_wav_path, hparams)

    return (wav_filename, mel_filename, n_samples, n_frames)
def infer(model, src_pth):
    src = load_wav(src_pth, seg=False)
    mel = melspectrogram(src).astype(np.float32)
    mel = mode(torch.Tensor([mel]))
    with torch.no_grad():
        res = model.infer(mel)[0]
    return [src, to_arr(res)]
def wav2spec(self, wav_path):
    wav = audio.load_wav(wav_path)
    spec = audio.melspectrogram(wav).astype(np.float32)
    spec = spec.transpose()
    feat_size = spec.shape[1]
    pad_spec = np.zeros(
        [(len(spec) + self.outputs_per_step - 1) // self.outputs_per_step * self.outputs_per_step,
         feat_size],
        dtype='float32')
    pad_spec[:len(spec)] = spec
    return pad_spec.reshape([-1, self.outputs_per_step * feat_size])
def _process_wav(wav_path, audio_path, spc_path):
    wav = audio.load_wav(wav_path)
    wav1, wav2, wav3, wav4 = audio.subband(wav)
    if hparams.feature_type == 'mcc':
        # Extract mcc and f0
        spc = audio.extract_mcc(wav)
    else:
        # Extract mels
        spc = audio.melspectrogram(wav).astype(np.float32)

    # Align audios and mels
    hop_length = int(hparams.frame_shift_ms / 4000 * hparams.sample_rate)
    length_diff_1 = len(spc) * hop_length - len(wav1)
    length_diff_2 = len(spc) * hop_length - len(wav2)
    length_diff_3 = len(spc) * hop_length - len(wav3)
    length_diff_4 = len(spc) * hop_length - len(wav4)

    wav1 = wav1.reshape(-1, 1)
    if length_diff_1 > 0:
        wav1 = np.pad(wav1, [[0, length_diff_1], [0, 0]], 'constant')
    elif length_diff_1 < 0:
        wav1 = wav1[:hop_length * spc.shape[0]]

    wav2 = wav2.reshape(-1, 1)
    if length_diff_2 > 0:
        wav2 = np.pad(wav2, [[0, length_diff_2], [0, 0]], 'constant')
    elif length_diff_2 < 0:
        wav2 = wav2[:hop_length * spc.shape[0]]

    wav3 = wav3.reshape(-1, 1)
    if length_diff_3 > 0:
        # Pad wav3 itself (the original code padded wav1 here, which was a copy-paste bug)
        wav3 = np.pad(wav3, [[0, length_diff_3], [0, 0]], 'constant')
    elif length_diff_3 < 0:
        wav3 = wav3[:hop_length * spc.shape[0]]

    wav4 = wav4.reshape(-1, 1)
    if length_diff_4 > 0:
        wav4 = np.pad(wav4, [[0, length_diff_4], [0, 0]], 'constant')
    elif length_diff_4 < 0:
        wav4 = wav4[:hop_length * spc.shape[0]]

    fid1 = os.path.basename(audio_path).replace('.npy', '_band1.npy')
    fid2 = os.path.basename(audio_path).replace('.npy', '_band2.npy')
    fid3 = os.path.basename(audio_path).replace('.npy', '_band3.npy')
    fid4 = os.path.basename(audio_path).replace('.npy', '_band4.npy')
    fid1 = os.path.join('training_data/audios', fid1)
    fid2 = os.path.join('training_data/audios', fid2)
    fid3 = os.path.join('training_data/audios', fid3)
    fid4 = os.path.join('training_data/audios', fid4)
    np.save(fid1, wav1)
    np.save(fid2, wav2)
    np.save(fid3, wav3)
    np.save(fid4, wav4)
    np.save(spc_path, spc)
    return (fid1, fid2, fid3, fid4, spc_path, spc.shape[0])
def __getitem__(self, index):
    if hps.prep:
        wav, mel = self.f_list[index]
        seg_ml = hps.seg_l // hps.frame_shift + 1
        ms = np.random.randint(0, mel.shape[1] - seg_ml) if mel.shape[1] > seg_ml else 0
        ws = hps.frame_shift * ms
        wav = wav[ws:ws + hps.seg_l]
        mel = mel[:, ms:ms + seg_ml]
    else:
        wav = load_wav(self.f_list[index])
        mel = melspectrogram(wav).astype(np.float32)
    return wav, mel
def infer(wav_path, text, model):
    sequence = text_to_sequence(text, hps.text_cleaners)
    sequence = to_var(torch.IntTensor(sequence)[None, :]).long()
    mel = melspectrogram(load_wav(wav_path))
    r = mel.shape[1] % hps.n_frames_per_step
    mel_in = to_var(torch.Tensor([mel[:, :-r]]))
    if mel_in.shape[2] < 1:
        return None
    sequence = torch.cat([sequence, sequence], 0)
    mel_in = torch.cat([mel_in, mel_in], 0)
    _, mel_outputs_postnet, _, _ = model.teacher_infer(sequence, mel_in)
    ret = mel
    ret[:, :-r] = to_arr(mel_outputs_postnet[0])
    return ret
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple
    to write to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames.
        wav_path: Path to the audio file containing the speech input
        text: The text spoken in the input audio file

    Returns:
        A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
def files_to_list(fdir):
    f_list = []
    with open(os.path.join(fdir, 'metadata.csv'), encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('|')
            wav_path = os.path.join(fdir, 'wavs', '%s.wav' % parts[0])
            if hps.prep:
                wav = load_wav(wav_path, False)
                if wav.shape[0] < hps.seg_l:
                    wav = np.pad(wav, (0, hps.seg_l - wav.shape[0]),
                                 'constant', constant_values=(0, 0))
                mel = melspectrogram(wav).astype(np.float32)
                f_list.append([wav, mel])
            else:
                f_list.append(wav_path)
    if hps.prep and hps.pth is not None:
        with open(hps.pth, 'wb') as w:
            pickle.dump(f_list, w)
    return f_list
def _process_wav(wav_path, audio_path, spc_path):
    wav = audio.load_wav(wav_path)
    if hparams.feature_type == 'mcc':
        # Extract mcc and f0
        spc = audio.extract_mcc(wav)
    else:
        # Extract mels
        spc = audio.melspectrogram(wav).astype(np.float32)

    # Align audios and mels
    hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
    length_diff = len(spc) * hop_length - len(wav)
    wav = wav.reshape(-1, 1)
    if length_diff > 0:
        wav = np.pad(wav, [[0, length_diff], [0, 0]], 'constant')
    elif length_diff < 0:
        wav = wav[:hop_length * spc.shape[0]]
    np.save(audio_path, wav)
    np.save(spc_path, spc)
    return (audio_path, spc_path, spc.shape[0])
def _process_utterance(out_dir, index, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'meta_spec_%05d.npy' % index
    mel_filename = 'meta_mel_%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
def main():
    with tf.device('/cpu:0'):  # CPU is faster here; setting GPU raises an error, and omitting tf.device is slower.
        config = get_arguments()
        started_datestring = "{0:%Y-%m-%dT%H-%M-%S}".format(datetime.now())
        logdir = os.path.join(config.logdir, 'generate', started_datestring)
        print('logdir0-------------' + logdir)
        if not os.path.exists(logdir):
            os.makedirs(logdir)

        load_hparams(hparams, config.checkpoint_dir)

        sess = tf.Session()
        scalar_input = hparams.scalar_input
        net = WaveNetModel(
            batch_size=config.batch_size,
            dilations=hparams.dilations,
            filter_width=hparams.filter_width,
            residual_channels=hparams.residual_channels,
            dilation_channels=hparams.dilation_channels,
            quantization_channels=hparams.quantization_channels,
            out_channels=hparams.out_channels,
            skip_channels=hparams.skip_channels,
            use_biases=hparams.use_biases,
            scalar_input=hparams.scalar_input,
            global_condition_channels=hparams.gc_channels,
            global_condition_cardinality=config.gc_cardinality,
            local_condition_channels=hparams.num_mels,
            upsample_factor=hparams.upsample_factor,
            legacy=hparams.legacy,
            residual_legacy=hparams.residual_legacy,
            train_mode=False
        )  # During training, global_condition_cardinality is inferred by the AudioReader; here it must be passed in.

        if scalar_input:
            samples = tf.placeholder(tf.float32, shape=[net.batch_size, None])
        else:
            samples = tf.placeholder(
                tf.int32, shape=[net.batch_size, None]
            )  # samples: mu-law encoded values, before one-hot conversion. Shape (batch_size, length)

        # The local condition would be (N, T, num_mels), but only one step is fed at a time,
        # so it is (N, 1, num_mels); squeezed, that is (N, num_mels).
        upsampled_local_condition = tf.placeholder(
            tf.float32, shape=[net.batch_size, hparams.num_mels])

        next_sample = net.predict_proba_incremental(
            samples, upsampled_local_condition, [config.gc_id] * net.batch_size
        )  # Fast WaveNet Generation Algorithm (arXiv:1611.09482)

        # Build the upsampled local condition data to feed into the upsampled_local_condition placeholder.
        print('logdir0-------------' + logdir)
        mel_input = np.load(config.mel)
        sample_size = mel_input.shape[0] * hparams.hop_size
        mel_input = np.tile(mel_input, (config.batch_size, 1, 1))
        with tf.variable_scope('wavenet', reuse=tf.AUTO_REUSE):
            upsampled_local_condition_data = net.create_upsample(
                mel_input, upsample_type=hparams.upsample_type)

        var_list = [var for var in tf.global_variables() if 'queue' not in var.name]
        saver = tf.train.Saver(var_list)
        print('Restoring model from {}'.format(config.checkpoint_dir))

        load(saver, sess, config.checkpoint_dir)
        init_op = tf.group(tf.initialize_all_variables(), net.queue_initializer)

        sess.run(init_op)  # Without this, the variables keep the values restored from the checkpoint.

        quantization_channels = hparams.quantization_channels
        if config.wav_seed:
            # If the wav seed is shorter than the receptive field, shouldn't it at least be padded?
            # As is, a short seed is returned as-is, so a too-short seed causes an error.
            seed = create_seed(config.wav_seed, hparams.sample_rate,
                               quantization_channels, net.receptive_field,
                               scalar_input)  # mu-law encoded
            if scalar_input:
                waveform = seed.tolist()
            else:
                waveform = sess.run(seed).tolist()  # [116, 114, 120, 121, 127, ...]

            print('Priming generation...')
            for i, x in enumerate(waveform[-net.receptive_field:-1]):
                # The very last sample is fed in the first iteration of the loop below.
                if i % 100 == 0:
                    print('Priming sample {}/{}'.format(i, net.receptive_field), end='\r')
                sess.run(next_sample,
                         feed_dict={
                             samples: np.array([x] * net.batch_size).reshape(net.batch_size, 1),
                             upsampled_local_condition: np.zeros([net.batch_size, hparams.num_mels])
                         })
            print('Done.')
            waveform = np.array([waveform[-net.receptive_field:]] * net.batch_size)
        else:
            # Silence with a single random sample at the end.
            if scalar_input:
                waveform = [0.0] * (net.receptive_field - 1)
                waveform = np.array(waveform * net.batch_size).reshape(net.batch_size, -1)
                waveform = np.concatenate(
                    [waveform,
                     2 * np.random.rand(net.batch_size).reshape(net.batch_size, -1) - 1],
                    axis=-1)  # Append a random number in [-1, 1] at the end. waveform: shape (batch_size, net.receptive_field)
            else:
                # Build receptive_field - 1 samples, then append one random sample below.
                waveform = [quantization_channels / 2] * (net.receptive_field - 1)
                waveform = np.array(waveform * net.batch_size).reshape(net.batch_size, -1)
                waveform = np.concatenate(
                    [waveform,
                     np.random.randint(quantization_channels,
                                       size=net.batch_size).reshape(net.batch_size, -1)],
                    axis=-1)  # Before one-hot conversion. (batch_size, 5117)

        start_time = time.time()
        upsampled_local_condition_data = sess.run(upsampled_local_condition_data)
        last_sample_timestamp = datetime.now()
        for step in range(sample_size):  # Loop sample_size times to generate the desired length.
            window = waveform[:, -1:]  # Feed only the last sample into samples. window: shape (N, 1)

            # Run the WaveNet to predict the next sample.
            # Without fast generation, window would be [128.0, 128.0, ..., 128.0, 178, 185];
            # with fast generation, window is a single value.
            prediction = sess.run(
                next_sample,
                feed_dict={
                    samples: window,
                    upsampled_local_condition: upsampled_local_condition_data[:, step, :]
                })  # samples is mu-law encoded; it is converted to one-hot internally. --> (batch_size, 256)

            if scalar_input:
                sample = prediction  # Sampled from a logistic distribution, so there is randomness.
            else:
                # Scale prediction distribution using temperature.
                # If config.temperature == 1, this just renormalizes by the sum; softmax has already
                # been applied, so the sum is 1 and nothing changes.
                # If config.temperature != 1, the log of each element is divided by the temperature
                # and the result is rescaled so that it sums to 1.
                np.seterr(divide='ignore')
                scaled_prediction = np.log(prediction) / config.temperature  # No change when temperature is 1.
                scaled_prediction = (scaled_prediction -
                                     np.logaddexp.reduce(scaled_prediction, axis=-1, keepdims=True)
                                     )  # np.log(np.sum(np.exp(scaled_prediction)))
                scaled_prediction = np.exp(scaled_prediction)
                np.seterr(divide='warn')

                # Prediction distribution at temperature=1.0 should be unchanged after scaling.
                if config.temperature == 1.0:
                    np.testing.assert_allclose(
                        prediction, scaled_prediction, atol=1e-5,
                        err_msg='Prediction scaling at temperature=1.0 is not working as intended.')

                # Because we sample instead of taking the argmax, the same input can yield different outputs.
                sample = [[np.random.choice(np.arange(quantization_channels), p=p)]
                          for p in scaled_prediction]  # choose one sample per batch

            waveform = np.concatenate([waveform, sample], axis=-1)  # window.shape: (N, 1)

            # Show progress only once per second.
            current_sample_timestamp = datetime.now()
            time_since_print = current_sample_timestamp - last_sample_timestamp
            if time_since_print.total_seconds() > 1.:
                duration = time.time() - start_time
                print('Sample {:3<d}/{:3<d}, ({:.3f} sec/step)'.format(step + 1, sample_size, duration),
                      end='\r')
                last_sample_timestamp = current_sample_timestamp

        # Introduce a newline to clear the carriage return from the progress.
        print()

        # Save the result as a wav file.
        if hparams.input_type == 'raw':
            out = waveform[:, net.receptive_field:]
        elif hparams.input_type == 'mulaw':
            decode = mu_law_decode(samples, quantization_channels, quantization=False)
            out = sess.run(decode, feed_dict={samples: waveform[:, net.receptive_field:]})
        else:  # 'mulaw-quantize'
            decode = mu_law_decode(samples, quantization_channels, quantization=True)
            out = sess.run(decode, feed_dict={samples: waveform[:, net.receptive_field:]})

        # save wav
        for i in range(net.batch_size):
            config.wav_out_path = logdir + '/test-{}.wav'.format(i)
            mel_path = config.wav_out_path.replace(".wav", ".png")

            gen_mel_spectrogram = audio.melspectrogram(out[i], hparams).astype(np.float32).T
            audio.save_wav(out[i], config.wav_out_path, hparams.sample_rate)  # out[i] is modified inside save_wav.

            plot.plot_spectrogram(gen_mel_spectrogram, mel_path,
                                  title='generated mel spectrogram',
                                  target_spectrogram=mel_input[i])
        print('Finished generating.')
def extract_audio_mels(audio_path):
    wav = audio.load_wav(audio_path)
    mels = audio.melspectrogram(wav)
    return mels
def get_mel(self, filename):
    wav = load_wav(filename)
    mel = melspectrogram(wav).astype(np.float32)
    return torch.Tensor(mel)
def _process_utterance(out_dir, index, wav_path, text): # Load the audio to a numpy array: wav = audio.load_wav(wav_path) if hparams.rescaling: wav = wav / np.abs(wav).max() * hparams.rescaling_max # Mu-law quantize if is_mulaw_quantize(hparams.input_type): # [0, quantize_channels) out = P.mulaw_quantize(wav, hparams.quantize_channels) # Trim silences start, end = audio.start_and_end_indices(out, hparams.silence_threshold) wav = wav[start:end] out = out[start:end] constant_values = P.mulaw_quantize(0, hparams.quantize_channels) out_dtype = np.int16 elif is_mulaw(hparams.input_type): # [-1, 1] out = P.mulaw(wav, hparams.quantize_channels) constant_values = P.mulaw(0.0, hparams.quantize_channels) out_dtype = np.float32 else: # [-1, 1] out = wav constant_values = 0.0 out_dtype = np.float32 # Compute a mel-scale spectrogram from the trimmed wav: # (N, D) mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T # lws pads zeros internally before performing stft # this is needed to adjust time resolution between audio and mel-spectrogram l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size()) # zero pad for quantized signal out = np.pad(out, (l, r), mode="constant", constant_values=constant_values) N = mel_spectrogram.shape[0] assert len(out) >= N * audio.get_hop_size() # time resolution adjustment # ensure length of raw audio is multiple of hop_size so that we can use # transposed convolution to upsample out = out[:N * audio.get_hop_size()] assert len(out) % audio.get_hop_size() == 0 timesteps = len(out) # Write the spectrograms to disk: audio_filename = 'bznsyp-audio-%05d.npy' % index mel_filename = 'bznsyp-mel-%05d.npy' % index np.save(os.path.join(out_dir, audio_filename), out.astype(out_dtype), allow_pickle=False) np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.astype(np.float32), allow_pickle=False) # Return a tuple describing this training example: return (audio_filename, mel_filename, timesteps, text)
def _process_utterance(out_dir, index, audio_filepath, text): # Load the audio to a numpy array: wav_whole = audio.load_wav(audio_filepath) if hparams.rescaling: wav_whole = wav_whole / np.abs(wav_whole).max() * hparams.rescaling_max # This is a librivox source, so the audio files are going to be v. long # compared to a typical 'utterance' : So split the wav into chunks tup_results = [] n_samples = int(8.0 * hparams.sample_rate) # All 8 second utterances n_chunks = wav_whole.shape[0] // n_samples for chunk_idx in range(n_chunks): chunk_start, chunk_end = chunk_idx * n_samples, (chunk_idx + 1) * n_samples if chunk_idx == n_chunks - 1: # This is the last chunk - allow it to extend to the end of the file chunk_end = None wav = wav_whole[chunk_start: chunk_end] # Mu-law quantize if is_mulaw_quantize(hparams.input_type): # [0, quantize_channels) out = P.mulaw_quantize(wav, hparams.quantize_channels) # Trim silences start, end = audio.start_and_end_indices(out, hparams.silence_threshold) wav = wav[start:end] out = out[start:end] constant_values = P.mulaw_quantize(0, hparams.quantize_channels) out_dtype = np.int16 elif is_mulaw(hparams.input_type): # [-1, 1] out = P.mulaw(wav, hparams.quantize_channels) constant_values = P.mulaw(0.0, hparams.quantize_channels) out_dtype = np.float32 else: # [-1, 1] out = wav constant_values = 0.0 out_dtype = np.float32 # Compute a mel-scale spectrogram from the trimmed wav: # (N, D) mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T # lws pads zeros internally before performing stft # this is needed to adjust time resolution between audio and mel-spectrogram l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size()) # zero pad for quantized signal out = np.pad(out, (l, r), mode="constant", constant_values=constant_values) N = mel_spectrogram.shape[0] assert len(out) >= N * audio.get_hop_size() # time resolution adjustment # ensure length of raw audio is multiple of hop_size so that we can use # transposed convolution to upsample out = out[:N * audio.get_hop_size()] assert len(out) % audio.get_hop_size() == 0 timesteps = len(out) # Write the spectrograms to disk: audio_filename = 'librivox-audio-%04d-%05d.npy' % (index, chunk_idx,) mel_filename = 'librivox-mel-%04d-%05d.npy' % (index, chunk_idx,) text_idx = '%s - %05d' % (text, chunk_idx,) np.save(os.path.join(out_dir, audio_filename), out.astype(out_dtype), allow_pickle=False) np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.astype(np.float32), allow_pickle=False) # Add results tuple describing this training example: tup_results.append((audio_filename, mel_filename, timesteps, text_idx)) # Return all the audio results tuples (unpack in caller) return tup_results
def _process_utterance(out_dir, index, speaker_id, wav_path, text): sr = hparams.sample_rate # Load the audio to a numpy array. Resampled if needed wav = audio.load_wav(wav_path) lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab") # Trim silence from hts labels if available # TODO if exists(lab_path) and False: labels = hts.load(lab_path) b = int(start_at(labels) * 1e-7 * sr) e = int(end_at(labels) * 1e-7 * sr) wav = wav[b:e] wav, _ = librosa.effects.trim(wav, top_db=20) else: wav, _ = librosa.effects.trim(wav, top_db=20) if hparams.rescaling: wav = wav / np.abs(wav).max() * hparams.rescaling_max # Mu-law quantize if is_mulaw_quantize(hparams.input_type): # [0, quantize_channels) out = P.mulaw_quantize(wav, hparams.quantize_channels) # Trim silences start, end = audio.start_and_end_indices(out, hparams.silence_threshold) wav = wav[start:end] out = out[start:end] constant_values = P.mulaw_quantize(0, hparams.quantize_channels) out_dtype = np.int16 elif is_mulaw(hparams.input_type): # [-1, 1] out = P.mulaw(wav, hparams.quantize_channels) constant_values = P.mulaw(0.0, hparams.quantize_channels) out_dtype = np.float32 else: # [-1, 1] out = wav constant_values = 0.0 out_dtype = np.float32 # Compute a mel-scale spectrogram from the trimmed wav: # (N, D) mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T # lws pads zeros internally before performing stft # this is needed to adjust time resolution between audio and mel-spectrogram l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size()) # zero pad for quantized signal out = np.pad(out, (l, r), mode="constant", constant_values=constant_values) N = mel_spectrogram.shape[0] assert len(out) >= N * audio.get_hop_size() # time resolution adjustment # ensure length of raw audio is multiple of hop_size so that we can use # transposed convolution to upsample out = out[:N * audio.get_hop_size()] assert len(out) % audio.get_hop_size() == 0 timesteps = len(out) # Write the spectrograms to disk: audio_filename = 'cmu_arctic-audio-%05d.npy' % index mel_filename = 'cmu_arctic-mel-%05d.npy' % index np.save(os.path.join(out_dir, audio_filename), out.astype(out_dtype), allow_pickle=False) np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.astype(np.float32), allow_pickle=False) # Return a tuple describing this training example: return (audio_filename, mel_filename, timesteps, text, speaker_id)
def get_mel(wav_path):
    wav = load_wav(wav_path)
    return torch.Tensor(melspectrogram(wav).astype(np.float32))
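# A minimal sketch of how a helper like get_mel above is typically consumed from a
# PyTorch Dataset. The class name and the plain list of wav paths are assumptions
# for illustration, not part of the original code.
from torch.utils.data import Dataset


class MelDataset(Dataset):
    """Hypothetical dataset that pairs each wav file with its mel spectrogram."""

    def __init__(self, wav_paths):
        self.wav_paths = wav_paths

    def __len__(self):
        return len(self.wav_paths)

    def __getitem__(self, index):
        # get_mel returns a FloatTensor of shape (num_mels, n_frames)
        return get_mel(self.wav_paths[index])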
def _process_utterance(out_dir, index, wav_path, text, silence_threshold, fft_size): '''Preprocesses a single utterance audio/text pair. This writes the mel and linear scale spectrograms to disk and returns a tuple to write to the train.txt file. Args: out_dir: The directory to write the spectrograms into index: The numeric index to use in the spectrogram filenames. wav_path: Path to the audio file containing the speech input text: The text spoken in the input audio file Returns: A (spectrogram_filename, mel_filename, text, mel_len) tuple to write to train.txt ''' # Load the audio to a numpy array: wav = audio.load_wav(wav_path) if hp.rescaling: wav = wav / np.abs(wav).max() * hp.rescaling_max if hp.input_type != "raw": # Mu-law quantize out = P.mulaw_quantize(wav) # Trim silences start, end = audio.start_and_end_indices(out, silence_threshold) out = out[start:end] wav = wav[start:end] constant_value = P.mulaw_quantize(0, 256) out_dtype = np.int16 else: out = wav constant_value = 0. out_dtype = np.float32 # Compute a mel-scale spectrogram from the trimmed wav: # (N, D) mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T # lws pads zeros internally before performing stft # this is needed to adjust time resolution between audio and mel-spectrogram l, r = audio.lws_pad_lr(wav, fft_size, audio.get_hop_size()) # zero pad for quantized signal out = np.pad(out, (l, r), mode="constant", constant_values=constant_value) mel_len = mel_spectrogram.shape[0] assert len(out) >= mel_len * audio.get_hop_size() # time resolution adjustment # ensure length of raw audio is multiple of hop_size so that we can use # transposed convolution to upsample out = out[:mel_len * audio.get_hop_size()] assert len(out) % audio.get_hop_size() == 0 timesteps = len(out) wav_id = wav_path.split('/')[-1].split('.')[0] # Write the spectrograms to disk: audio_path = os.path.join(out_dir, '{}-audio.npy'.format(wav_id)) mel_path = os.path.join(out_dir, '{}-mel.npy'.format(wav_id)) np.save(audio_path, out.astype(out_dtype), allow_pickle=False) np.save(mel_path, mel_spectrogram.astype(np.float32), allow_pickle=False) # Return a tuple describing this training example: return os.path.abspath(audio_path), os.path.abspath( mel_path), text, timesteps
def get_mel(self, audio):
    audio_norm = audio / wavenet_utils.MAX_WAV_VALUE
    melspec = melspectrogram(audio_norm, hparams)
    melspec = melspec.transpose()
    return melspec
def eval_step(sess, logdir, step, waveform, upsampled_local_condition_data, speaker_id_data,
              mel_input_data, samples, speaker_id, upsampled_local_condition, next_sample,
              temperature=1.0):
    waveform = waveform[:, :1]

    sample_size = upsampled_local_condition_data.shape[1]
    last_sample_timestamp = datetime.now()
    start_time = time.time()
    for step2 in range(sample_size):  # Loop sample_size times to generate the desired length.
        window = waveform[:, -1:]  # Feed only the last sample into samples. window: shape (N, 1)

        prediction = sess.run(
            next_sample,
            feed_dict={
                samples: window,
                upsampled_local_condition: upsampled_local_condition_data[:, step2, :],
                speaker_id: speaker_id_data
            })

        if hparams.scalar_input:
            sample = prediction  # Sampled from a logistic distribution, so there is randomness.
        else:
            # Scale prediction distribution using temperature.
            # If temperature == 1, this just renormalizes by the sum; softmax has already been
            # applied, so the sum is 1 and nothing changes.
            # If temperature != 1, the log of each element is divided by the temperature and the
            # result is rescaled so that it sums to 1.
            np.seterr(divide='ignore')
            scaled_prediction = np.log(prediction) / temperature  # No change when temperature is 1.
            scaled_prediction = (scaled_prediction -
                                 np.logaddexp.reduce(scaled_prediction, axis=-1, keepdims=True))  # np.log(np.sum(np.exp(scaled_prediction)))
            scaled_prediction = np.exp(scaled_prediction)
            np.seterr(divide='warn')

            # Prediction distribution at temperature=1.0 should be unchanged after scaling.
            if temperature == 1.0:
                np.testing.assert_allclose(
                    prediction, scaled_prediction, atol=1e-5,
                    err_msg='Prediction scaling at temperature=1.0 is not working as intended.')

            # Because we sample instead of taking the argmax, the same input can yield different outputs.
            sample = [[np.random.choice(np.arange(hparams.quantization_channels), p=p)]
                      for p in scaled_prediction]  # choose one sample per batch

        waveform = np.concatenate([waveform, sample], axis=-1)  # window.shape: (N, 1)

        # Show progress only once per second.
        current_sample_timestamp = datetime.now()
        time_since_print = current_sample_timestamp - last_sample_timestamp
        if time_since_print.total_seconds() > 1.:
            duration = time.time() - start_time
            print('Sample {:3<d}/{:3<d}, ({:.3f} sec/step)'.format(step2 + 1, sample_size, duration),
                  end='\r')
            last_sample_timestamp = current_sample_timestamp

    print('\n')

    # Save the result as a wav file.
    if hparams.input_type == 'raw':
        out = waveform[:, 1:]
    elif hparams.input_type == 'mulaw':
        decode = mu_law_decode(samples, hparams.quantization_channels, quantization=False)
        out = sess.run(decode, feed_dict={samples: waveform[:, 1:]})
    else:  # 'mulaw-quantize'
        decode = mu_law_decode(samples, hparams.quantization_channels, quantization=True)
        out = sess.run(decode, feed_dict={samples: waveform[:, 1:]})

    # save wav
    for i in range(1):
        wav_out_path = logdir + '/test-{}-{}.wav'.format(step, i)
        mel_path = wav_out_path.replace(".wav", ".png")
        gen_mel_spectrogram = audio.melspectrogram(out[i], hparams).astype(np.float32).T
        audio.save_wav(out[i], wav_out_path, hparams.sample_rate)  # out[i] is modified inside save_wav.
        plot.plot_spectrogram(gen_mel_spectrogram, mel_path,
                              title='generated mel spectrogram{}'.format(step),
                              target_spectrogram=mel_input_data[i])
class Synthesizer: def load(self, checkpoint_path, hparams, model_name='WaveNet'): log('Constructing model: {}'.format(model_name)) self._hparams = hparams local_cond, global_cond = self._check_conditions() self.local_conditions = tf.placeholder( tf.float32, shape=(None, None, hparams.num_mfccs), name='local_condition_features') if local_cond else None self.global_conditions = tf.placeholder( tf.int32, shape=(None, 1), name='global_condition_features') if global_cond else None self.synthesis_length = tf.placeholder( tf.int32, shape=(), name='synthesis_length') if not local_cond else None self.input_lengths = tf.placeholder( tf.int32, shape=(1, ), name='input_lengths') if hparams.wavenet_synth_debug else None self.synth_debug = hparams.wavenet_synth_debug with tf.variable_scope('WaveNet_model') as scope: self.model = create_model(model_name, hparams) self.model.initialize(y=None, c=self.local_conditions, g=self.global_conditions, input_lengths=self.input_lengths, synthesis_length=self.synthesis_length, test_inputs=None) self._hparams = hparams sh_saver = create_shadow_saver(self.model) log('Loading checkpoint: {}'.format(checkpoint_path)) #Memory allocation on the GPU as needed config = tf.ConfigProto() config.gpu_options.allow_growth = True config.allow_soft_placement = True self.session = tf.Session(config=config) self.session.run(tf.global_variables_initializer()) load_averaged_model(self.session, sh_saver, checkpoint_path) def synthesize(self, mel_spectrograms, speaker_ids, basenames, out_dir, log_dir, embed_dir, embed_only=True): hparams = self._hparams local_cond, global_cond = self._check_conditions() #Switch mels in case of debug if self.synth_debug: assert len(hparams.wavenet_debug_mels) == len( hparams.wavenet_debug_wavs) mel_spectrograms = [ np.load(mel_file) for mel_file in hparams.wavenet_debug_mels ] #Prepare local condition batch maxlen = max([len(x) for x in mel_spectrograms]) #[-max, max] or [0,max] T2_output_range = ( -self._hparams.max_abs_value, self._hparams.max_abs_value) if self._hparams.symmetric_mels else ( 0, self._hparams.max_abs_value) if self._hparams.clip_for_wavenet: mel_spectrograms = [ np.clip(x, T2_output_range[0], T2_output_range[1]) for x in mel_spectrograms ] c_batch = np.asarray(mel_spectrograms).astype(np.float32) print("c batch shape {}".format(c_batch.shape)) if self._hparams.normalize_for_wavenet: #rerange to [0, 1] c_batch = _interp(c_batch, T2_output_range).astype(np.float32) g = None if speaker_ids is None else np.asarray( speaker_ids, dtype=np.int32).reshape(len(c_batch), 1) print("g shape {}".format(g.shape)) feed_dict = {} if local_cond: feed_dict[self.local_conditions] = c_batch else: feed_dict[self.synthesis_length] = 100 if global_cond: feed_dict[self.global_conditions] = g if self.synth_debug: debug_wavs = hparams.wavenet_debug_wavs assert len(debug_wavs) % hparams.wavenet_num_gpus == 0 test_wavs = [ np.load(debug_wav).reshape(-1, 1) for debug_wav in debug_wavs ] #pad wavs to same length max_test_len = max([len(x) for x in test_wavs]) test_wavs = np.stack([ _pad_inputs(x, max_test_len) for x in test_wavs ]).astype(np.float32) assert len(test_wavs) == len(debug_wavs) #### GTA False feed_dict[self.input_lengths] = np.asarray([test_wavs.shape[1]]) if embed_only == False: #Generate wavs and clip extra padding to select Real speech parts #### VQVAE Out generated_wavs, upsampled_features, vq_embeddings, vq_onehot, vq_w, vq_enc_ind = self.session.run( [ self.model.tower_y_hat, self.model.tower_synth_upsampled_local_features, 
self.model.vq_embeddings, self.model.vq_onehot, self.model.vq_w, self.model.vq_enc_ind ], feed_dict=feed_dict) #Linearize outputs (n_gpus -> 1D) generated_wavs = [ wav for gpu_wavs in generated_wavs for wav in gpu_wavs ] upsampled_features = [ feat for gpu_feats in upsampled_features for feat in gpu_feats ] for i, (generated_wav, input_mel, upsampled_feature, vq_embedding) in enumerate( zip(generated_wavs, mel_spectrograms, upsampled_features, vq_embeddings)): #Save wav to disk audio_filename = os.path.join( out_dir, 'wavenet-audio-{}.wav'.format(basenames[i])) save_wavenet_wav(generated_wav, audio_filename, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis) #### Vq embedding save (shape [batch_size, num_frames, embed_dim]) embed_filename = os.path.join(embed_dir, 'emb-{}.npy'.format(basenames[i])) np.save(embed_filename, vq_embedding) onehot_filename = os.path.join( embed_dir, 'onehot-{}.npy'.format(basenames[i])) np.save(onehot_filename, vq_onehot) wmatrix_filename = os.path.join( embed_dir, 'wmatrix-{}.npy'.format(basenames[i])) np.save(wmatrix_filename, vq_w) idx_filename = os.path.join(embed_dir, 'idx-{}.npy'.format(basenames[i])) np.save(idx_filename, vq_enc_ind) #Compare generated wav mel with original input mel to evaluate wavenet audio reconstruction performance #Both mels should match on low frequency information, wavenet mel should contain more high frequency detail when compared to Tacotron mels. generated_mel = melspectrogram(generated_wav, hparams).T util.plot_spectrogram( generated_mel, os.path.join( log_dir, 'wavenet-mel-spectrogram-{}.png'.format(basenames[i])), title= 'Local Condition vs Reconstructed Audio Mel-Spectrogram analysis', target_spectrogram=input_mel) #Save upsampled features to visualize checkerboard artifacts. util.plot_spectrogram( upsampled_feature.T, os.path.join( log_dir, 'wavenet-upsampled_features-{}.png'.format(basenames[i])), title='Upmsampled Local Condition features', auto_aspect=True) #Save waveplot to disk if log_dir is not None: plot_filename = os.path.join( log_dir, 'wavenet-waveplot-{}.png'.format(basenames[i])) util.waveplot(plot_filename, generated_wav, None, hparams, title='WaveNet generated Waveform.') else: #Generate wavs and clip extra padding to select Real speech parts #### VQVAE Out vq_embeddings, vq_onehot, vq_w, vq_enc_ind = self.session.run( [ self.model.vq_embeddings, self.model.vq_onehot, self.model.vq_w, self.model.vq_enc_ind ], feed_dict=feed_dict) for i, vq_embedding in enumerate(vq_embeddings): #### Vq embedding save (shape [batch_size, num_frames, embed_dim]) embed_filename = os.path.join(embed_dir, 'emb-{}.npy'.format(basenames[i])) np.save(embed_filename, vq_embedding) onehot_filename = os.path.join( embed_dir, 'onehot-{}.npy'.format(basenames[i])) np.save(onehot_filename, vq_onehot) wmatrix_filename = os.path.join( embed_dir, 'wmatrix-{}.npy'.format(basenames[i])) np.save(wmatrix_filename, vq_w) idx_filename = os.path.join(embed_dir, 'idx-{}.npy'.format(basenames[i])) np.save(idx_filename, vq_enc_ind) def _check_conditions(self): local_condition = self._hparams.cin_channels > 0 global_condition = self._hparams.gin_channels > 0 return local_condition, global_condition
def _process_utterance(out_dir, index, wav_path, text, hparams):
    """Preprocesses a single utterance wav/text pair.

    This writes the mel and linear spectrograms to disk and returns a tuple to
    write to the train.txt file.

    Args:
        - out_dir: the directory to write the preprocessed features into
        - index: the numeric index to use in the output filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (npz_filename, time_steps, padded_target_length, text)
    """
    try:
        # Load the audio as a numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # Trim lead/trail silences
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Pre-emphasize
    preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize)

    # Rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
        preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max

    # Assert all audio is in [-1, 1]
    if (wav > 1.).any() or (wav < -1.).any():
        raise RuntimeError('wav has invalid value: {}'.format(wav_path))
    if (preem_wav > 1.).any() or (preem_wav < -1.).any():
        raise RuntimeError('wav has invalid value: {}'.format(wav_path))

    # [-1, 1]
    out = wav
    constant_values = 0.

    # Compute the mel-scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]
    if (mel_frames > hparams.max_mel_frames and hparams.clip_mels_length) or (
            hparams.min_text_tokens > len(text) or hparams.min_mel_frames > mel_frames):
        return None

    # Compute the linear-scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(preem_wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.hop_size, hparams.pad_sides)

    # Pad the audio signal (just like it's done in librosa, to avoid frame inconsistency)
    out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * hparams.hop_size

    # time resolution adjustment
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * hparams.hop_size]
    assert len(out) % hparams.hop_size == 0
    time_steps = len(out)

    npz_filename = '{}.npz'.format(index)
    mel_spectrogram = mel_spectrogram.T
    linear_spectrogram = linear_spectrogram.T

    r = hparams.reduction_factor
    if hparams.symmetric_mels:
        _pad_value = -hparams.max_abs_value
    else:
        _pad_value = 0.

    target_length = len(linear_spectrogram)
    mel_spectrogram = np.pad(mel_spectrogram, [[r, r], [0, 0]],
                             "constant", constant_values=_pad_value)
    linear_spectrogram = np.pad(linear_spectrogram, [[r, r], [0, 0]],
                                "constant", constant_values=_pad_value)
    target_length = target_length + 2 * r
    padded_target_length = (target_length // r + 1) * r
    num_pad = padded_target_length - target_length
    stop_token_target = np.pad(np.zeros(padded_target_length - 1, dtype=np.float32), (0, 1),
                               "constant", constant_values=1)
    mel_spectrogram = np.pad(mel_spectrogram, ((0, num_pad), (0, 0)),
                             "constant", constant_values=_pad_value)
    linear_spectrogram = np.pad(linear_spectrogram, ((0, num_pad), (0, 0)),
                                "constant", constant_values=_pad_value)

    data = {
        'mel': mel_spectrogram,
        'linear': linear_spectrogram,
        'input_data': text_to_sequence(text),  # eos(~)
        'time_steps': time_steps,
        'stop_token_target': stop_token_target,
        'mel_frames': padded_target_length,
        'text': text,
    }
    np.savez(os.path.join(out_dir, npz_filename), **data, allow_pickle=False)

    # Return a tuple describing this training example
    return npz_filename, time_steps, padded_target_length, text
def get_mel(filename):
    wav = load_wav(filename)
    mel = melspectrogram(wav).astype(np.float32)
    return mel
ctr = 0
for line in f:
    if len(line) > 2:
        ctr += 1
        line = line.split('\n')[0]
        fname = line.split()[0]
        phones = ' '.join(k for k in line.split()[1:])
        if generate_feats_flag:
            wav_fname = wav_dir + '/' + fname + '.wav'
            wav = audio.load_wav(wav_fname)
            max_samples = _max_out_length * 5 / 1000 * 16000
            spectrogram = audio.spectrogram(wav).astype(np.float32)
            n_frames = spectrogram.shape[1]
            mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
            lspec_fname = lspec_dir + '/' + fname + '_lspec.npy'
            mspec_fname = mspec_dir + '/' + fname + '_mspec.npy'
            np.save(lspec_fname, spectrogram.T, allow_pickle=False)
            np.save(mspec_fname, mel_spectrogram.T, allow_pickle=False)
            g = open(data_file, 'a')
            g.write(lspec_fname + '|' + mspec_fname + '|' + str(n_frames) + '| ' + phones + '\n')
            g.close()
            g = open(feats_dir + '/' + fname + '.feats', 'w')
            for phone in phones.split():
                g.write(phone + '\n')
            g.close()
        if ctr % 100 == 1:
def _process_utterance(out_dir, wav_path, text, hparams):
    """Preprocesses a single utterance wav/text pair.

    This writes the mel and linear spectrograms (or a combined .npz file) to disk
    and returns a tuple to write to the train.txt file.

    Args:
        - out_dir: the directory to write the preprocessed features into
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text, npz_filename)
    """
    try:
        # Load the audio as a 1-D numpy array (a floating point time series).
        # Audio is automatically resampled to the given rate (default sr=22050);
        # to preserve the native sampling rate of the file, use sr=None.
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # Rescale wav.
    # We rescale because Wavenet training assumes wavs are in [-1, 1] when computing the
    # mixture loss. This mainly comes from the PixelCNN implementation.
    # https://github.com/Rayhane-mamah/Tacotron-2/issues/69
    if hparams.rescaling:  # hparams.rescale = True
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # M-AILABS extra silence specific
    if hparams.trim_silence:  # hparams.trim_silence = True
        wav = audio.trim_silence(wav, hparams)  # Trim leading and trailing silence

    # Mu-law quantize; the default input_type is 'raw'.
    # The quantization noise comes from the analog-to-digital conversion; mu-law compression
    # actually reduces the noise and increases the dynamic range.
    # If you search a little bit in the code you will find that the input is always mu-law encoded here.
    # scalar_input only determines whether the model uses a one-hot encoding for every data point
    # of the input waveform, or just floating point values for each sample.
    if hparams.input_type == 'mulaw-quantize':
        # [0, quantize_channels)
        out = audio.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif hparams.input_type == 'mulaw':
        # [-1, 1]
        out = audio.mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32
    else:  # raw
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel-scale spectrogram from the wav, e.g. shape (80, 797)
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        # hparams.max_mel_frames = 1000, hparams.clip_mels_length = True
        return None

    # Compute the linear-scale spectrogram from the wav, e.g. shape (1025, 797)
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:  # hparams.use_lws = False
        # Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.fft_size if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        # Zero pad audio signal
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.fft_size, audio.get_hop_size(hparams))  # 1024 == 2048//2 == fft_size//2

        # Reflect pad audio signal (just like it's done in librosa, to avoid frame inconsistency)
        out = np.pad(out, pad, mode='reflect')

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustment
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]  # e.g. trimmed to 240300 = 801 * 300
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    wav_id = os.path.splitext(os.path.basename(wav_path))[0]  # file name without the extension

    # Write the spectrograms to disk:
    audio_filename = '{}-audio.npy'.format(wav_id)
    mel_filename = '{}-mel.npy'.format(wav_id)
    linear_filename = '{}-linear.npy'.format(wav_id)
    npz_filename = '{}.npz'.format(wav_id)
    npz_flag = True
    if npz_flag:
        # Use the same keys as the Tacotron code.
        data = {
            'audio': out.astype(out_dtype),
            'mel': mel_spectrogram.T,
            'linear': linear_spectrogram.T,
            'time_steps': time_steps,
            'mel_frames': mel_frames,
            'text': text,
            'tokens': text_to_sequence(text),  # a trailing "1" for eos(~) is appended
            'loss_coeff': 1  # For Tacotron
        }
        # Save several arrays into a single uncompressed *.npz file.
        np.savez(os.path.join(out_dir, npz_filename), **data, allow_pickle=False)
    else:
        np.save(os.path.join(out_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
        np.save(os.path.join(out_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text, npz_filename)
batch = phonemizer.encode(batch, njobs=args.NJOBS, clean=False)
phonemes.extend(batch)

audio_data = np.concatenate([np.array(audio_data), np.expand_dims(phonemes, axis=1)], axis=1)
if args.CACHE_PHON:
    np.save(phon_path, audio_data, allow_pickle=True)

print('\nBuilding dataset and writing files')
np.random.seed(42)
np.random.shuffle(audio_data)
test_metafile = os.path.join(args.TARGET_DIR, 'test_metafile.txt')
train_metafile = os.path.join(args.TARGET_DIR, 'train_metafile.txt')
test_lines = [''.join([filename, '|', text, '|', phon, '\n'])
              for filename, text, phon in audio_data[:config['n_test']]]
train_lines = [''.join([filename, '|', text, '|', phon, '\n'])
               for filename, text, phon in audio_data[config['n_test']:-1]]

with open(test_metafile, 'w+', encoding='utf-8') as test_f:
    test_f.writelines(test_lines)
with open(train_metafile, 'w+', encoding='utf-8') as train_f:
    train_f.writelines(train_lines)

for i in tqdm.tqdm(range(len(audio_data))):
    filename, _, _ = audio_data[i]
    wav_path = os.path.join(args.WAV_DIR, filename + '.wav')
    y, sr = librosa.load(wav_path, sr=config['sampling_rate'])
    mel = melspectrogram(y, config)
    mel_path = os.path.join(mel_dir, filename)
    np.save(mel_path, mel.T)

print('\nDone')
def _process_utterance(out_dir, wav_path, text, hparams):
    """Preprocesses a single utterance wav/text pair.

    This writes the mel and linear spectrograms (or a combined .npz file) to disk
    and returns a tuple to write to the train.txt file.

    Args:
        - out_dir: the directory to write the preprocessed features into
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text, npz_filename)
    """
    try:
        # Load the audio as a numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # rescale wav
    if hparams.rescaling:  # hparams.rescale = True
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # M-AILABS extra silence specific
    if hparams.trim_silence:  # hparams.trim_silence = True
        wav = audio.trim_silence(wav, hparams)  # Trim leading and trailing silence

    # Mu-law quantize; the default input_type is 'raw'.
    if hparams.input_type == 'mulaw-quantize':
        # [0, quantize_channels)
        out = audio.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif hparams.input_type == 'mulaw':
        # [-1, 1]
        out = audio.mulaw(wav, hparams.quantize_channels)
        constant_values = audio.mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32
    else:  # raw
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel-scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        # hparams.max_mel_frames = 1000, hparams.clip_mels_length = True
        return None

    # Compute the linear-scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:  # hparams.use_lws = False
        # Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.fft_size if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        # Zero pad audio signal
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.fft_size, audio.get_hop_size(hparams))

        # Reflect pad audio signal (just like it's done in librosa, to avoid frame inconsistency)
        out = np.pad(out, pad, mode='reflect')

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustment
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    wav_id = os.path.splitext(os.path.basename(wav_path))[0]

    # Write the spectrograms to disk:
    audio_filename = '{}-audio.npy'.format(wav_id)
    mel_filename = '{}-mel.npy'.format(wav_id)
    linear_filename = '{}-linear.npy'.format(wav_id)
    npz_filename = '{}.npz'.format(wav_id)
    npz_flag = True
    if npz_flag:
        # Use the same keys as the Tacotron code.
        data = {
            'audio': out.astype(out_dtype),
            'mel': mel_spectrogram.T,
            'linear': linear_spectrogram.T,
            'time_steps': time_steps,
            'mel_frames': mel_frames,
            'text': text,
            'tokens': text_to_sequence(text),  # a trailing "1" for eos(~) is appended
            'loss_coeff': 1  # For Tacotron
        }
        np.savez(os.path.join(out_dir, npz_filename), **data, allow_pickle=False)
    else:
        np.save(os.path.join(out_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
        np.save(os.path.join(out_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text, npz_filename)
from utils.audio import melspectrogram, inv_mel_spectrogram, load_wav, save_wav

wav_path = "LJ001-0008.wav"
raw_wav = load_wav(wav_path)
mel_spec = melspectrogram(raw_wav)
inv_wav = inv_mel_spectrogram(mel_spec)
save_wav(inv_wav, "inv.wav")