def _process_utterance(out_dir, index, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft;
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    # Write the spectrograms to disk:
    audio_filename = 'ljspeech-audio-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    # np.save(os.path.join(out_dir, audio_filename),
    #         out.astype(out_dtype), allow_pickle=False)
    # np.save(os.path.join(out_dir, mel_filename),
    #         mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text)
def _process_utterance(out_dir, wav_path):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    return mel_spectrogram.astype(np.float32)
def _extract_mel(wav_path):
    # Load the audio to a numpy array. Resampled if needed.
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft;
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    assert len(out) // N == audio.get_hop_size()

    timesteps = len(out)
    return out, mel_spectrogram, timesteps, out_dtype
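# Illustrative check (not part of the original code) of the alignment
# invariant _extract_mel enforces: after padding and trimming, the waveform
# length is exactly N * hop_size, which is what lets a transposed convolution
# upsample frame-level features to sample rate. The numbers here are made up.
import numpy as np

hop_size = 256
N = 40                              # number of mel frames
out = np.zeros(N * hop_size + 123)  # padded signal, slightly too long
out = out[:N * hop_size]            # the same trim as above
assert len(out) % hop_size == 0
assert len(out) // N == hop_size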
def test_mulaw():
    # Check corner cases
    assert P.mulaw_quantize(-1.0, 2) == 0
    assert P.mulaw_quantize(-0.5, 2) == 0
    assert P.mulaw_quantize(-0.001, 2) == 0
    assert P.mulaw_quantize(0.0, 2) == 1
    assert P.mulaw_quantize(0.0001, 2) == 1
    assert P.mulaw_quantize(0.5, 2) == 1
    assert P.mulaw_quantize(0.99999, 2) == 1
    assert P.mulaw_quantize(1.0, 2) == 2

    np.random.seed(1234)

    # forward/backward correctness
    for mu in [128, 256, 512]:
        for x in np.random.rand(100):
            y = P.mulaw(x, mu)
            assert y >= 0 and y <= 1
            x_hat = P.inv_mulaw(y, mu)
            assert np.allclose(x, x_hat)

    # forward/backward correctness for quantize
    for mu in [128, 256, 512]:
        for x, y in [(-1.0, 0), (0.0, mu // 2), (0.99999, mu - 1)]:
            y_hat = P.mulaw_quantize(x, mu)
            err = np.abs(x - P.inv_mulaw_quantize(y_hat, mu))
            print(y, y_hat, err)
            assert np.allclose(y, y_hat)
            # quantization error should be small
            assert err <= 0.1

    # ndarray input
    for mu in [128, 256, 512]:
        x = np.random.rand(10)
        y = P.mulaw(x, mu)
        x_hat = P.inv_mulaw(y, mu)
        assert np.allclose(x, x_hat)
        P.inv_mulaw_quantize(P.mulaw_quantize(x))

    # torch tensor input
    from warnings import warn
    import torch
    torch.manual_seed(1234)
    for mu in [128, 256, 512]:
        x = torch.rand(10)
        y = P.mulaw(x, mu)
        x_hat = P.inv_mulaw(y, mu)
        assert np.allclose(x, x_hat)
        P.inv_mulaw_quantize(P.mulaw_quantize(x))
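# A minimal, self-contained sketch of mu-law companding, written to mirror the
# behaviour the test above exercises (this is an assumption about the library;
# nnmnkwii's actual implementation may differ in details such as rounding).
import numpy as np

MU = 255  # 256 quantization levels -> mu = channels - 1

def mulaw(x, mu=MU):
    # Compress [-1, 1]; small amplitudes get proportionally more resolution.
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)

def inv_mulaw(y, mu=MU):
    # Exact inverse of mulaw on [-1, 1].
    return np.sign(y) * ((1 + mu) ** np.abs(y) - 1) / mu

def mulaw_quantize(x, mu=MU):
    # Map the compressed signal onto integer bins [0, mu].
    return ((mulaw(x, mu) + 1) / 2 * mu).astype(np.int64)

def inv_mulaw_quantize(q, mu=MU):
    # Approximate inverse; error is bounded by the (expanded) bin width.
    return inv_mulaw(2 * q.astype(np.float64) / mu - 1, mu)

x = np.linspace(-0.9, 0.9, 7)
assert np.allclose(x, inv_mulaw(mulaw(x)))  # companding alone is lossless
# quantization error is largest near +/-1, where bins are widest after expansion
assert np.max(np.abs(x - inv_mulaw_quantize(mulaw_quantize(x)))) < 0.05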
def _process_utterance_single(out_dir, text, wav_path, hparams=hparams):
    # modified version of LJSpeech _process_utterance
    audio.set_hparams(hparams)

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    sr = hparams.sample_rate

    # Added from the multispeaker version
    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")
    if not exists(lab_path):
        lab_path = os.path.splitext(wav_path)[0] + '.lab'

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        wav = clean_by_phoneme(labels, wav, sr)
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        if hparams.process_only_htk_aligned:
            return None
        wav, _ = librosa.effects.trim(wav, top_db=15)
    # End added from the multispeaker version

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    if hparams.max_audio_length != 0 and librosa.core.get_duration(y=wav, sr=sr) > hparams.max_audio_length:
        return None
    if hparams.min_audio_length != 0 and librosa.core.get_duration(y=wav, sr=sr) < hparams.min_audio_length:
        return None

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft;
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    # Write the spectrograms to disk:
    # Get filename from wav_path
    wav_name = os.path.basename(wav_path)
    wav_name = os.path.splitext(wav_name)[0]
    out_filename = 'audio-{}.npy'.format(wav_name)
    mel_filename = 'mel-{}.npy'.format(wav_name)
    np.save(os.path.join(out_dir, out_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (out_filename, mel_filename, timesteps, text)
def _process_utterance(wav_path, out_dir):
    fname = wav_path.split(os.sep)[-1].split(".")[0]
    audio_filename = '{}_resolved.npy'.format(fname)
    mel_filename = '{}_mel.npy'.format(fname)
    apth = os.path.join(out_dir, audio_filename)
    mpth = os.path.join(out_dir, mel_filename)
    if os.path.exists(apth) and os.path.exists(mpth):
        print("File {} already processed".format(wav_path))
        return

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft;
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    # Write the spectrograms to disk:
    np.save(apth, out.astype(out_dtype), allow_pickle=False)
    np.save(mpth, mel_spectrogram.astype(np.float32), allow_pickle=False)
def _process_utterance(out_dir, wav_path, sp2ind_dir, text):
    # Map the speaker prefix of the file name to a speaker index (-1 if unknown).
    with open(sp2ind_dir, 'r') as sp_f:
        sp2ind = json.load(sp_f)
    sp = wav_path.split('/')[-1].split('.')[0].split('_')[0]
    if sp in sp2ind:
        sp_ind = sp2ind[sp]
    else:
        sp_ind = -1

    wav = audio.load_wav(wav_path)
    if 'test' not in wav_path:
        wav, _ = librosa.effects.trim(wav, top_db=60, frame_length=2048, hop_length=512)

    if hparams.highpass_cutoff > 0.0:
        wav = audio.low_cut_filter(wav, hparams.sample_rate, hparams.highpass_cutoff)

    if is_mulaw_quantize(hparams.input_type):
        # Trim silences in mu-law quantized domain
        silence_threshold = 0
        if silence_threshold > 0:
            # [0, quantize_channels)
            out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
            start, end = audio.start_and_end_indices(out, silence_threshold)
            wav = wav[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels - 1)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        constant_values = P.mulaw(0.0, hparams.quantize_channels - 1)
        out_dtype = np.float32
    else:
        # [-1, 1]
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.logmelspectrogram(wav).astype(np.float32).T
    mfcc = audio.mfcc(wav).astype(np.float32).T

    if hparams.global_gain_scale > 0:
        wav *= hparams.global_gain_scale

    # Time domain preprocessing
    if hparams.preprocess is not None and hparams.preprocess not in ["", "none"]:
        f = getattr(audio, hparams.preprocess)
        wav = f(wav)

    # Clip
    if np.abs(wav).max() > 1.0:
        print("Warning: abs max value exceeds 1.0: {}".format(np.abs(wav).max()))
        # ignore this sample
        # return ("dummy", "dummy", "dummy", -1, -1, "dummy")
    wav = np.clip(wav, -1.0, 1.0)

    # Set waveform target (out)
    if is_mulaw_quantize(hparams.input_type):
        out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        out = P.mulaw(wav, hparams.quantize_channels - 1)
    else:
        out = wav

    # zero pad
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.pad_lr(out, hparams.fft_size, audio.get_hop_size())
    if l > 0 or r > 0:
        out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    # Write the features to disk:
    # name = splitext(basename(wav_path))[0]
    # audio_filename = '%s-wave.npy' % (name)
    # mel_filename = '%s-feats.npy' % (name)
    audio_filename = f'{out_dir}wave.npy'
    mel_filename = f'{out_dir}mel.npy'
    mfcc_filename = f'{out_dir}mfcc.npy'
    assert mfcc.shape[0] == N
    np.save(audio_filename, out.astype(out_dtype), allow_pickle=False)
    np.save(mel_filename, mel_spectrogram.astype(np.float32), allow_pickle=False)
    np.save(mfcc_filename, mfcc.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (out_dir, N, sp_ind, text)
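# Toy illustration (not from the original repo) of the speaker-ID lookup in
# _process_utterance above: file names are assumed to look like
# "<speaker>_<utt>.wav", and the JSON file maps speaker name -> integer index.
import json

sp2ind = json.loads('{"p225": 0, "p226": 1}')  # stand-in for the sp2ind file
wav_path = "data/p226_003.wav"
sp = wav_path.split('/')[-1].split('.')[0].split('_')[0]
assert sp == "p226" and sp2ind.get(sp, -1) == 1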
def _process_song(out_dir, index, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Trim begin/end silences
    # NOTE: the threshold was chosen for clean signals
    wav, _ = librosa.effects.trim(wav, top_db=60, frame_length=2048, hop_length=512)

    if hparams.highpass_cutoff > 0.0:
        wav = audio.low_cut_filter(wav, hparams.sample_rate, hparams.highpass_cutoff)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # Trim silences in mu-law quantized domain
        silence_threshold = 0
        if silence_threshold > 0:
            # [0, quantize_channels)
            out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
            start, end = audio.start_and_end_indices(out, silence_threshold)
            wav = wav[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels - 1)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        constant_values = P.mulaw(0.0, hparams.quantize_channels - 1)
        out_dtype = np.float32
    else:
        # [-1, 1]
        constant_values = 0.0
        out_dtype = np.float32

    #### CLAIRE Work here
    wav_name = os.path.splitext(os.path.basename(wav_path))[0]
    os.makedirs('./pwavs', exist_ok=True)
    pwav_path = './pwavs/{0}.wav'.format(wav_name)
    scipy.io.wavfile.write(pwav_path, 16000, wav)

    # make the chord directory if it does not exist
    chord_dir = "chord_dir"
    os.makedirs(chord_dir, exist_ok=True)

    # create xml file with notes and timestamps
    # subprocess.check_call(['./extract_chord_notes.sh', wav_path, chord_dir], shell=True)
    # os.system('./extract_chord_notes.sh {0} {1}'.format(pwav_path, chord_dir))
    os.system('./extract_chromagram.sh {0} {1} > /dev/null 2>&1'.format(pwav_path, chord_dir))
    note_filename = '{0}/{1}.csv'.format(chord_dir, wav_name)

    #### Instead of computing the mel spectrogram, return a time series of
    #### one-hot encoded chords.
    # vector with 1 in row for each note played
    # 1000 samples per second
    note_samples = int(len(wav) / 2048)
    # 12 notes per octave
    chords_time_series = np.zeros((24, note_samples))
    # print(np.shape(chords_time_series))

    with open(note_filename, newline='\n') as csvfile:
        # chordreader = csv.reader(csvfile, delimiter=',')
        chordreader = csvfile.readlines()
        # print(chordreader)
        for idx, row in enumerate(chordreader):
            row = row.split(",")
            chromagram_samples = np.array(row).astype(float)[1:]
            chords_time_series[:, idx] = chromagram_samples
    chords_time_series = chords_time_series.T

    # if hparams.global_gain_scale > 0:
    #     wav *= hparams.global_gain_scale

    # Time domain preprocessing
    if hparams.preprocess is not None and hparams.preprocess not in ["", "none"]:
        f = getattr(audio, hparams.preprocess)
        wav = f(wav)

    # wav = np.clip(wav, -1.0, 1.0)

    # Set waveform target (out)
    if is_mulaw_quantize(hparams.input_type):
        out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        out = P.mulaw(wav, hparams.quantize_channels - 1)
    else:
        out = wav

    # zero pad
    # this is needed to adjust time resolution between audio and the chromagram features
    l, r = audio.pad_lr(out, hparams.fft_size, audio.get_hop_size())
    if l > 0 or r > 0:
        out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = chords_time_series.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    # Write the features to disk:
    name = splitext(basename(wav_path))[0]
    audio_filename = '%s-wave.npy' % (name)
    chords_filename = '%s-feats.npy' % (name)
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, chords_filename),
            chords_time_series.astype(out_dtype), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, chords_filename, N, text)
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array. Resampled if needed
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    # TODO
    if exists(lab_path) and False:
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=20)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=20)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft;
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    # Write the spectrograms to disk:
    audio_filename = 'cmu_arctic-audio-%05d.npy' % index
    mel_filename = 'cmu_arctic-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text, speaker_id)
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array. Resampled if needed
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Load precomputed cepstral coefficients instead of computing a mel
    # spectrogram from the wav.
    # print("Wavepath is ", wav_path)
    filename = wav_path.split('/wav/')[-1].split('.wav')[0]
    fname = filename
    filename = ccoeffs_feats_path + '/' + filename + '.mcep'
    mel_spectrogram = np.loadtxt(filename)
    # print("Shape of mel spectrogram is ", mel_spectrogram.shape)

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    # mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    # l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    # out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    # out = ensure_divisible(out, N)
    # print("Length of out: ", len(out), "N ", N)
    # print("Out and N: ", len(out), N)
    # if len(out) < N * audio.get_hop_size():
    #     print("Out and N: ", filename, len(out), N, N * audio.get_hop_size())
    #     sys.exit()
    # assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    # out = out[:N * 80]
    # out = ensure_divisible(out, N)

    with open('logfile', 'a') as g:
        g.write("Processing " + fname + '\n')

    out, mel_spectrogram = ensure_frameperiod(out, mel_spectrogram)
    # out = ensure_divisible(out, audio.get_hop_size())
    # assert len(out) % audio.get_hop_size() == 0
    # assert len(out) % N == 0
    timesteps = len(out)

    with open('logfile', 'a') as g:
        g.write(fname + ' ' + str(len(out)) + ' ' + str(N) + ' ' +
                str(len(out) % N) + '\n')
        g.write('\n')

    # Write the spectrograms to disk:
    audio_filename = fname + '-audio-%05d.npy' % index
    mel_filename = fname + '-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text, speaker_id)
def eval_model(hparams, global_step, model, x, y, c, g, input_lengths, eval_dir):
    """Evaluate the model.

    This function is used for debugging in this project.
    """
    model.set_train(False)
    idx = np.random.randint(0, len(y))
    length = input_lengths.asnumpy()[idx]

    # (T,)
    y_target = np.reshape(y.asnumpy()[idx], (-1))
    y_target = y_target[:length]

    if c is not None:
        expand_op = P.ExpandDims()
        if hparams.upsample_conditional_features:
            c = expand_op(c[idx, :, :int(length // audio.get_hop_size() + hparams.cin_pad * 2)], 0)
        else:
            c = expand_op(c[idx, :, :length], 0)
        assert c.dim() == 3
        print("Shape of local conditioning features: {}".format(c.size()))

    if g is not None:
        g = g[idx]
        print("Shape of global conditioning features: {}".format(g.size()))

    # Dummy silence
    if is_mulaw_quantize(hparams.input_type):
        initial_value = P1.mulaw_quantize(0, hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        initial_value = P1.mulaw(0.0, hparams.quantize_channels)
    else:
        initial_value = 0.0

    # (C,)
    if is_mulaw_quantize(hparams.input_type):
        initial_input = to_categorical(
            initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = Tensor(np.reshape(initial_input, (1, 1, hparams.quantize_channels)))
    else:
        initial_input = np.ones((1, 1, 1)) * initial_value
        initial_input = Tensor(initial_input)

    # Run the model in fast eval mode
    y_hat = model.incremental_forward(
        initial_input, c=c, g=g, T=length, softmax=True, quantize=True,
        tqdm=tqdm, log_scale_min=hparams.log_scale_min)

    if is_mulaw_quantize(hparams.input_type):
        y_hat = np.reshape(np.argmax(y_hat, 1), (-1))
        y_hat = P1.inv_mulaw_quantize(y_hat, hparams.quantize_channels - 1)
        y_target = P1.inv_mulaw_quantize(y_target, hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        y_hat = P1.inv_mulaw(np.reshape(y_hat, (-1)), hparams.quantize_channels)
        y_target = P1.inv_mulaw(y_target, hparams.quantize_channels)
    else:
        y_hat = np.reshape(y_hat, (-1))

    # Save audio
    os.makedirs(eval_dir, exist_ok=True)
    path = os.path.join(eval_dir, "step{:09d}_predicted.wav".format(global_step))
    librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)
    path = os.path.join(eval_dir, "step{:09d}_target.wav".format(global_step))
    librosa.output.write_wav(path, y_target, sr=hparams.sample_rate)

    # Save figure
    path = os.path.join(eval_dir, "step{:09d}_waveplots.png".format(global_step))
    save_waveplot(path, y_hat, y_target, hparams.sample_rate)
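# Minimal sketch of how the one-hot "dummy silence" seed above is built for
# the mu-law-quantized case. This is a stand-in for to_categorical, written
# here for illustration; the expected shape is (batch=1, time=1, channels).
import numpy as np

def one_hot_seed(initial_value, num_classes):
    onehot = np.zeros(num_classes, dtype=np.float32)
    onehot[initial_value] = 1.0
    return onehot.reshape(1, 1, num_classes)

seed = one_hot_seed(initial_value=127, num_classes=256)
assert seed.shape == (1, 1, 256) and seed.sum() == 1.0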
def _process_utterance(out_dir, index, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Trim begin/end silences
    # NOTE: the threshold was chosen for clean signals
    # wav, _ = librosa.effects.trim(wav, top_db=60, frame_length=2048, hop_length=512)
    # if hparams.highpass_cutoff > 0.0:
    #     wav = audio.low_cut_filter(wav, hparams.sample_rate, hparams.highpass_cutoff)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # Trim silences in mu-law quantized domain
        silence_threshold = 0
        # if silence_threshold > 0:
        #     # [0, quantize_channels)
        #     out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
        #     start, end = audio.start_and_end_indices(out, silence_threshold)
        #     wav = wav[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels - 1)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        constant_values = P.mulaw(0.0, hparams.quantize_channels - 1)
        out_dtype = np.float32
    else:
        # [-1, 1]
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the wav:
    # (N, D)
    mel_spectrogram = audio.logmelspectrogram(wav).astype(np.float32).T

    if hparams.global_gain_scale > 0:
        wav *= hparams.global_gain_scale

    # Time domain preprocessing
    if hparams.preprocess is not None and hparams.preprocess not in ["", "none"]:
        f = getattr(audio, hparams.preprocess)
        wav = f(wav)

    # Clip
    if np.abs(wav).max() > 1.0:
        print("Warning: abs max value exceeds 1.0: {}".format(np.abs(wav).max()))
        # ignore this sample
        return ("dummy", "dummy", -1, "dummy")
    wav = np.clip(wav, -1.0, 1.0)

    # Set waveform target (out)
    if is_mulaw_quantize(hparams.input_type):
        out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        out = P.mulaw(wav, hparams.quantize_channels - 1)
    else:
        out = wav

    # print(len(wav))

    # zero pad
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.pad_lr(out, hparams.fft_size, audio.get_hop_size())
    if l > 0 or r > 0:
        out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    # Write the spectrograms to disk:
    name = splitext(basename(wav_path))[0]
    audio_filename = '%s-wave.npy' % (name)
    mel_filename = '%s-feats.npy' % (name)
    spectrogram = '%s-img.png' % (name)
    from PIL import Image
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)
    print("mel_max: " + str(np.max(mel_spectrogram.astype(np.float32))))
    print("mel_min: " + str(np.min(mel_spectrogram.astype(np.float32))))
    print("mel_shape: " + str(mel_spectrogram.astype(np.float32).shape))

    # Save as image
    img = audio.mel2png(mel_spectrogram.astype(np.float32))
    # print("Shape of img before save : " + str(img.shape))
    spec_path = os.path.join(out_dir, spectrogram)
    # save as PNG
    io.imsave(spec_path, img, check_contrast=False)
    # Image.fromarray(img).save(os.path.join(out_dir, spectrogram))

    # Round-trip check: load the PNG back as a mel spectrogram
    mel_back = audio.png2mel(io.imread(spec_path))
    # print("Shape of image after save: " + str(mel_back.shape))
    # print("Subtraction: " + str(mel_back - mel_spectrogram))

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, N, text)
def _process_utterance(out_dir, index, speaker_id, wav_path, lab_path,
                       binary_dict, continuous_dict, text):
    # Load the audio to a numpy array. Resampled if needed
    wav = audio.load_wav(wav_path)

    # determine sessionID and uttID
    wavbn = os.path.basename(wav_path)
    uttID = os.path.splitext(wavbn)[0]

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # time-aligned context
    if hparams.frame_shift_ms is None:
        frame_shift_in_micro_sec = (hparams.hop_size * 10000000) // hparams.sample_rate
    else:
        frame_shift_in_micro_sec = hparams.frame_shift_ms * 10000
    labels = hts.HTSLabelFile(frame_shift_in_micro_sec)
    labels.load(lab_path)
    linguistic_features = fe.linguistic_features(
        labels, binary_dict, continuous_dict,
        add_frame_features=True,
        frame_shift_in_micro_sec=frame_shift_in_micro_sec)

    Nwav = len(out) // audio.get_hop_size()
    out = out[:Nwav * audio.get_hop_size()]

    timesteps = len(out)
    context = linguistic_features

    # Write the features to disk:
    audio_filename = 'audio-' + uttID + '.npy'
    context_filename = 'context-' + uttID + '.npy'
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, context_filename),
            context.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, context_filename, timesteps, text, speaker_id)
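# Worked example of the frame-shift conversion above. Despite the variable
# name, the factor of 1e7 yields HTS label time units of 100 ns
# (1 s == 1e7 units). The hop size and sample rate below are illustrative,
# not values taken from hparams.
hop_size, sample_rate = 256, 16000
frame_shift = (hop_size * 10000000) // sample_rate
assert frame_shift == 160000  # 16 ms expressed in 100 ns units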
def _process_utterance(out_dir, index, audio_filepath, text):
    # Load the audio to a numpy array:
    wav_whole = audio.load_wav(audio_filepath)

    if hparams.rescaling:
        wav_whole = wav_whole / np.abs(wav_whole).max() * hparams.rescaling_max

    # This is a librivox source, so the audio files are going to be v. long
    # compared to a typical 'utterance': split the wav into chunks
    tup_results = []
    n_samples = int(8.0 * hparams.sample_rate)  # All 8 second utterances
    n_chunks = wav_whole.shape[0] // n_samples

    for chunk_idx in range(n_chunks):
        chunk_start, chunk_end = chunk_idx * n_samples, (chunk_idx + 1) * n_samples
        if chunk_idx == n_chunks - 1:
            # This is the last chunk - allow it to extend to the end of the file
            chunk_end = None
        wav = wav_whole[chunk_start:chunk_end]

        # Mu-law quantize
        if is_mulaw_quantize(hparams.input_type):
            # [0, quantize_channels)
            out = P.mulaw_quantize(wav, hparams.quantize_channels)

            # Trim silences
            start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
            wav = wav[start:end]
            out = out[start:end]
            constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
            out_dtype = np.int16
        elif is_mulaw(hparams.input_type):
            # [-1, 1]
            out = P.mulaw(wav, hparams.quantize_channels)
            constant_values = P.mulaw(0.0, hparams.quantize_channels)
            out_dtype = np.float32
        else:
            # [-1, 1]
            out = wav
            constant_values = 0.0
            out_dtype = np.float32

        # Compute a mel-scale spectrogram from the trimmed wav:
        # (N, D)
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

        # lws pads zeros internally before performing stft;
        # this is needed to adjust time resolution between audio and mel-spectrogram
        l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

        # zero pad for quantized signal
        out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
        N = mel_spectrogram.shape[0]
        assert len(out) >= N * audio.get_hop_size()

        # time resolution adjustment
        # ensure length of raw audio is multiple of hop_size so that we can use
        # transposed convolution to upsample
        out = out[:N * audio.get_hop_size()]
        assert len(out) % audio.get_hop_size() == 0

        timesteps = len(out)

        # Write the spectrograms to disk:
        audio_filename = 'librivox-audio-%04d-%05d.npy' % (index, chunk_idx)
        mel_filename = 'librivox-mel-%04d-%05d.npy' % (index, chunk_idx)
        text_idx = '%s - %05d' % (text, chunk_idx)
        np.save(os.path.join(out_dir, audio_filename),
                out.astype(out_dtype), allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename),
                mel_spectrogram.astype(np.float32), allow_pickle=False)

        # Add results tuple describing this training example:
        tup_results.append((audio_filename, mel_filename, timesteps, text_idx))

    # Return all the audio results tuples (unpack in caller)
    return tup_results
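# Toy illustration of the chunking scheme above: fixed-length chunks, with
# the final chunk absorbing the remainder of the file. `total` and
# `n_samples` are made-up numbers, not hparams values.
def chunk_bounds(total, n_samples):
    n_chunks = total // n_samples
    for i in range(n_chunks):
        end = (i + 1) * n_samples if i < n_chunks - 1 else total
        yield i * n_samples, end

assert list(chunk_bounds(total=25, n_samples=8)) == [(0, 8), (8, 16), (16, 25)]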
def eval_model(global_step, writer, device, model, y, c, g, input_lengths,
               eval_dir, ema=None):
    if ema is not None:
        print("Using averaged model for evaluation")
        model = clone_as_averaged_model(device, model, ema)
        model.make_generation_fast_()

    model.eval()

    # pick one of the available waves to try to emulate
    idx = np.random.randint(0, len(y))
    length = input_lengths[idx].data.cpu().item()

    # (T,)
    y_target = y[idx].view(-1).data.cpu().numpy()[:length]

    if c is not None:
        if hparams.upsample_conditional_features:
            c = c[idx, :, :length // audio.get_hop_size()].unsqueeze(0)
        else:
            c = c[idx, :, :length].unsqueeze(0)
        assert c.dim() == 3
        print("Shape of local conditioning features: {}".format(c.size()))

    if g is not None:
        # TODO: test
        g = g[idx]
        print("Shape of global conditioning features: {}".format(g.size()))

    # Dummy silence
    if is_mulaw_quantize(hparams.input_type):
        initial_value = P.mulaw_quantize(0, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        initial_value = P.mulaw(0.0, hparams.quantize_channels)
    else:
        # initial_value = 0.0
        # TODO: change initial value to first value of actual waveform instead
        # of zero?? <MLK, 10/19>
        initial_value = float(y_target[0])
    print("Initial value:", initial_value)

    # (C,)
    if is_mulaw_quantize(hparams.input_type):
        initial_input = np_utils.to_categorical(
            initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = torch.from_numpy(initial_input).view(
            1, 1, hparams.quantize_channels)
    else:
        initial_input = torch.zeros(1, 1, 1).fill_(initial_value)
    initial_input = initial_input.to(device)

    # Run the model in fast eval mode
    with torch.no_grad():
        y_hat = model.incremental_forward(
            initial_input, c=c, g=g, T=length, softmax=True, quantize=True,
            tqdm=tqdm, log_scale_min=hparams.log_scale_min)

    if is_mulaw_quantize(hparams.input_type):
        y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
        y_target = P.inv_mulaw_quantize(y_target, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        y_hat = P.inv_mulaw(y_hat.view(-1).cpu().data.numpy(),
                            hparams.quantize_channels)
        y_target = P.inv_mulaw(y_target, hparams.quantize_channels)
    else:
        y_hat = y_hat.view(-1).cpu().data.numpy()

    # Save audio
    os.makedirs(eval_dir, exist_ok=True)
    path = join(eval_dir, "step_noncausal_{:09d}_predicted.npy".format(global_step))
    np.save(path, y_hat)
    path = join(eval_dir, "step_noncausal_{:09d}_target.npy".format(global_step))
    np.save(path, y_target)

    # save figure
    path = join(eval_dir, "step_noncausal_{:09d}_waveplots.png".format(global_step))
    save_waveplot(path, y_hat, y_target)
def eval_model(global_step, writer, model, y, c, g, input_lengths, eval_dir,
               ema=None):
    if ema is not None:
        print("Using averaged model for evaluation")
        model = clone_as_averaged_model(model, ema)

    model.eval()
    idx = np.random.randint(0, len(y))
    length = input_lengths[idx].data.cpu().numpy()[0]

    # (T,)
    y_target = y[idx].view(-1).data.cpu().numpy()[:length]

    if c is not None:
        c = c[idx, :, :length].unsqueeze(0)
        assert c.dim() == 3
        print("Shape of local conditioning features: {}".format(c.size()))

    if g is not None:
        # TODO: test
        g = g[idx]
        print("Shape of global conditioning features: {}".format(g.size()))

    # Dummy silence
    if is_mulaw_quantize(hparams.input_type):
        initial_value = P.mulaw_quantize(0, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        initial_value = P.mulaw(0.0, hparams.quantize_channels)
    else:
        initial_value = 0.0
    print("Initial value:", initial_value)

    # (C,)
    if is_mulaw_quantize(hparams.input_type):
        initial_input = np_utils.to_categorical(
            initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = Variable(torch.from_numpy(initial_input)).view(
            1, 1, hparams.quantize_channels)
    else:
        initial_input = Variable(torch.zeros(1, 1, 1).fill_(initial_value))
    initial_input = initial_input.cuda() if use_cuda else initial_input

    # Run the model in fast eval mode
    y_hat = model.incremental_forward(
        initial_input, c=c, g=g, T=length, softmax=True, quantize=True,
        tqdm=tqdm, log_scale_min=hparams.log_scale_min)

    if is_mulaw_quantize(hparams.input_type):
        y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
        y_target = P.inv_mulaw_quantize(y_target, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        y_hat = P.inv_mulaw(y_hat.view(-1).cpu().data.numpy(),
                            hparams.quantize_channels)
        y_target = P.inv_mulaw(y_target, hparams.quantize_channels)
    else:
        y_hat = y_hat.view(-1).cpu().data.numpy()

    # Save audio
    os.makedirs(eval_dir, exist_ok=True)
    path = join(eval_dir, "step{:09d}_predicted.wav".format(global_step))
    librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)
    path = join(eval_dir, "step{:09d}_target.wav".format(global_step))
    librosa.output.write_wav(path, y_target, sr=hparams.sample_rate)

    # save figure
    path = join(eval_dir, "step{:09d}_waveplots.png".format(global_step))
    save_waveplot(path, y_hat, y_target)
def eval_model(global_step, writer, device, model, y, c, g, input_lengths,
               eval_dir, ema=None):
    if ema is not None:
        print("Using averaged model for evaluation")
        model = clone_as_averaged_model(device, model, ema)
        model.make_generation_fast_()

    model.eval()
    idx = np.random.randint(0, len(y))
    length = input_lengths[idx].data.cpu().item()

    # (T,)
    y_target = y[idx].view(-1).data.cpu().numpy()[:length]

    if c is not None:
        if hparams.upsample_conditional_features:
            c = c[idx, :, :length // audio.get_hop_size() + hparams.cin_pad * 2].unsqueeze(0)
        else:
            c = c[idx, :, :length].unsqueeze(0)
        assert c.dim() == 3
        print("Shape of local conditioning features: {}".format(c.size()))

    if g is not None:
        # TODO: test
        g = g[idx]
        print("Shape of global conditioning features: {}".format(g.size()))

    # Dummy silence
    if is_mulaw_quantize(hparams.input_type):
        initial_value = P.mulaw_quantize(0, hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        initial_value = P.mulaw(0.0, hparams.quantize_channels)
    else:
        initial_value = 0.0

    # (C,)
    if is_mulaw_quantize(hparams.input_type):
        initial_input = to_categorical(
            initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = torch.from_numpy(initial_input).view(
            1, 1, hparams.quantize_channels)
    else:
        initial_input = torch.zeros(1, 1, 1).fill_(initial_value)
    initial_input = initial_input.to(device)

    # Run the model in fast eval mode
    with torch.no_grad():
        y_hat = model.incremental_forward(
            initial_input, c=c, g=g, T=length, softmax=True, quantize=True,
            tqdm=tqdm, log_scale_min=hparams.log_scale_min)

    if is_mulaw_quantize(hparams.input_type):
        y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels - 1)
        y_target = P.inv_mulaw_quantize(y_target, hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        y_hat = P.inv_mulaw(y_hat.view(-1).cpu().data.numpy(),
                            hparams.quantize_channels)
        y_target = P.inv_mulaw(y_target, hparams.quantize_channels)
    else:
        y_hat = y_hat.view(-1).cpu().data.numpy()

    # Save audio
    os.makedirs(eval_dir, exist_ok=True)
    path = join(eval_dir, "step{:09d}_predicted.wav".format(global_step))
    librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)
    path = join(eval_dir, "step{:09d}_target.wav".format(global_step))
    librosa.output.write_wav(path, y_target, sr=hparams.sample_rate)

    # save figure
    path = join(eval_dir, "step{:09d}_waveplots.png".format(global_step))
    save_waveplot(path, y_hat, y_target)

    # add audio and figures to tensorboard
    writer.add_audio('target_audio', y_target, global_step, hparams.sample_rate)
    writer.add_audio('generated_audio', y_hat, global_step, hparams.sample_rate)
def _process_utterance(out_dir, index, wav_path, text, trim_silence=False):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Trim begin/end silences
    # NOTE: the threshold was chosen for clean signals
    # TODO: Remove, get this out of here.
    if trim_silence:
        wav, _ = librosa.effects.trim(wav, top_db=60, frame_length=2048,
                                      hop_length=512)

    if hparams.highpass_cutoff > 0.0:
        wav = audio.low_cut_filter(wav, hparams.sample_rate, hparams.highpass_cutoff)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # Trim silences in mu-law quantized domain
        silence_threshold = 0
        if silence_threshold > 0:
            # [0, quantize_channels)
            out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
            start, end = audio.start_and_end_indices(out, silence_threshold)
            wav = wav[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels - 1)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        constant_values = P.mulaw(0.0, hparams.quantize_channels - 1)
        out_dtype = np.float32
    else:
        # [-1, 1]
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.logmelspectrogram(wav).astype(np.float32).T

    if hparams.global_gain_scale > 0:
        wav *= hparams.global_gain_scale

    # Time domain preprocessing
    if hparams.preprocess is not None and hparams.preprocess not in ["", "none"]:
        f = getattr(audio, hparams.preprocess)
        wav = f(wav)

    # Clip
    if np.abs(wav).max() > 1.0:
        print("Warning: abs max value exceeds 1.0: {}".format(np.abs(wav).max()))
        # ignore this sample
        return ("dummy", "dummy", -1, "dummy")
    wav = np.clip(wav, -1.0, 1.0)

    # Set waveform target (out)
    if is_mulaw_quantize(hparams.input_type):
        out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        out = P.mulaw(wav, hparams.quantize_channels - 1)
    else:
        out = wav

    # zero pad
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.pad_lr(out, hparams.fft_size, audio.get_hop_size())
    if l > 0 or r > 0:
        out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    assert_ready_for_upsampling(out, mel_spectrogram, cin_pad=0, debug=True)

    # Write the spectrograms to disk:
    name = splitext(basename(wav_path))[0]
    audio_filename = "%s-wave.npy" % (name)
    mel_filename = "%s-feats.npy" % (name)
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(
        os.path.join(out_dir, mel_filename),
        mel_spectrogram.astype(np.float32),
        allow_pickle=False,
    )

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, N, text)
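# A plausible stand-in for the assert_ready_for_upsampling helper called
# above (hypothetical; the real helper lives elsewhere in the repo and its
# signature may differ): with cin_pad extra conditioning frames on each side,
# lengths must satisfy len(wave) == (num_frames - 2 * cin_pad) * hop_size.
def assert_ready_for_upsampling(x, c, cin_pad=0, hop_size=256, debug=False):
    if debug:
        print("wave:", len(x), "frames:", c.shape[0], "cin_pad:", cin_pad)
    assert len(x) == (c.shape[0] - 2 * cin_pad) * hop_size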
def _process_utterance(out_dir, index, wav_path, text, sample_rate, fft_size,
                       hop_size, n_mels, redis_connection):
    # Load the audio to a numpy array:
    wav = load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    # this really gets called if input_type in hparams is changed from raw to mulaw
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    # mel_spectrogram = audio.melspectrogram(wav, 22050, 1024, 40).astype(np.float32).T
    # change this line to adjust hyperparams
    mel_spectrogram = melspectrogram(wav, sample_rate, fft_size, hop_size,
                                     n_mels).astype(np.float32).T

    # lws pads zeros internally before performing stft;
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = lws_pad_lr(wav, fft_size, hop_size)

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    # assert len(out) >= N * audio.get_hop_size()
    assert len(out) >= N * hop_size

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    # out = out[:N * audio.get_hop_size()]
    # assert len(out) % audio.get_hop_size() == 0
    out = out[:N * hop_size]
    assert len(out) % hop_size == 0

    timesteps = len(out)

    # compute example reconstruction
    # change this line to adjust hparams
    # signal = audio.inv_mel_spectrogram(mel_spectrogram, sample_rate, fft_size, n_mels)
    # mel_spectrogram = mel_spectrogram.T

    # Write the spectrograms to disk:
    audio_filename = 'ljspeech-audio-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    # recon_audio_filename = 'ljspeech-audio-%05d.wav' % index

    # Store waveform bytes concatenated with text bytes under the utterance
    # index in Redis instead of writing .npy files.
    data = out.tobytes()
    target = np.asarray(text).tobytes()
    redis_connection.set(index, data + target)
    # np.save(os.path.join(out_dir, audio_filename),
    #         out.astype(out_dtype), allow_pickle=False)
    # np.save(os.path.join(out_dir, mel_filename),
    #         mel_spectrogram.astype(np.float32), allow_pickle=False)
    # audio.save_wav(signal, os.path.join(out_dir, recon_audio_filename))

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text)
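# Hedged sketch (illustrative only) of reading back the Redis record written
# above: the stored value is raw waveform bytes concatenated with text bytes,
# so a reader must know the waveform dtype and timestep count to split the
# two fields again.
import numpy as np

out = np.array([1, 2, 3, 4], dtype=np.int16)          # stand-in waveform
blob = out.tobytes() + np.asarray("hello").tobytes()  # what .set() stores

n_bytes = len(out) * out.dtype.itemsize
wav_back = np.frombuffer(blob[:n_bytes], dtype=np.int16)
assert (wav_back == out).all()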