def convert(predictor, df):
    pred_spec, y_spec, ppgs = predictor(next(df().get_data()))

    # Denormalization
    pred_spec = denormalize_db(pred_spec, hp.Default.max_db, hp.Default.min_db)
    y_spec = denormalize_db(y_spec, hp.Default.max_db, hp.Default.min_db)

    # dB to amplitude
    pred_spec = db2amp(pred_spec)
    y_spec = db2amp(y_spec)

    # Emphasize the magnitude
    pred_spec = np.power(pred_spec, hp.Convert.emphasis_magnitude)
    y_spec = np.power(y_spec, hp.Convert.emphasis_magnitude)

    # Spectrogram to waveform
    audio = np.array([spec2wav(spec.T, hp.Default.n_fft, hp.Default.win_length,
                               hp.Default.hop_length, hp.Default.n_iter) for spec in pred_spec])
    y_audio = np.array([spec2wav(spec.T, hp.Default.n_fft, hp.Default.win_length,
                                 hp.Default.hop_length, hp.Default.n_iter) for spec in y_spec])

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.Default.preemphasis)
    y_audio = inv_preemphasis(y_audio, coeff=hp.Default.preemphasis)

    # if hp.Convert.one_full_wav:
    #     # Concatenate to a single wav
    #     y_audio = np.reshape(y_audio, (1, y_audio.size), order='C')
    #     audio = np.reshape(audio, (1, audio.size), order='C')

    return audio, y_audio, ppgs
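# Every convert() variant in this listing calls denormalize_db() and db2amp(),
# which are not defined here. A minimal sketch, assuming the usual convention
# that spectrograms were normalized to [0, 1] over [min_db, max_db] during
# preprocessing; the project's own helpers may differ in detail.
import numpy as np

def denormalize_db(norm_db, max_db, min_db):
    # Map a [0, 1]-normalized spectrogram back to decibels.
    return np.clip(norm_db, 0, 1) * (max_db - min_db) + min_db

def db2amp(db):
    # Decibels to linear amplitude: amp = 10 ** (db / 20).
    return np.power(10.0, db * 0.05)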
def convert(predictor, df):
    a, b, c = next(df().get_data())
    pred_spec, r_spec = predictor(a, b, c)

    # Denormalization
    pred_spec = denormalize_db(pred_spec, hp.default.max_db, hp.default.min_db)
    r_spec = denormalize_db(r_spec, hp.default.max_db, hp.default.min_db)

    # dB to amplitude
    pred_spec = db2amp(pred_spec)
    r_spec = db2amp(r_spec)

    # Emphasize the magnitude
    pred_spec = np.power(pred_spec, hp.convert.emphasis_magnitude)
    r_spec = np.power(r_spec, hp.convert.emphasis_magnitude)

    # Spectrogram to waveform
    audio = np.array([spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                               hp.default.hop_length, hp.default.n_iter) for spec in pred_spec])
    y_audio = np.array([spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                                 hp.default.hop_length, hp.default.n_iter) for spec in r_spec])

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.default.preemphasis)
    y_audio = inv_preemphasis(y_audio, coeff=hp.default.preemphasis)

    # if hp.convert.one_full_wav:
    #     # Concatenate to a single wav
    #     y_audio = np.reshape(y_audio, (1, y_audio.size), order='C')
    #     audio = np.reshape(audio, (1, audio.size), order='C')

    return audio, y_audio
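# preemphasis() and inv_preemphasis() are likewise assumed rather than defined
# in this listing. A common first-order-filter implementation is sketched
# below; it is an assumption, not necessarily the project's exact code.
import numpy as np
from scipy import signal

def preemphasis(wav, coeff=0.97):
    # y[t] = x[t] - coeff * x[t-1]: boost high frequencies before analysis.
    return signal.lfilter([1, -coeff], [1], wav)

def inv_preemphasis(wav, coeff=0.97):
    # Inverse filter of the above: undo the emphasis after synthesis.
    return signal.lfilter([1], [1, -coeff], wav)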
def do_convert(predictor, input_name, logdir2):
    convert_s = datetime.datetime.now()

    # Load input audio
    input_audio, _ = librosa.load(input_name, sr=hp.default.sr, dtype=np.float64)

    # Extract F0 from the input audio first
    input_f0, t_table = pw.dio(input_audio, hp.default.sr)
    input_f0 = pw.stonemask(input_audio, input_f0, t_table, hp.default.sr)

    # Get MFCCs, spectral envelope, and aperiodicity
    mfcc = _get_mfcc(input_audio, hp.default.n_fft, hp.default.win_length, hp.default.hop_length)
    mfcc = np.expand_dims(mfcc, axis=0)
    input_ap = pw.d4c(input_audio, input_f0, t_table, hp.default.sr, fft_size=hp.default.n_fft)
    input_sp_en = _get_spectral_envelope(preemphasis(input_audio, coeff=hp.default.preemphasis),
                                         hp.default.n_fft)
    plt.imsave('./converted/debug/input_sp_en_original.png', input_sp_en, cmap='binary')
    input_sp_en = np.expand_dims(input_sp_en, axis=0)

    # Convert the spectral envelope
    output_sp_en, ppgs = convert_spectral_envelope(predictor, mfcc, input_sp_en)
    output_sp_en = np.squeeze(output_sp_en.astype(np.float64), axis=0)

    preproc_s = datetime.datetime.now()

    # Denormalization
    output_sp_en = denormalize_db(output_sp_en, hp.default.max_db, hp.default.min_db)

    # dB to amplitude
    output_sp_en = librosa.db_to_amplitude(output_sp_en)

    # Emphasize the magnitude
    output_sp_en = np.power(output_sp_en, hp.convert.emphasis_magnitude)

    preproc_e = datetime.datetime.now()
    preproc_t = preproc_e - preproc_s
    print("Pre-processing time: {}s".format(preproc_t.seconds))

    # F0 transformation with the WORLD vocoder
    output_f0 = f0_adapt(input_f0, logdir2)

    # Synthesize audio and de-emphasize
    output_audio = pw.synthesize(output_f0, output_sp_en, input_ap, hp.default.sr)
    output_audio = inv_preemphasis(output_audio, coeff=hp.default.preemphasis)

    # Save output_audio to a 32-bit float wav file
    # (librosa.output was removed in librosa 0.8; this call requires librosa < 0.8)
    output_audio = output_audio.astype(np.float32)
    librosa.output.write_wav(path="./converted/" + input_name, y=output_audio, sr=hp.default.sr)

    # Save the PPG data as a grayscale image and a raw binary file
    ppgs = np.squeeze(ppgs, axis=0)
    plt.imsave('./converted/debug/' + input_name + '.png', ppgs, cmap='binary')
    np.save('./converted/debug/' + input_name + '.npy', ppgs)

    convert_e = datetime.datetime.now()
    convert_time = convert_e - convert_s
    print("Total converting time: {}s".format(convert_time.seconds))
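# f0_adapt() is not shown in this listing; it presumably maps the source
# speaker's F0 contour onto the target speaker's range using statistics stored
# under logdir2. A common log-linear transform is sketched below; the mean/std
# arguments are hypothetical stand-ins for whatever the real code loads from
# logdir2.
import numpy as np

def f0_adapt_sketch(f0, src_log_mean, src_log_std, tgt_log_mean, tgt_log_std):
    # Transform log-F0 of voiced frames; unvoiced frames (f0 == 0) stay zero.
    out = np.zeros_like(f0)
    voiced = f0 > 0
    out[voiced] = np.exp((np.log(f0[voiced]) - src_log_mean) / src_log_std
                         * tgt_log_std + tgt_log_mean)
    return out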
def convert(predictor, df):
    t = next(df().get_data())
    print(t[0].shape)
    # Reuse the batch fetched above; calling next() again here would silently
    # consume a second batch.
    pred_spec, y_spec, ppgs = predictor(t)

    # Denormalization
    pred_spec = denormalize_db(pred_spec, hp.default.max_db, hp.default.min_db)
    y_spec = denormalize_db(y_spec, hp.default.max_db, hp.default.min_db)

    # dB to amplitude
    pred_spec = db2amp(pred_spec)
    y_spec = db2amp(y_spec)

    # Emphasize the magnitude
    pred_spec = np.power(pred_spec, hp.convert.emphasis_magnitude)
    y_spec = np.power(y_spec, hp.convert.emphasis_magnitude)

    # Spectrogram to waveform (list comprehension: in Python 3,
    # np.array(map(...)) wraps the iterator instead of evaluating it)
    audio = np.array([spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                               hp.default.hop_length, hp.default.n_iter) for spec in pred_spec])
    librosa.output.write_wav('/home/user/vilin/deep-voice-conversion/output/file_trim_8.wav',
                             audio[0], hp.default.sr)
    y_audio = np.array([spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                                 hp.default.hop_length, hp.default.n_iter) for spec in y_spec])

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.default.preemphasis)
    y_audio = inv_preemphasis(y_audio, coeff=hp.default.preemphasis)

    # if hp.convert.one_full_wav:
    #     # Concatenate to a single wav
    #     y_audio = np.reshape(y_audio, (1, y_audio.size), order='C')
    #     audio = np.reshape(audio, (1, audio.size), order='C')

    return audio, y_audio, ppgs
def convert(predictor, data):
    x_mfccs, y_spec, y_mel = data

    # Add a batch dimension of size 1.
    x_mfccs = np.array(x_mfccs).reshape((-1,) + x_mfccs.shape)
    y_spec = np.array(y_spec).reshape((-1,) + y_spec.shape)
    y_mel = np.array(y_mel).reshape((-1,) + y_mel.shape)
    pred_spec, y_spec, ppgs = predictor(x_mfccs, y_spec, y_mel)

    # Denormalization
    pred_spec = denormalize_db(pred_spec, hp.Default.max_db, hp.Default.min_db)
    y_spec = denormalize_db(y_spec, hp.Default.max_db, hp.Default.min_db)

    # dB to amplitude
    pred_spec = db2amp(pred_spec)
    y_spec = db2amp(y_spec)

    # Emphasize the magnitude
    pred_spec = np.power(pred_spec, hp.Convert.emphasis_magnitude)
    y_spec = np.power(y_spec, hp.Convert.emphasis_magnitude)

    # Spectrogram to waveform
    audio = np.array([spec2wav(spec.T, hp.Default.n_fft, hp.Default.win_length,
                               hp.Default.hop_length, hp.Default.n_iter) for spec in pred_spec])
    y_audio = np.array([spec2wav(spec.T, hp.Default.n_fft, hp.Default.win_length,
                                 hp.Default.hop_length, hp.Default.n_iter) for spec in y_spec])

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.Default.preemphasis)
    y_audio = inv_preemphasis(y_audio, coeff=hp.Default.preemphasis)

    if hp.Convert.one_full_wav:
        # Concatenate to a single wav
        y_audio = np.reshape(y_audio, (1, y_audio.size), order='C')
        audio = np.reshape(audio, (1, audio.size), order='C')

    return audio, y_audio, ppgs
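# spec2wav() is assumed to be Griffin-Lim phase reconstruction from a linear
# magnitude spectrogram of shape (1 + n_fft/2, frames). A minimal sketch using
# librosa's built-in implementation; the project's own routine may differ:
import librosa

def spec2wav_sketch(mag, win_length, hop_length, num_iters):
    # n_fft is implied by the spectrogram's frequency dimension.
    return librosa.griffinlim(mag, n_iter=num_iters,
                              hop_length=hop_length, win_length=win_length)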
def predict(self, path_to_wav):
    x_mfcc, y_spec, mel = get_mfccs_and_spectrogram(path_to_wav)
    pred_spec, y_spec, ppgs = self.predictor(x_mfcc, y_spec, mel)

    # Denormalization
    pred_spec = denormalize_db(pred_spec, hp.default.max_db, hp.default.min_db)
    y_spec = denormalize_db(y_spec, hp.default.max_db, hp.default.min_db)

    # dB to amplitude
    pred_spec = db2amp(pred_spec)
    y_spec = db2amp(y_spec)

    # Emphasize the magnitude
    pred_spec = np.power(pred_spec, hp.convert.emphasis_magnitude)
    y_spec = np.power(y_spec, hp.convert.emphasis_magnitude)

    # Spectrogram to waveform (list comprehension: in Python 3,
    # np.array(map(...)) wraps the iterator instead of evaluating it)
    audio = np.array([spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                               hp.default.hop_length, hp.default.n_iter) for spec in pred_spec])
    y_audio = np.array([spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                                 hp.default.hop_length, hp.default.n_iter) for spec in y_spec])
    return audio, y_audio, ppgs
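# get_mfccs_and_spectrogram() is not defined in this listing. Helpers of this
# name typically load a wav and derive (MFCCs, log-magnitude spectrogram,
# log-mel spectrogram). A simplified sketch with assumed hyperparameter
# values; the real feature extraction almost certainly differs in detail:
import librosa
import numpy as np

def get_mfccs_and_spectrogram_sketch(path, sr=16000, n_fft=512, hop=128,
                                     n_mels=80, n_mfcc=40):
    wav, _ = librosa.load(path, sr=sr)
    mag = np.abs(librosa.stft(wav, n_fft=n_fft, hop_length=hop))          # (1+n_fft/2, T)
    mel = librosa.feature.melspectrogram(S=mag ** 2, sr=sr, n_mels=n_mels)  # mel power
    db = librosa.amplitude_to_db(mag)
    mel_db = librosa.power_to_db(mel)
    mfcc = librosa.feature.mfcc(S=mel_db, n_mfcc=n_mfcc)
    # Transpose to (frames, features) to match the batch convention above.
    return mfcc.T, db.T, mel_db.T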
def convert(predictor, df):
    # TODO need to fix reading in with duration
    pred_spec, y_spec, ppgs = predictor(next(df().get_data()))

    # Denormalization
    pred_spec = denormalize_db(pred_spec, hp.default.max_db, hp.default.min_db)
    y_spec = denormalize_db(y_spec, hp.default.max_db, hp.default.min_db)

    # dB to amplitude
    pred_spec = db2amp(pred_spec)
    y_spec = db2amp(y_spec)

    # Emphasize the magnitude
    pred_spec = np.power(pred_spec, hp.convert.emphasis_magnitude)
    y_spec = np.power(y_spec, hp.convert.emphasis_magnitude)

    # Spectrogram to waveform (list comprehension: in Python 3,
    # np.array(map(...)) wraps the iterator instead of evaluating it)
    audio = np.array([spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                               hp.default.hop_length, hp.default.n_iter) for spec in pred_spec])
    y_audio = np.array([spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                                 hp.default.hop_length, hp.default.n_iter) for spec in y_spec])

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.default.preemphasis)
    y_audio = inv_preemphasis(y_audio, coeff=hp.default.preemphasis)

    if hp.convert.one_full_wav:
        # Concatenate to a single wav
        y_audio = np.reshape(y_audio, (1, y_audio.size), order='C')
        audio = np.reshape(audio, (1, audio.size), order='C')

    return audio, y_audio, ppgs
def sumspecimage(spec, spec_name):
    spec = denormalize_db(spec, hp.max_db, hp.min_db)
    spec = db2amp(spec)

    # Log the spectrogram batch as an image summary.
    spec_image = spec.transpose(0, 2, 1)
    heatmap = np.expand_dims(spec_image, 3)
    tf.summary.image(spec_name, heatmap, max_outputs=spec_image.shape[0])

    # Reconstruct audio for an audio summary. The exponent is currently fixed
    # at 1 rather than hp.emphasis_magnitude.
    out_spec = np.power(np.maximum(spec, 0), 1)  # hp.emphasis_magnitude
    out_audio = np.array([spec2wav(s.T, hp.n_fft, hp.win_length,
                                   hp.hop_length, hp.n_iter) for s in out_spec])
    out_audio = inv_preemphasis(out_audio, coeff=hp.preemphasis)
    tf.summary.audio(spec_name, out_audio, hp.sr, max_outputs=hp.batch_size)
def convert(predictor, mfcc, spec, mel_spec):
    print("convert")
    pred_s = datetime.datetime.now()
    pred_spec, _, ppgs = predictor(mfcc, spec, mel_spec)
    pred_e = datetime.datetime.now()
    pred_t = pred_e - pred_s
    print("Prediction time: {}s".format(pred_t.seconds))

    preproc_s = datetime.datetime.now()

    # Denormalization
    print("denormalize_db")
    pred_spec = denormalize_db(pred_spec, hp.default.max_db, hp.default.min_db)

    # dB to amplitude
    print("db2amp")
    pred_spec = db2amp(pred_spec)

    # Emphasize the magnitude
    print("emphasize")
    pred_spec = np.power(pred_spec, hp.convert.emphasis_magnitude)

    preproc_e = datetime.datetime.now()
    preproc_t = preproc_e - preproc_s
    print("Pre-processing time: {}s".format(preproc_t.seconds))

    # Spectrogram to waveform
    audio = []
    recon_s = datetime.datetime.now()
    print("spec2wav")
    audio.append(spec2wav_lws(pred_spec[0], hp.default.n_fft, hp.default.win_length,
                              hp.default.hop_length, hp.default.lws_mode))
    recon_e = datetime.datetime.now()
    recon_t = recon_e - recon_s
    print("Spectrogram-to-wave time: {}s".format(recon_t.seconds))
    audio = np.array(audio)
    # print('audio.shape : ', audio.shape)

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.default.preemphasis)

    return audio[0], ppgs
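# spec2wav_lws() above swaps iterative Griffin-Lim for the `lws` package
# (fast phase reconstruction via Local Weighted Sums). A sketch following the
# lws README; how the real helper wires its parameters is an assumption:
import lws
import numpy as np

def spec2wav_lws_sketch(mag, win_length, hop_length, mode):
    # mag: (frames, 1 + n_fft/2) magnitude spectrogram, float64.
    processor = lws.lws(win_length, hop_length, mode=mode)
    spec = processor.run_lws(mag.astype(np.float64))  # estimate a consistent phase
    return processor.istft(spec)                      # invert back to a waveform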
def convert(predictor, tensor):
    # tensor = next(df().get_data())
    # print(tensor.shape)
    pred_spec, y_spec, ppgs = predictor(tensor)
    # pred_spec, y_spec, ppgs = predictor(tf.expand_dims(df, 0))

    # Denormalization
    pred_spec = denormalize_db(pred_spec, hp.default.max_db, hp.default.min_db)
    # y_spec = denormalize_db(y_spec, hp.default.max_db, hp.default.min_db)

    # dB to amplitude
    pred_spec = db2amp(pred_spec)
    # y_spec = db2amp(y_spec)

    # Emphasize the magnitude
    pred_spec = np.power(pred_spec, hp.convert.emphasis_magnitude)
    # y_spec = np.power(y_spec, hp.convert.emphasis_magnitude)

    # Spectrogram to waveform (list comprehension: in Python 3,
    # np.array(map(...)) wraps the iterator instead of evaluating it)
    audio = np.array([spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                               hp.default.hop_length, hp.default.n_iter) for spec in pred_spec])
    # y_audio = np.array([spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
    #                              hp.default.hop_length, hp.default.n_iter) for spec in y_spec])

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.default.preemphasis)
    # y_audio = inv_preemphasis(y_audio, coeff=hp.default.preemphasis)

    # pickle.dump(y_audio, open("y-audio.p", "wb"))
    # pickle.dump(audio, open("o-audio.p", "wb"))

    # if hp.convert.one_full_wav:
    #     # Concatenate to a single wav
    #     y_audio = np.reshape(y_audio, (1, y_audio.size), order='C')
    #     audio = np.reshape(audio, (1, audio.size), order='C')

    # return audio, y_audio, ppgs
    return audio, ppgs