def _save_heatmap(figure_id, data, heading, x_label, y_label, out_png):
    """Draw `data` as a heat map in figure `figure_id` and save it as a PNG."""
    figure(figure_id)
    imshow(data, origin='lower', cmap=cm.hot, aspect='auto',
           interpolation='nearest')
    colorbar()
    title(heading)
    xlabel(x_label)
    ylabel(y_label)
    savefig(out_png, dpi=150)


def run_demo():
    """Reconstruct audio from a magnitude-only spectrogram with Griffin-Lim.

    Command-line driven demo. The input WAV file is loaded and its STFT is
    computed; the phase is discarded and only the squared magnitude, rescaled
    so that its peak is 1, is kept. Optionally the spectrogram is projected
    onto a mel filter bank and back (``--enable_mel_scale``) and/or low-pass
    filtered by zeroing every bin at or above ``--cutoff_freq``
    (``--enable_filter``). The Griffin-Lim algorithm then estimates a signal
    from the resulting magnitudes, and the signal is written out through
    ``audio_utilities.save_audio_to_file``.

    Side effects: parses the process command line, prints a few diagnostics,
    and saves PNG images of every spectrogram / filter bank it plots into the
    current working directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--in_file', type=str, default="bkvhi.wav",
                        help='Input WAV file')
    parser.add_argument('--sample_rate_hz', default=44100, type=int,
                        help='Sample rate in Hz')
    parser.add_argument('--fft_size', default=2048, type=int,
                        help='FFT siz')
    parser.add_argument('--iterations', default=300, type=int,
                        help='Number of iterations to run')
    parser.add_argument('--enable_filter', action='store_true',
                        help='Apply a low-pass filter')
    parser.add_argument('--enable_mel_scale', action='store_true',
                        help='Convert to mel scale and back')
    parser.add_argument(
        '--cutoff_freq', type=int, default=1000,
        help='If filter is enable, the low-pass cutoff frequency in Hz')
    args = parser.parse_args()

    in_file = args.in_file

    # Load an audio file. It must be WAV format. Multi-channel files will be
    # converted to mono.
    input_signal = audio_utilities.get_signal(
        in_file, expected_fs=args.sample_rate_hz)

    # Hopsamp is the number of samples that the analysis window is shifted
    # after computing the FFT. For example, if the sample rate is 44100 Hz and
    # hopsamp is 256, then there will be approximately 44100/256 = 172 FFTs
    # computed per second and thus 172 spectral slices (i.e., columns) per
    # second in the spectrogram.
    hopsamp = args.fft_size // 8

    # Compute the Short-Time Fourier Transform (STFT) from the audio file.
    # This is a 2-dim Numpy array with time_slices rows and frequency_bins
    # columns. Thus, you will need to take the transpose of this matrix to get
    # the usual STFT which has frequency bins as rows and time slices as
    # columns.
    stft_full = audio_utilities.stft_for_reconstruction(
        input_signal, args.fft_size, hopsamp)
    print(stft_full.shape)

    # The STFT is complex-valued. Squaring its absolute value keeps only the
    # (power) magnitudes, so the phase information is lost from here on. The
    # square is undone with a square root just before reconstruction.
    stft_mag = abs(stft_full)**2.0

    # Rescale to put all values in the range [0, 1]. A silent (all-zero) input
    # has a zero peak; dividing by it would fill the spectrogram with inf/NaN,
    # so in that case the data is left unscaled.
    peak = np.amax(stft_mag)
    scale = 1.0 / peak if peak > 0 else 1.0
    print('Maximum value in the magnitude spectrogram: ', peak)
    stft_mag *= scale
    print(stft_mag.shape)

    # We now have a (magnitude only) spectrogram, `stft_mag`, normalized to be
    # within [0, 1.0]. In a practical use case, we would probably want to
    # perform some processing on `stft_mag` here which would produce a
    # modified version that we would want to reconstruct audio from.
    # The 0.125 exponent is applied to the plotted copy only.
    _save_heatmap(1, stft_mag.T**0.125, 'Unmodified spectrogram',
                  'time index', 'frequency bin index',
                  'unmodified_spectrogram.png')

    # If the mel scale option is selected, apply a perceptual frequency scale.
    if args.enable_mel_scale:
        min_freq_hz = 70
        max_freq_hz = 8000
        mel_bin_count = 200
        linear_bin_count = 1 + args.fft_size // 2
        filterbank = audio_utilities.make_mel_filterbank(
            min_freq_hz, max_freq_hz, mel_bin_count, linear_bin_count,
            args.sample_rate_hz)
        _save_heatmap(2, filterbank, 'Mel scale filter bank',
                      'linear frequency index', 'mel frequency index',
                      'mel_scale_filterbank.png')

        # Project the linear-frequency spectrogram onto the mel bins.
        mel_spectrogram = np.dot(filterbank, stft_mag.T)
        clf()
        _save_heatmap(3, mel_spectrogram**0.125, 'Mel scale spectrogram',
                      'time index', 'mel frequency bin index',
                      'mel_scale_spectrogram.png')

        # Map back to linear frequency with the transposed filter bank.
        inverted_mel_to_linear_freq_spectrogram = np.dot(
            filterbank.T, mel_spectrogram)
        clf()
        _save_heatmap(
            4, inverted_mel_to_linear_freq_spectrogram**0.125,
            'Linear scale spectrogram obtained from mel scale spectrogram',
            'time index', 'frequency bin index',
            'inverted_mel_to_linear_freq_spectrogram.png')
        stft_modified = inverted_mel_to_linear_freq_spectrogram.T
    else:
        stft_modified = stft_mag

    # Re-save whichever figure is current (it shows the data now held in
    # `stft_modified`) under a fixed name.
    # NOTE(review): the collapsed original does not show whether this call sat
    # inside the `else:` branch; running it unconditionally never drops an
    # output the original produced - confirm against the upstream file.
    savefig('stft_modified.png', dpi=150)

    ###### Optional: modify the spectrogram
    # For example, we can implement a low-pass filter by simply setting all
    # frequency bins above some threshold frequency (args.cutoff_freq) to 0.
    if args.enable_filter:
        # Calculate corresponding bin index. Float division plus int() gives a
        # valid integer slice index on both Python 2 and Python 3; the clamp
        # stops a negative cutoff from being read as "count from the end".
        cutoff_bin = max(0, int(round(
            args.cutoff_freq * args.fft_size / float(args.sample_rate_hz))))
        stft_modified[:, cutoff_bin:] = 0
    ###########

    # Undo the rescaling, then the squaring, to get back plain magnitudes.
    stft_modified_scaled = (stft_modified / scale)**0.5

    # Use the Griffin&Lim algorithm to reconstruct an audio signal from the
    # magnitude spectrogram.
    x_reconstruct = audio_utilities.reconstruct_signal_griffin_lim(
        stft_modified_scaled, args.fft_size, hopsamp, args.iterations)

    # The output signal must be in the range [-1, 1], otherwise we need to
    # clip or normalize.
    max_sample = np.max(abs(x_reconstruct))
    if max_sample > 1.0:
        x_reconstruct = x_reconstruct / max_sample

    # Save the reconstructed signal to a WAV file.
    audio_utilities.save_audio_to_file(x_reconstruct, args.sample_rate_hz)

    # Save the spectrogram image also.
    clf()
    _save_heatmap(5, stft_modified.T**0.125,
                  'Spectrogram used to reconstruct audio',
                  'time index', 'frequency bin index',
                  'reconstruction_spectrogram.png')
def decode(): """Decoding the inputs using current model.""" tf.logging.info("Get TEST sets number.") num_batch = get_num_batch(FLAGS.test_list_file, infer=True) with tf.Graph().as_default(): with tf.device('/cpu:0'): with tf.name_scope('input'): data_list = read_list(FLAGS.test_list_file) test_utt_id, test_inputs, _ = get_batch( data_list, batch_size=1, input_size=FLAGS.input_dim, output_size=FLAGS.output_dim, left=FLAGS.left_context, right=FLAGS.right_context, num_enqueuing_threads=FLAGS.num_threads, num_epochs=1, infer=True) # test_inputs = tf.squeeze(test_inputs, axis=[0]) devices = [] for i in xrange(FLAGS.num_gpu): device_name = ("/gpu:%d" % i) print('Using device: ', device_name) devices.append(device_name) # Prevent exhausting all the gpu memories. config = tf.ConfigProto() config.gpu_options.per_process_gpu_memory_fraction = 0.4 #config.gpu_options.allow_growth = True config.allow_soft_placement = True set_session(tf.Session(config=config)) # execute the session with tf.Session(config=config) as sess: # Create two models with tr_inputs and cv_inputs individually. with tf.name_scope('model'): model = DNNTrainer(sess, FLAGS, devices, test_inputs, labels=None, cross_validation=True) show_all_variables() init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) print("Initializing variables ...") sess.run(init) if model.load(model.save_dir, moving_average=False): print("[*] Load Moving Average model SUCCESS") else: print("[!] Load failed. Checkpoint not found. Exit now.") sys.exit(1) coord = tf.train.Coordinator() threads = tf.train.start_queue_runners(sess=sess, coord=coord) cmvn_filename = os.path.join(FLAGS.data_dir, "train_cmvn.npz") if os.path.isfile(cmvn_filename): cmvn = np.load(cmvn_filename) else: tf.logging.fatal("%s not exist, exit now." 
% cmvn_filename) sys.exit(1) out_dir_name = os.path.join('/Work18/2017/linan/SE/my_enh', FLAGS.save_dir, FLAGS.savetestdir) # out_dir_name = os.path.join(FLAGS.save_dir, 'test') if not os.path.exists(out_dir_name): os.makedirs(out_dir_name) write_scp_path = os.path.join(out_dir_name, 'feats.scp') write_ark_path = os.path.join(out_dir_name, 'feats.ark') writer = ArkWriter(write_scp_path) outputs = model.generator(test_inputs, None, reuse=True) outputs = tf.reshape(outputs, [-1, model.output_dim]) print('shape is', np.shape(outputs)) try: for batch in range(num_batch): if coord.should_stop(): break # outputs = model.generator(test_inputs, None, reuse=True) # outputs = tf.reshape(outputs, [-1, model.output_dim]) utt_id, activations = sess.run([test_utt_id, outputs]) # sequence = activations * cmvn['stddev_labels'] + \ # cmvn['mean_labels'] sequence = activations save_result = np.vstack(sequence) dir_load = FLAGS.savetestdir dir_load = dir_load.split('/')[-1] mode = FLAGS.mode if mode == 'use_org': inputs_path = os.path.join( 'workspace/features/spectrogram/test', dir_load, '%s.wav.p' % utt_id[0]) data = cPickle.load(open(inputs_path, 'rb')) [mixed_complx_x] = data #tf.logging.info("Write inferred %s to %s" %(utt_id[0], np.shape(save_result))) save_result = np.exp(save_result) n_window = cfg.n_window s = recover_wav(save_result, mixed_complx_x, cfg.n_overlap, np.hamming) s *= np.sqrt( (np.hamming(n_window)**2 ).sum()) # Scaler for compensate the amplitude # change after spectrogram and IFFT. print("start enhance wav file") # Write out enhanced wav. 
out_path = os.path.join("workspace", "enh_wavs", "test", dir_load, "%s.enh.wav" % utt_id[0]) print("have enhanced all the wav") pp_data.create_folder(os.path.dirname(out_path)) pp_data.write_audio(out_path, s, 16000) elif mode == 'g_l': inputs_path = os.path.join( 'workspace/features/spectrogram/test', dir_load, '%s.wav.p' % utt_id[0]) data = cPickle.load(open(inputs_path, 'rb')) [mixed_complx_x] = data save_result = np.exp(save_result) s = save_result s = audio_utilities.reconstruct_signal_griffin_lim( s, mixed_complx_x, 512, 256, 15) #s = recover_wav(save_result,mixed_complx_x,cfg.n_overlap, np.hamming) s *= np.sqrt((np.hamming(cfg.n_window)**2).sum()) #s = audio._griffin_lim(s) out_path = os.path.join("workspace", "enh_wavs", "test2", dir_load, "%s.enh.wav" % utt_id[0]) pp_data.create_folder(os.path.dirname(out_path)) pp_data.write_audio(out_path, s, 16000) tf.logging.info("Write inferred%s" % (np.shape(s))) #writer.write_next_utt(write_ark_path, utt_id[0], save_result) tf.logging.info("Write inferred %s to %s" % (utt_id[0], out_path)) except Exception, e: # Report exceptions to the coordinator. coord.request_stop(e) finally:
for wav_path in wavs ] melSpectrum = np.load('mel-batch_0_sentence_0.npy') imshow(melSpectrum.T**0.125, origin='lower', cmap=cm.hot, aspect='auto', interpolation='nearest') stft_modified = melSpectrum.T filterbank = audio_utilities.make_mel_filterbank(70, 8000, 80, 123, 44100) inverted_mel_to_linear_freq_spectrogram = np.dot(filterbank.T, stft_modified) spectrogram = inverted_mel_to_linear_freq_spectrogram print("Linear spectrograms dim: ") print(spectrogram[0].shape) spectrogram = spectrogram.astype(np.float32) spectrogram = spectrogram # --------------------------------- librosa Version --------------------------------- # convert back x_reconstruct = audio_utilities.reconstruct_signal_griffin_lim( spectrogram, 244, 123, 300) max_sample = np.max(abs(x_reconstruct)) if max_sample > 1.0: x_reconstruct = x_reconstruct / max_sample audio_utilities.save_audio_to_file(x_reconstruct, 44100) print("Done!")
def run_recon():
    """Reconstruct audio from a spectrogram that is stored as an image file.

    The image named by ``--in_file`` is resized to ``1 + fft_size // 2``
    pixels wide by 640 pixels high, written to ``rescaledimage.png`` in the
    current directory, read back with ``plt.imread`` and converted to
    grayscale with ``rgb2gray``. The resulting 2-D array is passed to the
    Griffin-Lim routine as the magnitude spectrogram, and the reconstructed
    signal is peak-normalized if needed and written out through
    ``audio_utilities.save_audio_to_file``.

    Side effects: parses the process command line, prints the array shapes,
    and creates/overwrites ``rescaledimage.png``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--in_file', type=str, default="bkvhi.wav",
                        help='Input spectrogram image file')
    parser.add_argument('--sample_rate_hz', default=44100, type=int,
                        help='Sample rate in Hz')
    parser.add_argument('--fft_size', default=2048, type=int,
                        help='FFT siz')
    parser.add_argument('--iterations', default=300, type=int,
                        help='Number of iterations to run')
    # The three options below are accepted for command-line compatibility
    # with the other entry points but are not used by this function.
    parser.add_argument('--enable_filter', action='store_true',
                        help='Apply a low-pass filter')
    parser.add_argument('--enable_mel_scale', action='store_true',
                        help='Convert to mel scale and back')
    parser.add_argument(
        '--cutoff_freq', type=int, default=1000,
        help='If filter is enable, the low-pass cutoff frequency in Hz')
    args = parser.parse_args()

    # An FFT of size N yields 1 + N // 2 non-redundant frequency bins, so the
    # image width must follow --fft_size (1025 for the default of 2048).
    # The height (640) becomes the number of rows of the array handed to
    # Griffin-Lim - presumably one row per time slice; confirm against
    # audio_utilities.
    freq_bin_count = 1 + args.fft_size // 2

    # Image.ANTIALIAS was an alias of the Lanczos filter and was removed in
    # Pillow 10; fall back to it only on Pillow versions that predate LANCZOS.
    resample = getattr(Image, 'LANCZOS', None)
    if resample is None:
        resample = Image.ANTIALIAS

    rescaled_path = "rescaledimage.png"
    source_image = Image.open(args.in_file)
    source_image = source_image.resize((freq_bin_count, 640), resample)
    source_image.save(rescaled_path)

    # Round-trip through the PNG so the pixels are loaded the same way the
    # original code loaded them (via plt.imread) before the grayscale step.
    pixels = plt.imread(rescaled_path)
    print(pixels.shape)
    magnitude = rgb2gray(pixels)

    # Hop size between successive analysis windows, in samples.
    hopsamp = args.fft_size // 8
    print(magnitude.shape)

    # Use the Griffin&Lim algorithm to reconstruct an audio signal from the
    # magnitude spectrogram.
    x_reconstruct = audio_utilities.reconstruct_signal_griffin_lim(
        magnitude, args.fft_size, hopsamp, args.iterations)

    # The output signal must be in the range [-1, 1], otherwise we need to
    # clip or normalize.
    max_sample = np.max(abs(x_reconstruct))
    if max_sample > 1.0:
        x_reconstruct = x_reconstruct / max_sample

    # Save the reconstructed signal to a WAV file.
    audio_utilities.save_audio_to_file(x_reconstruct, args.sample_rate_hz)
imshow(melSpectrum.T**0.125, origin='lower', cmap=cm.hot, aspect='auto', interpolation='nearest') stft_modified=melSpectrum.T if args.enable_filter: # Calculate corresponding bin index. cutoff_bin = round(args.cutoff_freq*args.fft_size/args.sample_rate_hz) stft_modified[:, cutoff_bin:] = 0 stft_modified_scaled = stft_modified stft_modified_scaled = stft_modified_scaled**0.5 # Use the Griffin&Lim algorithm to reconstruct an audio signal from the # magnitude spectrogram. hopsamp = args.fft_size // 8 x_reconstruct = audio_utilities.reconstruct_signal_griffin_lim(stft_modified_scaled, args.fft_size, hopsamp, args.iterations) # The output signal must be in the range [-1, 1], otherwise we need to clip or normalize. max_sample = np.max(abs(x_reconstruct)) if max_sample > 1.0: x_reconstruct = x_reconstruct / max_sample # Save the reconstructed signal to a WAV file. audio_utilities.save_audio_to_file(x_reconstruct, args.sample_rate_hz) # Save the spectrogram image also. clf() figure(5) imshow(stft_modified.T**0.125, origin='lower', cmap=cm.hot, aspect='auto', interpolation='nearest') colorbar() title('Spectrogram used to reconstruct audio') xlabel('time index')