def get_stft_modified(fname, hopsamp=hopsamp, rate_hz=rate_hz, fft_size=fft_size,
                      enableMel=False, enableFilter=False):
    """Load an audio file and return its peak-normalized power spectrogram.

    The signal is read with `audio_utilities.get_signal`, transformed with
    `audio_utilities.stft_for_reconstruction`, converted to a power
    spectrogram (squared modulus, so the phase is discarded) and rescaled by
    the reciprocal of its largest value.

    Args:
        fname: Path of the audio file to load.
        hopsamp: Hop size passed to `stft_for_reconstruction`.
        rate_hz: Sample rate in Hz expected of the file; also used to build
            the mel filter bank and to convert the cutoff frequency to a bin.
        fft_size: FFT size passed to `stft_for_reconstruction`.
        enableMel: If true, project the spectrogram onto a 200-band mel
            filter bank (30 Hz to 1000 Hz) and back onto the linear bins.
        enableFilter: If true, zero every column from the 1000 Hz bin upward.

    Returns:
        A `(stft_modified, scale)` tuple. Dividing `stft_modified` by `scale`
        undoes the normalization. `scale` is 1.0 when the spectrogram has no
        positive peak (e.g. a silent file), so that this division stays
        well defined.
    """
    input_signal = audio_utilities.get_signal(fname, expected_fs=rate_hz)
    stft_full = audio_utilities.stft_for_reconstruction(
        input_signal, fft_size, hopsamp)

    # The STFT is complex-valued; keep only the squared magnitude.
    stft_mag = abs(stft_full)**2.0

    # Guard the normalization: an all-zero spectrogram would otherwise give
    # scale = inf and turn every value into NaN (0 * inf).
    peak = np.amax(stft_mag)
    scale = 1.0 / peak if peak > 0 else 1.0
    stft_mag *= scale

    stft_modified = stft_mag
    print('[*]stft_modified : ', stft_modified.shape)

    if enableMel:
        min_freq_hz = 30
        max_freq_hz = 1000
        mel_bin_count = 200
        linear_bin_count = 1 + fft_size // 2
        filterbank = audio_utilities.make_mel_filterbank(
            min_freq_hz, max_freq_hz, mel_bin_count, linear_bin_count, rate_hz)
        # Linear -> mel -> linear round trip; the transposes put frequency on
        # the first axis for the matrix products and then restore the layout.
        mel_spectrogram = np.dot(filterbank, stft_mag.T)
        inverted_mel_to_linear_freq_spectrogram = np.dot(
            filterbank.T, mel_spectrogram)
        stft_modified = inverted_mel_to_linear_freq_spectrogram.T

    if enableFilter:
        cutoff_freq = 1000
        # int() guarantees a valid slice index even if round() hands back a
        # float (possible when the operands are NumPy scalars).
        cutoff_bin = int(round(cutoff_freq * fft_size / rate_hz))
        # Low-pass: the second axis is indexed by frequency bin here.
        stft_modified[:, cutoff_bin:] = 0

    return stft_modified, scale
def _save_heatmap(fig_num, image, heading, x_label, y_label, out_path):
    """Draw `image` as a heat map on figure `fig_num` and save it to `out_path`."""
    figure(fig_num)
    imshow(image, origin='lower', cmap=cm.hot, aspect='auto',
           interpolation='nearest')
    colorbar()
    title(heading)
    xlabel(x_label)
    ylabel(y_label)
    savefig(out_path, dpi=150)


def run_demo():
    """Test Griffin & Lim method for reconstructing audio from a magnitude spectrogram.

    Example of using the Griffin-Lim algorithm. The input file is loaded and
    its spectrogram is computed (the phase information is discarded). Then,
    using only the magnitude spectrogram, the Griffin-Lim algorithm is run to
    reconstruct an audio signal. The reconstructed audio is saved to a file
    and the intermediate spectrograms are saved as PNG images.

    All options are read from the command line; see the argparse definitions
    below. The function returns nothing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--in_file', type=str, default="bkvhi.wav",
                        help='Input WAV file')
    parser.add_argument('--sample_rate_hz', default=44100, type=int,
                        help='Sample rate in Hz')
    parser.add_argument('--fft_size', default=2048, type=int,
                        help='FFT siz')
    parser.add_argument('--iterations', default=300, type=int,
                        help='Number of iterations to run')
    parser.add_argument('--enable_filter', action='store_true',
                        help='Apply a low-pass filter')
    parser.add_argument('--enable_mel_scale', action='store_true',
                        help='Convert to mel scale and back')
    parser.add_argument(
        '--cutoff_freq', type=int, default=1000,
        help='If filter is enable, the low-pass cutoff frequency in Hz')
    args = parser.parse_args()

    in_file = args.in_file

    # Load an audio file. It must be WAV format. Multi-channel files will be
    # converted to mono.
    input_signal = audio_utilities.get_signal(
        in_file, expected_fs=args.sample_rate_hz)

    # Hopsamp is the number of samples that the analysis window is shifted
    # after computing the FFT. For example, if the sample rate is 44100 Hz and
    # hopsamp is 256, then there will be approximately 44100/256 = 172 FFTs
    # computed per second and thus 172 spectral slices (i.e., columns) per
    # second in the spectrogram.
    hopsamp = args.fft_size // 8

    # Compute the Short-Time Fourier Transform (STFT) from the audio file.
    # This is a 2-dim Numpy array with time_slices rows and frequency_bins
    # columns. Thus, you will need to take the transpose of this matrix to get
    # the usual STFT which has frequency bins as rows and time slices as
    # columns.
    stft_full = audio_utilities.stft_for_reconstruction(
        input_signal, args.fft_size, hopsamp)
    print(stft_full.shape)

    # The STFT is complex-valued, so take the squared absolute value to get
    # the spectrogram. From here on the phase information is lost.
    stft_mag = abs(stft_full)**2.0

    # Rescale to put all values in the range [0, 1]. A spectrogram without a
    # positive peak (e.g. a silent file) is left unscaled: dividing by a zero
    # peak would produce an infinite scale and NaN values.
    peak = np.amax(stft_mag)
    scale = 1.0 / peak if peak > 0 else 1.0
    print('Maximum value in the magnitude spectrogram: ', peak)
    stft_mag *= scale
    print(stft_mag.shape)

    # We now have a (magnitude only) spectrogram, `stft_mag`, normalized to
    # [0, 1.0]. In a practical use case, some processing would be applied to
    # `stft_mag` here to produce the version to reconstruct audio from.
    _save_heatmap(1, stft_mag.T**0.125, 'Unmodified spectrogram',
                  'time index', 'frequency bin index',
                  'unmodified_spectrogram.png')

    # If the mel scale option is selected, apply a perceptual frequency scale.
    if args.enable_mel_scale:
        min_freq_hz = 70
        max_freq_hz = 8000
        mel_bin_count = 200
        linear_bin_count = 1 + args.fft_size // 2
        filterbank = audio_utilities.make_mel_filterbank(
            min_freq_hz, max_freq_hz, mel_bin_count, linear_bin_count,
            args.sample_rate_hz)
        _save_heatmap(2, filterbank, 'Mel scale filter bank',
                      'linear frequency index', 'mel frequency index',
                      'mel_scale_filterbank.png')

        mel_spectrogram = np.dot(filterbank, stft_mag.T)
        clf()
        _save_heatmap(3, mel_spectrogram**0.125, 'Mel scale spectrogram',
                      'time index', 'mel frequency bin index',
                      'mel_scale_spectrogram.png')

        inverted_mel_to_linear_freq_spectrogram = np.dot(
            filterbank.T, mel_spectrogram)
        clf()
        _save_heatmap(
            4, inverted_mel_to_linear_freq_spectrogram**0.125,
            'Linear scale spectrogram obtained from mel scale spectrogram',
            'time index', 'frequency bin index',
            'inverted_mel_to_linear_freq_spectrogram.png')

        stft_modified = inverted_mel_to_linear_freq_spectrogram.T
    else:
        stft_modified = stft_mag

    # NOTE(review): this saves whichever figure is current at this point; no
    # new image is drawn for it. Confirm that this is the intended content of
    # 'stft_modified.png'.
    savefig('stft_modified.png', dpi=150)

    # Optional: modify the spectrogram.
    # For example, we can implement a low-pass filter by simply setting all
    # frequency bins above some threshold frequency (args.cutoff_freq) to 0.
    if args.enable_filter:
        # Calculate the corresponding bin index; int() guarantees a valid
        # slice index regardless of what round() returns.
        cutoff_bin = int(round(
            args.cutoff_freq * args.fft_size / args.sample_rate_hz))
        stft_modified[:, cutoff_bin:] = 0

    # Undo the rescaling, then take the square root to go from the power
    # spectrogram back to magnitudes.
    stft_modified_scaled = (stft_modified / scale)**0.5

    # Use the Griffin&Lim algorithm to reconstruct an audio signal from the
    # magnitude spectrogram.
    x_reconstruct = audio_utilities.reconstruct_signal_griffin_lim(
        stft_modified_scaled, args.fft_size, hopsamp, args.iterations)

    # The output signal must be in the range [-1, 1], otherwise we need to
    # clip or normalize.
    max_sample = np.max(abs(x_reconstruct))
    if max_sample > 1.0:
        x_reconstruct = x_reconstruct / max_sample

    # Save the reconstructed signal to a WAV file.
    audio_utilities.save_audio_to_file(x_reconstruct, args.sample_rate_hz)

    # Save the spectrogram image also.
    clf()
    _save_heatmap(5, stft_modified.T**0.125,
                  'Spectrogram used to reconstruct audio',
                  'time index', 'frequency bin index',
                  'reconstruction_spectrogram.png')
stft_pad = stft_full[:max_len, :] # Note that the STFT is complex-valued. Therefore, to get the (magnitude) # spectrogram, we need to take the absolute value. stft_mag = abs(stft_pad)**2.0 # Note that `stft_mag` only contains the magnitudes and so we have lost the # phase information. scale = 1.0 / np.amax(stft_mag) # print('Maximum value in the magnitude spectrogram: ', 1/scale) # Rescale to put all values in the range [0, 1]. stft_mag *= scale if mel: min_freq_hz = 70 max_freq_hz = 8000 mel_bin_count = 200 linear_bin_count = 1 + fft_size // 2 filterbank = audio_utilities.make_mel_filterbank( min_freq_hz, max_freq_hz, mel_bin_count, linear_bin_count, sample_rate) mel_spectrogram = np.dot(filterbank, stft_mag.T) inverted_mel_to_linear_freq_spectrogram = np.dot( filterbank.T, mel_spectrogram) strf_mag = inverted_mel_to_linear_freq_spectrogram.T print(strf_mag.shape) data[group].append((stft_mag, scale)) if not os.path.exists("cache"): os.mkdir("cache") np.save(os.path.join("cache", group + '.npy'), data[group])
outputs_py = [file + ".py.gen.wav" for file in wavs] outputs_tf = [file + ".tf.gen.wav" for file in wavs] wavs = [ audio.load_wav(wav_path + ".wav", hparams.sample_rate) for wav_path in wavs ] melSpectrum = np.load('mel-batch_0_sentence_0.npy') imshow(melSpectrum.T**0.125, origin='lower', cmap=cm.hot, aspect='auto', interpolation='nearest') stft_modified = melSpectrum.T filterbank = audio_utilities.make_mel_filterbank(70, 8000, 80, 123, 44100) inverted_mel_to_linear_freq_spectrogram = np.dot(filterbank.T, stft_modified) spectrogram = inverted_mel_to_linear_freq_spectrogram print("Linear spectrograms dim: ") print(spectrogram[0].shape) spectrogram = spectrogram.astype(np.float32) spectrogram = spectrogram # --------------------------------- librosa Version --------------------------------- # convert back x_reconstruct = audio_utilities.reconstruct_signal_griffin_lim( spectrogram, 244, 123, 300) max_sample = np.max(abs(x_reconstruct)) if max_sample > 1.0:
# Compute the Short-Time Fourier Transform (STFT) from the audio file. This is a 2-dim Numpy array with # time_slices rows and frequency_bins columns. Thus, you will need to take the # transpose of this matrix to get the usual STFT which has frequency bins as rows # and time slices as columns. stft_full = audio_utilities.stft_for_reconstruction(input_signal, args.fft_size, hopsamp) # Note that the STFT is complex-valued. Therefore, to get the (magnitude) # spectrogram, we need to take the absolute value. stft_mag = np.abs(stft_full)**2 linear_bin_count = 1 + args.fft_size // 2 filterbank = audio_utilities.make_mel_filterbank(args.min_freq_hz, args.max_freq_hz, args.mel_bin_count, linear_bin_count, args.sample_rate_hz) mel_spectrogram = np.dot(filterbank, stft_mag.T) # convert to decibels mel_spectrogram = np.abs(20. * np.log10(mel_spectrogram / 10e-6)) scale_mel = 1.0 / np.amax(mel_spectrogram) # Rescale to put all values in the range [0, 1]. mel_spectrogram *= scale_mel mel_spectrogram = np.flip(mel_spectrogram, 0) print('==>', out_file_prefix) imageio.imwrite(out_file_prefix + "_spectogram_bw.png",