def get_stft_modified(fname, hopsamp=hopsamp, rate_hz=rate_hz, fft_size=fft_size,
                      enableMel=False, enableFilter=False):
    """Load a WAV file and return its normalized power spectrogram.

    The signal is read with `audio_utilities.get_signal`, transformed with
    `audio_utilities.stft_for_reconstruction`, converted to squared magnitude
    and rescaled so that its largest value is 1.0.

    Args:
        fname: Path of the audio file to load.
        hopsamp: Hop size passed to the STFT helper.
        rate_hz: Sample rate passed to the loader as `expected_fs`; also used
            for the mel filterbank and for the cutoff-bin computation.
        fft_size: FFT size passed to the STFT helper.
        enableMel: If true, project the spectrogram onto a 200-bin mel
            filterbank (30 Hz - 1000 Hz) and back onto the linear axis.
        enableFilter: If true, zero every frequency bin at or above 1000 Hz.

    Returns:
        A tuple `(stft_modified, scale)`. `stft_modified` is indexed as
        [time slice, frequency bin]; `scale` is the factor the raw power
        spectrogram was multiplied by (1.0 for an all-zero spectrogram).
    """
    input_signal = audio_utilities.get_signal(fname, expected_fs=rate_hz)
    stft_full = audio_utilities.stft_for_reconstruction(
        input_signal, fft_size, hopsamp)

    # Squared magnitude: the phase information is discarded here.
    stft_mag = abs(stft_full) ** 2.0

    # Guard against silent input: dividing by a zero peak would yield an
    # infinite scale and turn the whole spectrogram into NaN.
    peak = np.amax(stft_mag)
    scale = 1.0 / peak if peak > 0 else 1.0
    stft_mag *= scale

    stft_modified = stft_mag
    print('[*]stft_modified : ', stft_modified.shape)

    if enableMel:
        min_freq_hz = 30
        max_freq_hz = 1000
        mel_bin_count = 200
        linear_bin_count = 1 + fft_size // 2
        filterbank = audio_utilities.make_mel_filterbank(
            min_freq_hz, max_freq_hz, mel_bin_count, linear_bin_count, rate_hz)
        # Forward projection onto the mel axis, then back to linear bins.
        mel_spectrogram = np.dot(filterbank, stft_mag.T)
        inverted_mel_to_linear_freq_spectrogram = np.dot(
            filterbank.T, mel_spectrogram)
        stft_modified = inverted_mel_to_linear_freq_spectrogram.T

    if enableFilter:
        cutoff_freq = 1000
        # int() keeps the slice index a plain integer even when rate_hz is a
        # NumPy scalar and round() does not return a Python int.
        cutoff_bin = int(round(cutoff_freq * fft_size / rate_hz))
        # NOTE: without the mel step this zeroes `stft_mag` in place, because
        # `stft_modified` is the same array.
        stft_modified[:, cutoff_bin:] = 0

    return stft_modified, scale
def _save_spectrogram_figure(fig_num, image, heading, x_label, y_label, out_path,
                             clear_current=False):
    """Draw `image` as a heat map in figure `fig_num` and save it to `out_path`."""
    if clear_current:
        # Clears whichever figure is current *before* switching to `fig_num`.
        clf()
    figure(fig_num)
    imshow(image, origin='lower', cmap=cm.hot, aspect='auto',
           interpolation='nearest')
    colorbar()
    title(heading)
    xlabel(x_label)
    ylabel(y_label)
    savefig(out_path, dpi=150)


def run_demo():
    """Test Griffin & Lim method for reconstructing audio from a magnitude spectrogram.

    Example of using the Griffin-Lim algorithm. The input file is loaded and
    its spectrogram is computed (the phase information is discarded). Then,
    using only the spectrogram, the Griffin-Lim algorithm is run to
    reconstruct an audio signal. The reconstructed audio is saved to a file
    and several spectrogram plots are written as PNG images in the current
    directory.

    All settings are read from the command line; see the argument help text.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--in_file', type=str, default="bkvhi.wav",
                        help='Input WAV file')
    parser.add_argument('--sample_rate_hz', default=44100, type=int,
                        help='Sample rate in Hz')
    parser.add_argument('--fft_size', default=2048, type=int,
                        help='FFT size')
    parser.add_argument('--iterations', default=300, type=int,
                        help='Number of iterations to run')
    parser.add_argument('--enable_filter', action='store_true',
                        help='Apply a low-pass filter')
    parser.add_argument('--enable_mel_scale', action='store_true',
                        help='Convert to mel scale and back')
    parser.add_argument(
        '--cutoff_freq', type=int, default=1000,
        help='If the filter is enabled, the low-pass cutoff frequency in Hz')
    args = parser.parse_args()

    in_file = args.in_file

    # Load an audio file. It must be WAV format. Multi-channel files will be
    # converted to mono.
    input_signal = audio_utilities.get_signal(
        in_file, expected_fs=args.sample_rate_hz)

    # Hopsamp is the number of samples that the analysis window is shifted
    # after computing the FFT. For example, if the sample rate is 44100 Hz and
    # hopsamp is 256, then there will be approximately 44100/256 = 172 FFTs
    # computed per second and thus 172 spectral slices (i.e., columns) per
    # second in the spectrogram.
    hopsamp = args.fft_size // 8

    # Compute the Short-Time Fourier Transform (STFT) from the audio file.
    # This is a 2-dim Numpy array with time_slices rows and frequency_bins
    # columns. Thus, you will need to take the transpose of this matrix to get
    # the usual STFT which has frequency bins as rows and time slices as
    # columns.
    stft_full = audio_utilities.stft_for_reconstruction(
        input_signal, args.fft_size, hopsamp)
    print(stft_full.shape)

    # The STFT is complex-valued. Taking the squared absolute value drops the
    # phase; the square root is taken again before reconstruction below.
    stft_mag = abs(stft_full) ** 2.0

    # Rescale to put all values in the range [0, 1]. A silent input has a zero
    # peak; dividing by it would make `scale` infinite and fill the
    # spectrogram with NaN, so fall back to a neutral scale in that case.
    peak = np.amax(stft_mag)
    scale = 1.0 / peak if peak > 0 else 1.0
    print('Maximum value in the magnitude spectrogram: ', peak)
    stft_mag *= scale
    print(stft_mag.shape)

    # We now have a spectrogram, `stft_mag`, that is normalized to be within
    # [0, 1.0]. In a practical use case, we would probably want to perform
    # some processing on `stft_mag` here which would produce a modified
    # version that we would want to reconstruct audio from.
    _save_spectrogram_figure(
        1, stft_mag.T ** 0.125, 'Unmodified spectrogram',
        'time index', 'frequency bin index', 'unmodified_spectrogram.png')

    # If the mel scale option is selected, apply a perceptual frequency scale.
    if args.enable_mel_scale:
        min_freq_hz = 70
        max_freq_hz = 8000
        mel_bin_count = 200
        linear_bin_count = 1 + args.fft_size // 2
        filterbank = audio_utilities.make_mel_filterbank(
            min_freq_hz, max_freq_hz, mel_bin_count, linear_bin_count,
            args.sample_rate_hz)
        _save_spectrogram_figure(
            2, filterbank, 'Mel scale filter bank',
            'linear frequency index', 'mel frequency index',
            'mel_scale_filterbank.png')

        mel_spectrogram = np.dot(filterbank, stft_mag.T)
        _save_spectrogram_figure(
            3, mel_spectrogram ** 0.125, 'Mel scale spectrogram',
            'time index', 'mel frequency bin index',
            'mel_scale_spectrogram.png', clear_current=True)

        inverted_mel_to_linear_freq_spectrogram = np.dot(
            filterbank.T, mel_spectrogram)
        _save_spectrogram_figure(
            4, inverted_mel_to_linear_freq_spectrogram ** 0.125,
            'Linear scale spectrogram obtained from mel scale spectrogram',
            'time index', 'frequency bin index',
            'inverted_mel_to_linear_freq_spectrogram.png', clear_current=True)

        stft_modified = inverted_mel_to_linear_freq_spectrogram.T
    else:
        stft_modified = stft_mag
        # Saves the figure that is current at this point (figure 1) under a
        # second file name.
        savefig('stft_modified.png', dpi=150)

    ###### Optional: modify the spectrogram
    # For example, we can implement a low-pass filter by simply setting all
    # frequency bins above some threshold frequency (args.cutoff_freq) to 0 as
    # follows.
    if args.enable_filter:
        # Calculate corresponding bin index; int() guarantees a valid slice
        # index regardless of what round() returns.
        cutoff_bin = int(round(
            args.cutoff_freq * args.fft_size / args.sample_rate_hz))
        stft_modified[:, cutoff_bin:] = 0
    ###########

    # Undo the rescaling, then take the square root to go from the squared
    # magnitude back to the magnitude.
    stft_modified_scaled = (stft_modified / scale) ** 0.5

    # Use the Griffin&Lim algorithm to reconstruct an audio signal from the
    # magnitude spectrogram.
    x_reconstruct = audio_utilities.reconstruct_signal_griffin_lim(
        stft_modified_scaled, args.fft_size, hopsamp, args.iterations)

    # The output signal must be in the range [-1, 1], otherwise we need to
    # clip or normalize.
    max_sample = np.max(abs(x_reconstruct))
    if max_sample > 1.0:
        x_reconstruct = x_reconstruct / max_sample

    # Save the reconstructed signal to a WAV file.
    audio_utilities.save_audio_to_file(x_reconstruct, args.sample_rate_hz)

    # Save the spectrogram image also.
    _save_spectrogram_figure(
        5, stft_modified.T ** 0.125, 'Spectrogram used to reconstruct audio',
        'time index', 'frequency bin index', 'reconstruction_spectrogram.png',
        clear_current=True)
data = {} print("Loading audio...") fft_size = 2048 sample_rate = 8000 max_len = 80 mel = False for root, dirs, files in os.walk("./audio"): if len(files) == 0: continue group = os.path.split(root)[-1] if group == "audio": continue data[group] = [] for f in sorted(files): if f.endswith(".wav"): input_signal = audio_utilities.get_signal(os.path.join(root, f), expected_fs=sample_rate) # Hopsamp is the number of samples that the analysis window is shifted after # computing the FFT. For example, if the sample rate is 44100 Hz and hopsamp is # 256, then there will be approximately 44100/256 = 172 FFTs computed per second # and thus 172 spectral slices (i.e., columns) per second in the spectrogram. hopsamp = fft_size // 8 # Compute the Short-Time Fourier Transform (STFT) from the audio file. This is a 2-dim Numpy array with # time_slices rows and frequency_bins columns. Thus, you will need to take the # transpose of this matrix to get the usual STFT which has frequency bins as rows # and time slices as columns. stft_full = audio_utilities.stft_for_reconstruction( input_signal, fft_size, hopsamp) # If maximum length exceeds mfcc lengths then pad the remaining ones if (max_len > stft_full.shape[0]):
'--pad_length', type=int, help='if defined pad up to this length (in frame) or if longer, fail') args = parser.parse_args() in_file = args.in_file out_file_prefix = args.out_file_prefix if out_file_prefix is None or out_file_prefix.endswith(".wav"): out_file_prefix = in_file[:-4] if args.out_file_dir is not None: out_file_prefix = os.path.join(args.out_file_dir, out_file_prefix) # Load an audio file. It must be WAV format. Multi-channel files will be # converted to mono. input_signal = audio_utilities.get_signal(in_file, expected_fs=args.sample_rate_hz) print("processing %s - signal %fs long" % (in_file, len(input_signal) * 1.0 / args.sample_rate_hz)) if args.pad_length: if len(input_signal) > args.pad_length: print("signal too long... skipping") sys.exit(0) else: max = np.amax(input_signal) whitenoise = np.random.normal(0, max / 100, size=args.pad_length - len(input_signal)) input_signal = np.concatenate((input_signal, whitenoise)) # Hopsamp is the number of samples that the analysis window is shifted after