Exemple #1
0
def get_stft_modified(fname,
                      hopsamp=hopsamp,
                      rate_hz=rate_hz,
                      fft_size=fft_size,
                      enableMel=False,
                      enableFilter=False,
                      cutoff_freq=1000):
    """Load an audio file and return its normalized power spectrogram.

    The signal is read with ``audio_utilities.get_signal``, transformed with
    ``audio_utilities.stft_for_reconstruction``, squared in magnitude (phase
    is discarded) and rescaled so that its largest value is 1.0.

    Args:
        fname: Path of the audio file to load.
        hopsamp: Hop size passed to the STFT helper.
        rate_hz: Sample rate passed to ``get_signal`` as ``expected_fs`` and
            used to convert frequencies to bin indices.
        fft_size: FFT size passed to the STFT helper.
        enableMel: If true, project the spectrogram onto a 200-band mel
            filterbank (30 Hz to 1000 Hz) and back through the filterbank's
            transpose, replacing the spectrogram with that approximation.
        enableFilter: If true, zero every column at or above the bin that
            corresponds to ``cutoff_freq``.
        cutoff_freq: Low-pass cutoff in Hz, only used when ``enableFilter``
            is true. Defaults to the previously hard-coded 1000 Hz.

    Returns:
        A ``(stft_modified, scale)`` tuple, where ``scale`` is the factor the
        power spectrogram was multiplied by; divide by it to undo the
        normalization.
    """
    input_signal = audio_utilities.get_signal(fname, expected_fs=rate_hz)
    stft_full = audio_utilities.stft_for_reconstruction(
        input_signal, fft_size, hopsamp)

    # Squared magnitude: the phase information is dropped here.
    stft_mag = abs(stft_full)**2.0

    # An all-zero spectrogram would give an infinite scale and turn every
    # value into NaN; leave such input unscaled instead.
    peak = np.amax(stft_mag)
    scale = 1.0 / peak if peak > 0 else 1.0

    stft_mag *= scale
    stft_modified = stft_mag

    print('[*]stft_modified : ', stft_modified.shape)

    if enableMel:
        min_freq_hz = 30
        max_freq_hz = 1000
        mel_bin_count = 200
        linear_bin_count = 1 + fft_size // 2
        filterbank = audio_utilities.make_mel_filterbank(
            min_freq_hz, max_freq_hz, mel_bin_count, linear_bin_count, rate_hz)

        # Forward projection onto the mel bands, then back to linear bins
        # using the transposed filterbank.
        mel_spectrogram = np.dot(filterbank, stft_mag.T)
        inverted_mel_to_linear_freq_spectrogram = np.dot(
            filterbank.T, mel_spectrogram)

        stft_modified = inverted_mel_to_linear_freq_spectrogram.T

    if enableFilter:
        # int() keeps the slice index an integer regardless of what round()
        # returns for the operand types involved.
        # NOTE(review): assumes axis 1 indexes frequency bins -- confirm
        # against stft_for_reconstruction.
        cutoff_bin = int(round(cutoff_freq * fft_size / rate_hz))
        stft_modified[:, cutoff_bin:] = 0

    return stft_modified, scale
Exemple #2
0
def _save_heatmap(fig_num, data, heading, x_label, y_label, out_path):
    """Draw `data` as a hot-colormap image on figure `fig_num` and save it to `out_path`."""
    figure(fig_num)
    imshow(data,
           origin='lower',
           cmap=cm.hot,
           aspect='auto',
           interpolation='nearest')
    colorbar()
    title(heading)
    xlabel(x_label)
    ylabel(y_label)
    savefig(out_path, dpi=150)


def run_demo():
    """Test Griffin & Lim method for reconstructing audio from a magnitude spectrogram.

        Example of using the Griffin-Lim algorithm. The input file is loaded, the
        spectrogram is computed (note that we discard the phase information). Then,
        using only the (magnitude) spectrogram, the Griffin-Lim algorithm is run
        to reconstruct an audio signal from the spectrogram. The reconstructed audio
        is finally saved to a file.

        Plots of the intermediate spectrograms are saved as PNG files in the
        current working directory.

        All settings are read from the command line; see the argument parser
        below for the available options and their defaults.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--in_file',
                        type=str,
                        default="bkvhi.wav",
                        help='Input WAV file')
    parser.add_argument('--sample_rate_hz',
                        default=44100,
                        type=int,
                        help='Sample rate in Hz')
    parser.add_argument('--fft_size', default=2048, type=int, help='FFT size')
    parser.add_argument('--iterations',
                        default=300,
                        type=int,
                        help='Number of iterations to run')
    parser.add_argument('--enable_filter',
                        action='store_true',
                        help='Apply a low-pass filter')
    parser.add_argument('--enable_mel_scale',
                        action='store_true',
                        help='Convert to mel scale and back')
    parser.add_argument(
        '--cutoff_freq',
        type=int,
        default=1000,
        help='If filter is enabled, the low-pass cutoff frequency in Hz')
    args = parser.parse_args()

    in_file = args.in_file

    # Load an audio file. It must be WAV format. Multi-channel files will be
    # converted to mono.
    input_signal = audio_utilities.get_signal(in_file,
                                              expected_fs=args.sample_rate_hz)

    # Hopsamp is the number of samples that the analysis window is shifted after
    # computing the FFT. For example, if the sample rate is 44100 Hz and hopsamp is
    # 256, then there will be approximately 44100/256 = 172 FFTs computed per second
    # and thus 172 spectral slices (i.e., columns) per second in the spectrogram.
    hopsamp = args.fft_size // 8

    # Compute the Short-Time Fourier Transform (STFT) from the audio file. This is a 2-dim Numpy array with
    # time_slices rows and frequency_bins columns. Thus, you will need to take the
    # transpose of this matrix to get the usual STFT which has frequency bins as rows
    # and time slices as columns.
    stft_full = audio_utilities.stft_for_reconstruction(
        input_signal, args.fft_size, hopsamp)
    print(stft_full.shape)

    # The STFT is complex-valued. Squaring its absolute value gives the power
    # spectrogram; the phase information is lost at this point.
    stft_mag = abs(stft_full)**2.0

    # Rescale to put all values in the range [0, 1]. An all-zero spectrogram
    # would give an infinite scale and fill the array with NaN, so it is left
    # unscaled instead.
    peak = np.amax(stft_mag)
    scale = 1.0 / peak if peak > 0 else 1.0
    print('Maximum value in the magnitude spectrogram: ', peak)
    stft_mag *= scale
    print(stft_mag.shape)

    # We now have a spectrogram, `stft_mag`, that is normalized to be within [0, 1.0].
    # In a practical use case, we would probably want to perform some processing on `stft_mag` here
    # which would produce a modified version that we would want to reconstruct audio from.
    _save_heatmap(1, stft_mag.T**0.125, 'Unmodified spectrogram',
                  'time index', 'frequency bin index',
                  'unmodified_spectrogram.png')

    # If the mel scale option is selected, apply a perceptual frequency scale.
    if args.enable_mel_scale:
        min_freq_hz = 70
        max_freq_hz = 8000
        mel_bin_count = 200

        linear_bin_count = 1 + args.fft_size // 2
        filterbank = audio_utilities.make_mel_filterbank(
            min_freq_hz, max_freq_hz, mel_bin_count, linear_bin_count,
            args.sample_rate_hz)
        _save_heatmap(2, filterbank, 'Mel scale filter bank',
                      'linear frequency index', 'mel frequency index',
                      'mel_scale_filterbank.png')

        mel_spectrogram = np.dot(filterbank, stft_mag.T)

        # clf() clears the figure that is current at this point (the one just
        # saved) before the next figure is selected.
        clf()
        _save_heatmap(3, mel_spectrogram**0.125, 'Mel scale spectrogram',
                      'time index', 'mel frequency bin index',
                      'mel_scale_spectrogram.png')

        # Map the mel bands back to linear frequency bins with the transposed
        # filterbank.
        inverted_mel_to_linear_freq_spectrogram = np.dot(
            filterbank.T, mel_spectrogram)

        clf()
        _save_heatmap(
            4, inverted_mel_to_linear_freq_spectrogram**0.125,
            'Linear scale spectrogram obtained from mel scale spectrogram',
            'time index', 'frequency bin index',
            'inverted_mel_to_linear_freq_spectrogram.png')

        stft_modified = inverted_mel_to_linear_freq_spectrogram.T
    else:
        stft_modified = stft_mag

    # Saves whichever figure is current: figure 4 when the mel option is on,
    # figure 1 otherwise.
    savefig('stft_modified.png', dpi=150)

    ###### Optional: modify the spectrogram
    # For example, we can implement a low-pass filter by simply setting all frequency bins above
    # some threshold frequency (args.cutoff_freq) to 0 as follows.
    if args.enable_filter:
        # Calculate corresponding bin index. int() keeps the slice index an
        # integer regardless of what round() returns.
        cutoff_bin = int(
            round(args.cutoff_freq * args.fft_size / args.sample_rate_hz))
        stft_modified[:, cutoff_bin:] = 0
    ###########

    # Undo the rescaling, then take the square root to go from power back to
    # magnitude.
    stft_modified_scaled = stft_modified / scale
    stft_modified_scaled = stft_modified_scaled**0.5

    # Use the Griffin&Lim algorithm to reconstruct an audio signal from the
    # magnitude spectrogram.
    x_reconstruct = audio_utilities.reconstruct_signal_griffin_lim(
        stft_modified_scaled, args.fft_size, hopsamp, args.iterations)

    # The output signal must be in the range [-1, 1], otherwise we need to clip or normalize.
    max_sample = np.max(abs(x_reconstruct))
    if max_sample > 1.0:
        x_reconstruct = x_reconstruct / max_sample

    # Save the reconstructed signal to a WAV file.
    # NOTE(review): the output path is chosen inside save_audio_to_file --
    # confirm where it writes.
    audio_utilities.save_audio_to_file(x_reconstruct, args.sample_rate_hz)

    # Save the spectrogram image also.
    clf()
    _save_heatmap(5, stft_modified.T**0.125,
                  'Spectrogram used to reconstruct audio', 'time index',
                  'frequency bin index', 'reconstruction_spectrogram.png')
Exemple #3
0
data = {}
print("Loading audio...")
fft_size = 2048
sample_rate = 8000
max_len = 80
mel = False
for root, dirs, files in os.walk("./audio"):
    if len(files) == 0:
        continue
    group = os.path.split(root)[-1]
    if group == "audio":
        continue
    data[group] = []
    for f in sorted(files):
        if f.endswith(".wav"):
            input_signal = audio_utilities.get_signal(os.path.join(root, f),
                                                      expected_fs=sample_rate)
            # Hopsamp is the number of samples that the analysis window is shifted after
            # computing the FFT. For example, if the sample rate is 44100 Hz and hopsamp is
            # 256, then there will be approximately 44100/256 = 172 FFTs computed per second
            # and thus 172 spectral slices (i.e., columns) per second in the spectrogram.
            hopsamp = fft_size // 8

            # Compute the Short-Time Fourier Transform (STFT) from the audio file. This is a 2-dim Numpy array with
            # time_slices rows and frequency_bins columns. Thus, you will need to take the
            # transpose of this matrix to get the usual STFT which has frequency bins as rows
            # and time slices as columns.
            stft_full = audio_utilities.stft_for_reconstruction(
                input_signal, fft_size, hopsamp)

            # If max_len exceeds the number of STFT time slices, pad the remaining ones
            if (max_len > stft_full.shape[0]):
    '--pad_length',
    type=int,
    help='if defined pad up to this length (in frame) or if longer, fail')
args = parser.parse_args()

in_file = args.in_file
out_file_prefix = args.out_file_prefix
# With no usable prefix (missing, or one that still ends in ".wav"), derive it
# from the input path by dropping its last four characters, i.e. a ".wav"
# extension.
if out_file_prefix is None or out_file_prefix.endswith(".wav"):
    out_file_prefix = in_file[:-4]

if args.out_file_dir is not None:
    out_file_prefix = os.path.join(args.out_file_dir, out_file_prefix)

# Load an audio file. It must be WAV format. Multi-channel files will be
# converted to mono.
input_signal = audio_utilities.get_signal(in_file,
                                          expected_fs=args.sample_rate_hz)

print("processing %s - signal %fs long" %
      (in_file, len(input_signal) * 1.0 / args.sample_rate_hz))
if args.pad_length:
    # NOTE(review): the --pad_length help text speaks of frames, but the value
    # is compared against the number of samples here -- confirm the unit.
    if len(input_signal) > args.pad_length:
        print("signal too long... skipping")
        sys.exit(0)
    else:
        # Pad with low-level Gaussian noise whose standard deviation is 1% of
        # the largest sample. The value is kept out of the name `max` so the
        # builtin stays usable, and abs() keeps the standard deviation
        # non-negative, since np.random.normal rejects a negative scale.
        peak_sample = np.amax(input_signal)
        whitenoise = np.random.normal(0,
                                      abs(peak_sample) / 100,
                                      size=args.pad_length - len(input_signal))
        input_signal = np.concatenate((input_signal, whitenoise))

# Hopsamp is the number of samples that the analysis window is shifted after