Example #1
def get_wav_from_stft(stft_modified,
                      rate_hz,
                      modified_scale,
                      fft_size,
                      hopsamp,
                      iterations=2000,
                      outfile=None):

    # Undo the normalization applied when the spectrogram was created, then take
    # the square root to convert the (power) spectrogram back to a magnitude
    # spectrogram for Griffin-Lim.
    stft_modified_scaled = stft_modified / modified_scale
    stft_modified_scaled = stft_modified_scaled**0.5

    x_reconstruct = audio_utilities.reconstruct_signal_griffin_lim(
        stft_modified_scaled,
        fft_size=fft_size,
        hopsamp=hopsamp,
        iterations=iterations)

    # The output signal must be in the range [-1, 1]; otherwise normalize it.
    max_sample = np.max(abs(x_reconstruct))
    if max_sample > 1.0:
        x_reconstruct = x_reconstruct / max_sample

    # Save the reconstructed signal to a WAV file.
    audio_utilities.save_audio_to_file(x_reconstruct, rate_hz, outfile=outfile)

    return stft_modified_scaled
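A hypothetical call of the function above, shown only to illustrate the expected input: a normalized power spectrogram laid out as time_slices rows by 1 + fft_size // 2 frequency-bin columns (see Example #2). The synthetic spectrogram, file name, and iteration count below are placeholders, not values from the original code.

# Hypothetical usage sketch; audio_utilities and get_wav_from_stft must be importable.
import numpy as np

fft_size = 2048
hopsamp = fft_size // 8
stft_power = np.random.rand(400, 1 + fft_size // 2)   # fake normalized power spectrogram
scale = 1.0 / np.amax(stft_power)

get_wav_from_stft(stft_power * scale,
                  rate_hz=44100,
                  modified_scale=scale,
                  fft_size=fft_size,
                  hopsamp=hopsamp,
                  iterations=200,
                  outfile='reconstructed.wav')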
Example #2
def run_demo():
    """Test Griffin & Lim method for reconstructing audio from a magnitude spectrogram.

        Example of using the Griffin-Lim algorithm. The input file is loaded, the
        spectrogram is computed (note that we discard the phase information). Then,
        using only the (magnitude) spectrogram, the Griffin-Lim algorithm is run
        to reconstruct an audio signal from the spectrogram. The reconstructed audio
        is finally saved to a file.

        A plot of the spectrogram is also displayed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--in_file',
                        type=str,
                        default="bkvhi.wav",
                        help='Input WAV file')
    parser.add_argument('--sample_rate_hz',
                        default=44100,
                        type=int,
                        help='Sample rate in Hz')
    parser.add_argument('--fft_size', default=2048, type=int, help='FFT size')
    parser.add_argument('--iterations',
                        default=300,
                        type=int,
                        help='Number of iterations to run')
    parser.add_argument('--enable_filter',
                        action='store_true',
                        help='Apply a low-pass filter')
    parser.add_argument('--enable_mel_scale',
                        action='store_true',
                        help='Convert to mel scale and back')
    parser.add_argument(
        '--cutoff_freq',
        type=int,
        default=1000,
        help='If the filter is enabled, the low-pass cutoff frequency in Hz')
    args = parser.parse_args()

    in_file = args.in_file

    # Load an audio file. It must be WAV format. Multi-channel files will be
    # converted to mono.
    input_signal = audio_utilities.get_signal(in_file,
                                              expected_fs=args.sample_rate_hz)

    # Hopsamp is the number of samples that the analysis window is shifted after
    # computing the FFT. For example, if the sample rate is 44100 Hz and hopsamp is
    # 256, then there will be approximately 44100/256 = 172 FFTs computed per second
    # and thus 172 spectral slices (i.e., columns) per second in the spectrogram.
    hopsamp = args.fft_size // 8

    # Compute the Short-Time Fourier Transform (STFT) from the audio file. This is a 2-dim Numpy array with
    # time_slices rows and frequency_bins columns. Thus, you will need to take the
    # transpose of this matrix to get the usual STFT which has frequency bins as rows
    # and time slices as columns.
    stft_full = audio_utilities.stft_for_reconstruction(
        input_signal, args.fft_size, hopsamp)
    # Note that the STFT is complex-valued. Therefore, to get the (power)
    # spectrogram, we take the squared magnitude (absolute value squared).
    print(stft_full.shape)
    stft_mag = abs(stft_full)**2.0
    # Note that `stft_mag` only contains the magnitudes and so we have lost the
    # phase information.
    scale = 1.0 / np.amax(stft_mag)
    print('Maximum value in the magnitude spectrogram: ', 1 / scale)
    # Rescale to put all values in the range [0, 1].
    stft_mag *= scale
    print(stft_mag.shape)
    # We now have a (magnitude only) spectrogram, `stft_mag` that is normalized to be within [0, 1.0].
    # In a practical use case, we would probably want to perform some processing on `stft_mag` here
    # which would produce a modified version that we would want to reconstruct audio from.
    figure(1)
    imshow(stft_mag.T**0.125,
           origin='lower',
           cmap=cm.hot,
           aspect='auto',
           interpolation='nearest')
    colorbar()
    title('Unmodified spectrogram')
    xlabel('time index')
    ylabel('frequency bin index')
    savefig('unmodified_spectrogram.png', dpi=150)

    # If the mel scale option is selected, apply a perceptual frequency scale.
    if args.enable_mel_scale:
        min_freq_hz = 70
        max_freq_hz = 8000
        mel_bin_count = 200

        linear_bin_count = 1 + args.fft_size // 2
        filterbank = audio_utilities.make_mel_filterbank(
            min_freq_hz, max_freq_hz, mel_bin_count, linear_bin_count,
            args.sample_rate_hz)
        figure(2)
        imshow(filterbank,
               origin='lower',
               cmap=cm.hot,
               aspect='auto',
               interpolation='nearest')
        colorbar()
        title('Mel scale filter bank')
        xlabel('linear frequency index')
        ylabel('mel frequency index')
        savefig('mel_scale_filterbank.png', dpi=150)

        mel_spectrogram = np.dot(filterbank, stft_mag.T)

        clf()
        figure(3)
        imshow(mel_spectrogram**0.125,
               origin='lower',
               cmap=cm.hot,
               aspect='auto',
               interpolation='nearest')
        colorbar()
        title('Mel scale spectrogram')
        xlabel('time index')
        ylabel('mel frequency bin index')
        savefig('mel_scale_spectrogram.png', dpi=150)

        inverted_mel_to_linear_freq_spectrogram = np.dot(
            filterbank.T, mel_spectrogram)

        clf()
        figure(4)
        imshow(inverted_mel_to_linear_freq_spectrogram**0.125,
               origin='lower',
               cmap=cm.hot,
               aspect='auto',
               interpolation='nearest')
        colorbar()
        title('Linear scale spectrogram obtained from mel scale spectrogram')
        xlabel('time index')
        ylabel('frequency bin index')
        savefig('inverted_mel_to_linear_freq_spectrogram.png', dpi=150)

        stft_modified = inverted_mel_to_linear_freq_spectrogram.T
    else:
        stft_modified = stft_mag
    savefig('stft_modified.png', dpi=150)
    ###### Optional: modify the spectrogram
    # For example, we can implement a low-pass filter by simply setting all frequency bins above
    # some threshold frequency (args.cutoff_freq) to 0 as follows.
    if args.enable_filter:
        # Calculate corresponding bin index.
        cutoff_bin = round(args.cutoff_freq * args.fft_size /
                           args.sample_rate_hz)
        stft_modified[:, cutoff_bin:] = 0
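        # With the defaults above (cutoff_freq=1000 Hz, fft_size=2048, 44100 Hz),
        # cutoff_bin = round(1000 * 2048 / 44100) = 46, so bins 46 and above are zeroed.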
    ###########

    # Undo the rescaling and the earlier squaring: divide by `scale` to restore
    # the original range, then take the square root to convert the power
    # spectrogram back to a magnitude spectrogram for Griffin-Lim.
    stft_modified_scaled = stft_modified / scale
    stft_modified_scaled = stft_modified_scaled**0.5
    # Use the Griffin-Lim algorithm to reconstruct an audio signal from the
    # magnitude spectrogram.
    x_reconstruct = audio_utilities.reconstruct_signal_griffin_lim(
        stft_modified_scaled, args.fft_size, hopsamp, args.iterations)

    # The output signal must be in the range [-1, 1], otherwise we need to clip or normalize.
    max_sample = np.max(abs(x_reconstruct))
    if max_sample > 1.0:
        x_reconstruct = x_reconstruct / max_sample

    # Save the reconstructed signal to a WAV file.
    audio_utilities.save_audio_to_file(x_reconstruct, args.sample_rate_hz)

    # Save the spectrogram image also.
    clf()
    figure(5)
    imshow(stft_modified.T**0.125,
           origin='lower',
           cmap=cm.hot,
           aspect='auto',
           interpolation='nearest')
    colorbar()
    title('Spectrogram used to reconstruct audio')
    xlabel('time index')
    ylabel('frequency bin index')
    savefig('reconstruction_spectrogram.png', dpi=150)
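For reference, the core iteration that reconstruct_signal_griffin_lim performs can be sketched with SciPy's STFT/ISTFT pair. This is a simplified stand-in under assumed conventions (SciPy available, Hann window, hop of fft_size // 8, and a spectrogram indexed as frequency bins by time slices, as SciPy returns it); it is not the actual audio_utilities implementation, and griffin_lim_sketch is a hypothetical name.

# Minimal Griffin-Lim sketch (assumptions: SciPy available, Hann window, and a
# magnitude array shaped (1 + fft_size // 2, time_slices) as scipy.signal.stft returns).
import numpy as np
from scipy.signal import stft, istft

def griffin_lim_sketch(magnitude, fft_size=2048, hopsamp=256, iterations=100):
    noverlap = fft_size - hopsamp
    # Start from a random phase estimate.
    rng = np.random.default_rng(0)
    angles = np.exp(2j * np.pi * rng.random(magnitude.shape))
    for _ in range(iterations):
        # Impose the target magnitude on the current phase estimate and invert to a signal.
        _, x = istft(magnitude * angles, nperseg=fft_size, noverlap=noverlap)
        # Re-analyze the signal and keep only its phase for the next iteration.
        _, _, spectrum = stft(x, nperseg=fft_size, noverlap=noverlap)
        angles = np.exp(1j * np.angle(spectrum))
    _, x = istft(magnitude * angles, nperseg=fft_size, noverlap=noverlap)
    return x

With the spectrograms used in these examples, which store time slices as rows, the array would be transposed before being passed to a sketch like this.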
Example #3
def run_recon():
    """Reconstruct audio from a spectrogram image using the Griffin-Lim algorithm.

        Example of running the Griffin-Lim algorithm on a saved spectrogram image.
        The input image is loaded, resized, and converted to a grayscale magnitude
        spectrogram. Then, using only this magnitude information, the Griffin-Lim
        algorithm is run to reconstruct an audio signal from the spectrogram. The
        reconstructed audio is finally saved to a file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--in_file',
                        type=str,
                        default="bkvhi.wav",
                        help='Input spectrogram image file')
    parser.add_argument('--sample_rate_hz',
                        default=44100,
                        type=int,
                        help='Sample rate in Hz')
    parser.add_argument('--fft_size', default=2048, type=int, help='FFT size')
    parser.add_argument('--iterations',
                        default=300,
                        type=int,
                        help='Number of iterations to run')
    parser.add_argument('--enable_filter',
                        action='store_true',
                        help='Apply a low-pass filter')
    parser.add_argument('--enable_mel_scale',
                        action='store_true',
                        help='Convert to mel scale and back')
    parser.add_argument(
        '--cutoff_freq',
        type=int,
        default=1000,
        help='If the filter is enabled, the low-pass cutoff frequency in Hz')
    args = parser.parse_args()

    in_file = Image.open(args.in_file)

    #print(in_file.shape)
    in_file = in_file.resize((1025, 640), Image.LANCZOS)  # 1025 columns = 1 + fft_size // 2 frequency bins
    ext = ".png"
    in_file.save("rescaledimage" + ext)
    in_file = plt.imread("rescaledimage.png")
    print(in_file.shape)
    in_file = rgb2gray(in_file)
    hopsamp = args.fft_size // 8
    print(in_file.shape)
    # Use the Griffin-Lim algorithm to reconstruct an audio signal from the
    # magnitude spectrogram.
    x_reconstruct = audio_utilities.reconstruct_signal_griffin_lim(
        in_file, args.fft_size, hopsamp, args.iterations)

    # The output signal must be in the range [-1, 1], otherwise we need to clip or normalize.
    max_sample = np.max(abs(x_reconstruct))
    if max_sample > 1.0:
        x_reconstruct = x_reconstruct / max_sample

    # Save the reconstructed signal to a WAV file.
    audio_utilities.save_audio_to_file(x_reconstruct, args.sample_rate_hz)
Example #4
    melSpectrum = np.load('mel-batch_0_sentence_0.npy')
    imshow(melSpectrum.T**0.125,
           origin='lower',
           cmap=cm.hot,
           aspect='auto',
           interpolation='nearest')
    stft_modified = melSpectrum.T

    filterbank = audio_utilities.make_mel_filterbank(70, 8000, 80, 123, 44100)
    inverted_mel_to_linear_freq_spectrogram = np.dot(filterbank.T,
                                                     stft_modified)

    spectrogram = inverted_mel_to_linear_freq_spectrogram
    print("Linear spectrogram dims: ")
    print(spectrogram.shape)

    spectrogram = spectrogram.astype(np.float32)
    # Convert back to a time-domain signal. Note that reconstruct_signal_griffin_lim
    # expects time slices as rows and frequency bins as columns (see Example #2), so
    # the spectrogram is transposed first.
    x_reconstruct = audio_utilities.reconstruct_signal_griffin_lim(
        spectrogram.T, 244, 123, 300)
    max_sample = np.max(abs(x_reconstruct))
    if max_sample > 1.0:
        x_reconstruct = x_reconstruct / max_sample
    audio_utilities.save_audio_to_file(x_reconstruct, 44100)

    print("Done!")
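As a shape-level illustration of the mel round trip used in Examples #2 and #4: make_mel_filterbank is called as (min_hz, max_hz, mel_bin_count, linear_bin_count, sample_rate_hz) and, judging from how it is used in Example #2, returns a (mel_bin_count, linear_bin_count) matrix; multiplying by its transpose only approximately inverts the mel projection. The random spectrogram below is fake data used purely to show the array shapes.

# Shape-level sketch of the mel projection and its approximate inverse used above.
# The random spectrogram is fake data; the (mel, linear) filterbank layout is assumed.
import numpy as np

mel_bin_count = 80          # values from Example #4
linear_bin_count = 123      # 1 + fft_size // 2 with fft_size = 244
filterbank = audio_utilities.make_mel_filterbank(70, 8000, mel_bin_count,
                                                 linear_bin_count, 44100)

linear_spec = np.random.rand(linear_bin_count, 640)   # (frequency, time)
mel_spec = np.dot(filterbank, linear_spec)            # (80, 640)
approx_linear = np.dot(filterbank.T, mel_spec)        # back to (123, 640), lossy
print(filterbank.shape, mel_spec.shape, approx_linear.shape)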
Example #5
if not os.path.exists(input_dir):
    print("Input directory does not exist")
    sys.exit()
if input_dir[-1] != "/":
    input_dir = input_dir + '/'

if not os.path.exists(output_dir):
    print("Output directory created")
    os.makedirs(output_dir)
if output_dir[-1] != "/":
    output_dir = output_dir + '/'

for root, subdirs, files in os.walk(input_dir):
    for name in tqdm(files):

        if name.endswith(".npy"):
            iname = "%s/%s" % (root, name)
            root_out = root.replace(input_dir, output_dir)
            if not os.path.exists(root_out):
                os.makedirs(root_out)

            oname = "%s/%s-griffinlim-reconstructed.wav" % (root_out,
                                                            name[:-4])

            S = np.load(iname)
            y_inv = lr.griffinlim(S)
            audio_utilities.save_audio_to_file(y_inv, sr, outfile=oname)

print("Done!\n")
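This example assumes input_dir, output_dir, and sr were defined earlier in the script, and that librosa is imported as lr. librosa.griffinlim uses its default STFT parameters unless told otherwise; if the saved .npy magnitude spectrograms were produced with the fft_size and hop used elsewhere in these examples, the parameters could be passed explicitly. The values below are that assumption, not something stated in the example.

# Hypothetical variant of the call above, assuming the .npy spectrograms are magnitudes
# with 1 + 2048 // 2 = 1025 frequency rows and a hop of 256 samples.
y_inv = lr.griffinlim(S, n_iter=100, hop_length=256, win_length=2048)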
Example #6
stft_modified = melSpectrum.T

if args.enable_filter:
    # Calculate corresponding bin index.
    cutoff_bin = round(args.cutoff_freq * args.fft_size / args.sample_rate_hz)
    stft_modified[:, cutoff_bin:] = 0
    
stft_modified_scaled = stft_modified
stft_modified_scaled = stft_modified_scaled**0.5
# Use the Griffin-Lim algorithm to reconstruct an audio signal from the
# magnitude spectrogram.
hopsamp = args.fft_size // 8
x_reconstruct = audio_utilities.reconstruct_signal_griffin_lim(
    stft_modified_scaled, args.fft_size, hopsamp, args.iterations)
# The output signal must be in the range [-1, 1], otherwise we need to clip or normalize.
max_sample = np.max(abs(x_reconstruct))
if max_sample > 1.0:
    x_reconstruct = x_reconstruct / max_sample
# Save the reconstructed signal to a WAV file.
audio_utilities.save_audio_to_file(x_reconstruct, args.sample_rate_hz)
# Save the spectrogram image also.
clf()
figure(5)
imshow(stft_modified.T**0.125, origin='lower', cmap=cm.hot, aspect='auto',
       interpolation='nearest')
colorbar()
title('Spectrogram used to reconstruct audio')
xlabel('time index')
ylabel('frequency bin index')
savefig('reconstruction_spectrogram.png', dpi=150)
Example #7
if np.ndim(spectrogram) == 2:
    # Grayscale image: map 16-bit pixel values to [0, 1] and invert.
    mel_spectrogram = (1 - spectrogram / 65535.)
else:
    # Color image: convert 8-bit pixel values to grayscale in [0, 1].
    mel_spectrogram = rgb2gray(spectrogram / 255.)

# Flip the frequency axis, undo the `scale_mel` scaling, and map the dB values
# back to linear ones (x = 10**(dB / 20) * 1e-5).
mel_spectrogram = np.flip(mel_spectrogram, 0)
mel_spectrogram /= scale_mel
mel_spectrogram = np.exp(mel_spectrogram / 20. * np.log(10)) * 10e-6

inverted_mel_to_linear_freq_spectrogram = np.dot(filterbank.T, mel_spectrogram)

stft_modified = inverted_mel_to_linear_freq_spectrogram.T

stft_modified = stft_modified**0.5
# Use the Griffin-Lim algorithm to reconstruct an audio signal from the
# magnitude spectrogram.
x_reconstruct = audio_utilities.reconstruct_signal_griffin_lim(
    stft_modified, fft_size, hopsamp, args.iterations)

# The output signal must be in the range [-1, 1], otherwise we need to clip or normalize.
max_sample = np.max(abs(x_reconstruct))
if max_sample > 1.0:
    x_reconstruct = x_reconstruct / max_sample

# Save the reconstructed signal to a WAV file.
audio_utilities.save_audio_to_file(x_reconstruct,
                                   sample_rate_hz,
                                   outfile=args.out_file)
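Example #7 is a fragment: spectrogram, scale_mel, filterbank, fft_size, hopsamp, sample_rate_hz, rgb2gray, and args are all defined earlier in its script. A hypothetical setup consistent with the other examples is sketched below; every value here, in particular scale_mel and the rgb2gray import, is an assumption for illustration, not taken from the original code.

# Hypothetical surrounding setup for the fragment above; all values are assumptions.
import argparse
import numpy as np
from PIL import Image
from skimage.color import rgb2gray   # assumed source of rgb2gray

parser = argparse.ArgumentParser()
parser.add_argument('--in_file', type=str, default='mel_spectrogram.png')
parser.add_argument('--out_file', type=str, default='reconstructed.wav')
parser.add_argument('--iterations', type=int, default=300)
args = parser.parse_args()

sample_rate_hz = 44100
fft_size = 2048
hopsamp = fft_size // 8
scale_mel = 1.0 / 80.0   # placeholder; must match the scaling used when the image was made
spectrogram = np.asarray(Image.open(args.in_file))   # integer pixel values (uint8 or uint16)
filterbank = audio_utilities.make_mel_filterbank(70, 8000, 200, 1 + fft_size // 2,
                                                 sample_rate_hz)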