Esempio n. 1
0
def run_demo():
    """Demonstrate Griffin-Lim audio reconstruction from a spectrogram.

    Command-line options are parsed with argparse. The input WAV file is
    loaded through `audio_utilities.get_signal`, its STFT is computed, and the
    phase is discarded by keeping only the squared magnitude, which is then
    scaled by its maximum. Optionally the spectrogram is passed through a mel
    filter bank and back (`--enable_mel_scale`) and/or low-pass filtered by
    zeroing frequency bins (`--enable_filter`, `--cutoff_freq`). The scaling
    is then undone, the square root is taken, and
    `audio_utilities.reconstruct_signal_griffin_lim` rebuilds a signal, which
    is peak-normalized when it exceeds [-1, 1] and written out with
    `audio_utilities.save_audio_to_file`.

    Side effects: prints array shapes and the spectrogram maximum, and saves
    several PNG plots using relative file names.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--in_file',
                        type=str,
                        default="bkvhi.wav",
                        help='Input WAV file')
    parser.add_argument('--sample_rate_hz',
                        default=44100,
                        type=int,
                        help='Sample rate in Hz')
    parser.add_argument('--fft_size', default=2048, type=int, help='FFT siz')
    parser.add_argument('--iterations',
                        default=300,
                        type=int,
                        help='Number of iterations to run')
    parser.add_argument('--enable_filter',
                        action='store_true',
                        help='Apply a low-pass filter')
    parser.add_argument('--enable_mel_scale',
                        action='store_true',
                        help='Convert to mel scale and back')
    parser.add_argument(
        '--cutoff_freq',
        type=int,
        default=1000,
        help='If filter is enable, the low-pass cutoff frequency in Hz')
    args = parser.parse_args()

    # Load an audio file. It must be WAV format. Multi-channel files will be
    # converted to mono.
    input_signal = audio_utilities.get_signal(args.in_file,
                                              expected_fs=args.sample_rate_hz)

    # Hopsamp is the number of samples that the analysis window is shifted
    # after computing the FFT. For example, if the sample rate is 44100 Hz and
    # hopsamp is 256, then there will be approximately 44100/256 = 172 FFTs
    # computed per second and thus 172 spectral slices (i.e., columns) per
    # second in the spectrogram.
    hopsamp = args.fft_size // 8

    # Compute the Short-Time Fourier Transform (STFT) from the audio file.
    # This is a 2-dim Numpy array with time_slices rows and frequency_bins
    # columns. Thus, you will need to take the transpose of this matrix to get
    # the usual STFT which has frequency bins as rows and time slices as
    # columns.
    stft_full = audio_utilities.stft_for_reconstruction(
        input_signal, args.fft_size, hopsamp)
    print(stft_full.shape)

    # The STFT is complex-valued; keeping only the squared magnitude discards
    # the phase information.
    stft_mag = abs(stft_full)**2.0

    # Rescale to put all values in the range [0, 1]. An all-zero (silent)
    # spectrogram is left unscaled: dividing by a zero peak would turn every
    # value into NaN.
    peak = np.amax(stft_mag)
    scale = 1.0 / peak if peak > 0 else 1.0
    print('Maximum value in the magnitude spectrogram: ', peak)
    stft_mag *= scale
    print(stft_mag.shape)

    # We now have a (magnitude only) spectrogram, `stft_mag`, normalized to be
    # within [0, 1.0]. In a practical use case, we would probably want to
    # perform some processing on `stft_mag` here which would produce a
    # modified version that we would want to reconstruct audio from.
    _save_heatmap(1, stft_mag.T**0.125, 'Unmodified spectrogram',
                  'time index', 'frequency bin index',
                  'unmodified_spectrogram.png')

    # If the mel scale option is selected, apply a perceptual frequency scale.
    if args.enable_mel_scale:
        min_freq_hz = 70
        max_freq_hz = 8000
        mel_bin_count = 200

        linear_bin_count = 1 + args.fft_size // 2
        filterbank = audio_utilities.make_mel_filterbank(
            min_freq_hz, max_freq_hz, mel_bin_count, linear_bin_count,
            args.sample_rate_hz)
        _save_heatmap(2, filterbank, 'Mel scale filter bank',
                      'linear frequency index', 'mel frequency index',
                      'mel_scale_filterbank.png')

        # Project the linear-frequency spectrogram onto the mel bins.
        mel_spectrogram = np.dot(filterbank, stft_mag.T)

        clf()
        _save_heatmap(3, mel_spectrogram**0.125, 'Mel scale spectrogram',
                      'time index', 'mel frequency bin index',
                      'mel_scale_spectrogram.png')

        # Map back to linear frequency bins using the transposed filter bank.
        inverted_mel_to_linear_freq_spectrogram = np.dot(
            filterbank.T, mel_spectrogram)

        clf()
        _save_heatmap(
            4, inverted_mel_to_linear_freq_spectrogram**0.125,
            'Linear scale spectrogram obtained from mel scale spectrogram',
            'time index', 'frequency bin index',
            'inverted_mel_to_linear_freq_spectrogram.png')

        stft_modified = inverted_mel_to_linear_freq_spectrogram.T
    else:
        stft_modified = stft_mag
    # Saves whichever figure is current at this point (no new plot is drawn).
    savefig('stft_modified.png', dpi=150)

    ###### Optional: modify the spectrogram
    # For example, we can implement a low-pass filter by simply setting all
    # frequency bins above some threshold frequency (args.cutoff_freq) to 0.
    if args.enable_filter:
        # Calculate the corresponding bin index. Force float division and an
        # integer result so the value is a valid slice index on both Python 2
        # and Python 3.
        cutoff_bin = int(
            round(
                float(args.cutoff_freq) * args.fft_size /
                args.sample_rate_hz))
        stft_modified[:, cutoff_bin:] = 0
    ###########

    # Undo the rescaling, then take the square root to undo the squaring
    # applied when `stft_mag` was computed.
    stft_modified_scaled = stft_modified / scale
    stft_modified_scaled = stft_modified_scaled**0.5

    # Use the Griffin&Lim algorithm to reconstruct an audio signal from the
    # magnitude spectrogram.
    x_reconstruct = audio_utilities.reconstruct_signal_griffin_lim(
        stft_modified_scaled, args.fft_size, hopsamp, args.iterations)

    # The output signal must be in the range [-1, 1], otherwise we need to
    # clip or normalize.
    max_sample = np.max(abs(x_reconstruct))
    if max_sample > 1.0:
        x_reconstruct = x_reconstruct / max_sample

    # Save the reconstructed signal to a WAV file.
    audio_utilities.save_audio_to_file(x_reconstruct, args.sample_rate_hz)

    # Save the spectrogram image also.
    clf()
    _save_heatmap(5, stft_modified.T**0.125,
                  'Spectrogram used to reconstruct audio', 'time index',
                  'frequency bin index', 'reconstruction_spectrogram.png')


def _save_heatmap(fig_num, data, heading, x_label, y_label, out_path):
    """Draw `data` as a heat map on figure `fig_num` and save it as a PNG.

    Args:
        fig_num: Figure number passed to `figure`.
        data: 2-D array handed to `imshow`.
        heading: Plot title.
        x_label: Label for the horizontal axis.
        y_label: Label for the vertical axis.
        out_path: File name passed to `savefig` (saved at 150 dpi).
    """
    figure(fig_num)
    imshow(data,
           origin='lower',
           cmap=cm.hot,
           aspect='auto',
           interpolation='nearest')
    colorbar()
    title(heading)
    xlabel(x_label)
    ylabel(y_label)
    savefig(out_path, dpi=150)
def decode():
    """Decode the test set with the current model and write enhanced audio.

    Builds a batch-size-1 input pipeline from `FLAGS.test_list_file`, loads a
    `DNNTrainer` model from `model.save_dir`, runs `model.generator` on every
    test batch and, depending on `FLAGS.mode`, converts the network output
    into a waveform with `recover_wav` ('use_org') or
    `audio_utilities.reconstruct_signal_griffin_lim` ('g_l'). The waveform is
    written with `pp_data.write_audio`, passing 16000 (presumably the sample
    rate in Hz).

    The process exits with status 1 when the model cannot be loaded or when
    `train_cmvn.npz` is missing from `FLAGS.data_dir`.

    NOTE(review): this function uses Python 2-only constructs (`xrange`,
    `cPickle`, `except Exception, e`). The body visible here stops at an
    empty `finally:` clause; whatever cleanup followed is missing.
    """
    tf.logging.info("Get TEST sets number.")
    num_batch = get_num_batch(FLAGS.test_list_file, infer=True)
    with tf.Graph().as_default():
        # The input pipeline is pinned to the CPU.
        with tf.device('/cpu:0'):
            with tf.name_scope('input'):
                data_list = read_list(FLAGS.test_list_file)
                test_utt_id, test_inputs, _ = get_batch(
                    data_list,
                    batch_size=1,
                    input_size=FLAGS.input_dim,
                    output_size=FLAGS.output_dim,
                    left=FLAGS.left_context,
                    right=FLAGS.right_context,
                    num_enqueuing_threads=FLAGS.num_threads,
                    num_epochs=1,
                    infer=True)
                # test_inputs = tf.squeeze(test_inputs, axis=[0])
        # Collect one device name per requested GPU.
        devices = []
        for i in xrange(FLAGS.num_gpu):
            device_name = ("/gpu:%d" % i)
            print('Using device: ', device_name)
            devices.append(device_name)

        # Prevent exhausting all the gpu memories.
        config = tf.ConfigProto()
        config.gpu_options.per_process_gpu_memory_fraction = 0.4
        #config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        # NOTE(review): this creates a session in addition to the one opened
        # by the `with` statement below -- confirm both are needed.
        set_session(tf.Session(config=config))
        # execute the session
        with tf.Session(config=config) as sess:
            # Build the model on the test inputs; no labels are supplied.
            with tf.name_scope('model'):
                model = DNNTrainer(sess,
                                   FLAGS,
                                   devices,
                                   test_inputs,
                                   labels=None,
                                   cross_validation=True)

            show_all_variables()

            init = tf.group(tf.global_variables_initializer(),
                            tf.local_variables_initializer())
            print("Initializing variables ...")
            sess.run(init)

            # NOTE(review): `moving_average=False` is passed, yet the success
            # message mentions a "Moving Average" model.
            if model.load(model.save_dir, moving_average=False):
                print("[*] Load Moving Average model SUCCESS")
            else:
                print("[!] Load failed. Checkpoint not found. Exit now.")
                sys.exit(1)

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            # The CMVN statistics file must exist; `cmvn` itself is only
            # referenced by the commented-out de-normalization further down.
            cmvn_filename = os.path.join(FLAGS.data_dir, "train_cmvn.npz")
            if os.path.isfile(cmvn_filename):
                cmvn = np.load(cmvn_filename)
            else:
                tf.logging.fatal("%s not exist, exit now." % cmvn_filename)
                sys.exit(1)
            # NOTE(review): the output root is a hard-coded absolute path.
            out_dir_name = os.path.join('/Work18/2017/linan/SE/my_enh',
                                        FLAGS.save_dir, FLAGS.savetestdir)
            # out_dir_name = os.path.join(FLAGS.save_dir, 'test')
            if not os.path.exists(out_dir_name):
                os.makedirs(out_dir_name)

            # `writer` and `write_ark_path` are only used by the
            # commented-out `write_next_utt` call near the end of the loop.
            write_scp_path = os.path.join(out_dir_name, 'feats.scp')
            write_ark_path = os.path.join(out_dir_name, 'feats.ark')
            writer = ArkWriter(write_scp_path)

            # Build the inference op once, outside the batch loop.
            outputs = model.generator(test_inputs, None, reuse=True)
            outputs = tf.reshape(outputs, [-1, model.output_dim])
            print('shape is', np.shape(outputs))
            try:
                for batch in range(num_batch):
                    if coord.should_stop():
                        break
                    # outputs = model.generator(test_inputs, None, reuse=True)
                    # outputs = tf.reshape(outputs, [-1, model.output_dim])
                    utt_id, activations = sess.run([test_utt_id, outputs])
                    # sequence = activations * cmvn['stddev_labels'] + \
                    # cmvn['mean_labels']
                    sequence = activations
                    save_result = np.vstack(sequence)
                    # Only the last path component of `savetestdir` is used
                    # to locate inputs and name outputs.
                    dir_load = FLAGS.savetestdir
                    dir_load = dir_load.split('/')[-1]
                    mode = FLAGS.mode
                    if mode == 'use_org':
                        # Reconstruct with `recover_wav`, passing the pickled
                        # complex spectrogram of the mixture.
                        inputs_path = os.path.join(
                            'workspace/features/spectrogram/test', dir_load,
                            '%s.wav.p' % utt_id[0])
                        # NOTE(review): the file handle is never closed
                        # explicitly, and unpickling is unsafe on untrusted
                        # files.
                        data = cPickle.load(open(inputs_path, 'rb'))
                        [mixed_complx_x] = data
                        #tf.logging.info("Write inferred %s to %s" %(utt_id[0], np.shape(save_result)))
                        # presumably the network predicts log-spectra, hence
                        # the exp -- TODO confirm
                        save_result = np.exp(save_result)
                        n_window = cfg.n_window
                        s = recover_wav(save_result, mixed_complx_x,
                                        cfg.n_overlap, np.hamming)
                        s *= np.sqrt(
                            (np.hamming(n_window)**2
                             ).sum())  # Scale factor compensating for the
                        # amplitude change after spectrogram and IFFT.
                        print("start enhance wav file")
                        # Write out enhanced wav.
                        out_path = os.path.join("workspace", "enh_wavs",
                                                "test", dir_load,
                                                "%s.enh.wav" % utt_id[0])
                        # NOTE(review): printed on every iteration and before
                        # the file is actually written.
                        print("have enhanced all  the wav")
                        pp_data.create_folder(os.path.dirname(out_path))
                        pp_data.write_audio(out_path, s, 16000)
                    elif mode == 'g_l':
                        # Reconstruct with Griffin-Lim instead of
                        # `recover_wav`.
                        inputs_path = os.path.join(
                            'workspace/features/spectrogram/test', dir_load,
                            '%s.wav.p' % utt_id[0])
                        # NOTE(review): same unclosed handle / pickle caveat
                        # as in the 'use_org' branch.
                        data = cPickle.load(open(inputs_path, 'rb'))
                        [mixed_complx_x] = data
                        save_result = np.exp(save_result)
                        s = save_result
                        # NOTE(review): five positional arguments are passed
                        # here, whereas other calls to this function in the
                        # file pass four (spectrogram, fft size, hop,
                        # iterations) -- verify against the signature.
                        s = audio_utilities.reconstruct_signal_griffin_lim(
                            s, mixed_complx_x, 512, 256, 15)
                        #s = recover_wav(save_result,mixed_complx_x,cfg.n_overlap, np.hamming)
                        s *= np.sqrt((np.hamming(cfg.n_window)**2).sum())
                        #s = audio._griffin_lim(s)
                        out_path = os.path.join("workspace", "enh_wavs",
                                                "test2", dir_load,
                                                "%s.enh.wav" % utt_id[0])
                        pp_data.create_folder(os.path.dirname(out_path))
                        pp_data.write_audio(out_path, s, 16000)
                        tf.logging.info("Write inferred%s" % (np.shape(s)))
                    #writer.write_next_utt(write_ark_path, utt_id[0], save_result)
                    # NOTE(review): `out_path` is only assigned inside the two
                    # mode branches above; with any other mode it is unbound
                    # on the first iteration.
                    tf.logging.info("Write inferred %s to %s" %
                                    (utt_id[0], out_path))

            # NOTE(review): Python 2-only `except` syntax.
            except Exception, e:
                # Report exceptions to the coordinator.
                coord.request_stop(e)
            # NOTE(review): the body of this `finally` clause is missing.
            finally:
        for wav_path in wavs
    ]

    melSpectrum = np.load('mel-batch_0_sentence_0.npy')
    imshow(melSpectrum.T**0.125,
           origin='lower',
           cmap=cm.hot,
           aspect='auto',
           interpolation='nearest')
    stft_modified = melSpectrum.T

    filterbank = audio_utilities.make_mel_filterbank(70, 8000, 80, 123, 44100)
    inverted_mel_to_linear_freq_spectrogram = np.dot(filterbank.T,
                                                     stft_modified)

    spectrogram = inverted_mel_to_linear_freq_spectrogram
    print("Linear spectrograms dim: ")
    print(spectrogram[0].shape)

    spectrogram = spectrogram.astype(np.float32)
    spectrogram = spectrogram
    # --------------------------------- librosa Version ---------------------------------
    # convert back
    x_reconstruct = audio_utilities.reconstruct_signal_griffin_lim(
        spectrogram, 244, 123, 300)
    max_sample = np.max(abs(x_reconstruct))
    if max_sample > 1.0:
        x_reconstruct = x_reconstruct / max_sample
    audio_utilities.save_audio_to_file(x_reconstruct, 44100)

    print("Done!")
Esempio n. 4
0
def run_recon():
    """Reconstruct audio from an image treated as a magnitude spectrogram.

    Command-line options are parsed with argparse. The image named by
    `--in_file` is opened, resized to `1 + fft_size // 2` pixels wide by 640
    pixels high, saved as "rescaledimage.png", read back with `plt.imread`
    and converted to grayscale with `rgb2gray`. The resulting 2-D array is
    passed to `audio_utilities.reconstruct_signal_griffin_lim`; the
    reconstructed signal is peak-normalized when it exceeds [-1, 1] and
    written out with `audio_utilities.save_audio_to_file`.

    The `--enable_filter`, `--enable_mel_scale` and `--cutoff_freq` options
    are accepted but not used by this function.

    Side effects: writes "rescaledimage.png" using a relative file name and
    prints the array shape before and after the grayscale conversion.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--in_file',
                        type=str,
                        default="bkvhi.wav",
                        help='Input image file')
    parser.add_argument('--sample_rate_hz',
                        default=44100,
                        type=int,
                        help='Sample rate in Hz')
    parser.add_argument('--fft_size', default=2048, type=int, help='FFT siz')
    parser.add_argument('--iterations',
                        default=300,
                        type=int,
                        help='Number of iterations to run')
    parser.add_argument('--enable_filter',
                        action='store_true',
                        help='Apply a low-pass filter')
    parser.add_argument('--enable_mel_scale',
                        action='store_true',
                        help='Convert to mel scale and back')
    parser.add_argument(
        '--cutoff_freq',
        type=int,
        default=1000,
        help='If filter is enable, the low-pass cutoff frequency in Hz')
    args = parser.parse_args()

    # PIL sizes are (width, height), so the array read back below has 640
    # rows and one column per linear frequency bin. Deriving the width from
    # the FFT size gives 1025 for the default 2048 and keeps the image
    # consistent with other --fft_size values.
    freq_bin_count = 1 + args.fft_size // 2

    # `Image.ANTIALIAS` was removed in Pillow 10; `Image.LANCZOS` is the same
    # filter. The context manager closes the source file once the resized
    # copy exists.
    with Image.open(args.in_file) as source_image:
        resized_image = source_image.resize((freq_bin_count, 640),
                                            Image.LANCZOS)
    resized_image.save("rescaledimage.png")

    # Read the resized image back as an array and collapse it to one channel.
    in_file = plt.imread("rescaledimage.png")
    print(in_file.shape)
    in_file = rgb2gray(in_file)
    hopsamp = args.fft_size // 8
    print(in_file.shape)

    # Use the Griffin&Lim algorithm to reconstruct an audio signal from the
    # magnitude spectrogram.
    x_reconstruct = audio_utilities.reconstruct_signal_griffin_lim(
        in_file, args.fft_size, hopsamp, args.iterations)

    # The output signal must be in the range [-1, 1], otherwise we need to
    # clip or normalize.
    max_sample = np.max(abs(x_reconstruct))
    if max_sample > 1.0:
        x_reconstruct = x_reconstruct / max_sample

    # Save the reconstructed signal to a WAV file.
    audio_utilities.save_audio_to_file(x_reconstruct, args.sample_rate_hz)
# Module-level script fragment: plots a spectrogram, optionally low-pass
# filters it, reconstructs audio with Griffin-Lim and plots the result.
# NOTE(review): `melSpectrum` and `args` are not defined anywhere in the
# visible part of this fragment -- they must come from earlier code.
imshow(melSpectrum.T**0.125, origin='lower', cmap=cm.hot, aspect='auto',
               interpolation='nearest')
stft_modified=melSpectrum.T

if args.enable_filter:
    # Calculate corresponding bin index.
    # NOTE(review): on Python 2 `round` returns a float, which is not a valid
    # slice index -- confirm the target interpreter.
    cutoff_bin = round(args.cutoff_freq*args.fft_size/args.sample_rate_hz)
    stft_modified[:, cutoff_bin:] = 0
    
# No rescaling is undone here; only the square root is taken.
stft_modified_scaled = stft_modified
stft_modified_scaled = stft_modified_scaled**0.5
# Use the Griffin&Lim algorithm to reconstruct an audio signal from the
# magnitude spectrogram.
hopsamp = args.fft_size // 8
x_reconstruct = audio_utilities.reconstruct_signal_griffin_lim(stft_modified_scaled,
                                                                   args.fft_size, hopsamp,
                                                                   args.iterations)
# The output signal must be in the range [-1, 1], otherwise we need to clip or normalize.
max_sample = np.max(abs(x_reconstruct))
if max_sample > 1.0:
    x_reconstruct = x_reconstruct / max_sample
# Save the reconstructed signal to a WAV file.
audio_utilities.save_audio_to_file(x_reconstruct, args.sample_rate_hz)
# Save the spectrogram image also.
# NOTE(review): the fragment ends before any `savefig` call is visible.
clf()
figure(5)
imshow(stft_modified.T**0.125, origin='lower', cmap=cm.hot, aspect='auto',
           interpolation='nearest')
colorbar()
title('Spectrogram used to reconstruct audio')
xlabel('time index')