Beispiel #1
0
def generate_filter_banks(band_sizes):
    """
    Lazily build one filter bank and one FIR band-pass kernel per band size.

    The sizes are visited in ascending order and the largest one is treated
    as the full-resolution band.  For every size, a sample rate is derived by
    multiplying the frequency and duration of the module-level ``sr`` by
    ``largest_size / size``.  The frequency band handed to each filter bank
    starts where the previous band stopped (the first one starts at 20 hz).

    This function relies on ``sr``, ``device``, ``firwin``, ``torch``, ``np``
    and ``zounds`` being available in the enclosing module.

    Args:
        band_sizes: iterable of band sizes.  Order does not matter, because a
            sorted copy is used.  Any number of sizes is accepted, and an
            empty iterable yields nothing.

    Yields:
        ``(bank, bandpass)`` tuples, where ``bank`` is a
        ``zounds.learn.FilterBank`` moved to ``device`` and ``bandpass`` is a
        float32 tensor on ``device`` viewed as ``(1, 1, n_taps)``.
    """
    band_sizes = sorted(band_sizes)
    if not band_sizes:
        # nothing to build; also avoids an IndexError on band_sizes[-1]
        return

    total_samples = band_sizes[-1]
    # one entry per band, sized from the input rather than fixed at five, so
    # that more than five band sizes do not overrun the list
    n_bands = [128] * len(band_sizes)

    n_taps = 256
    current_low_freq = 20

    for i, size in enumerate(band_sizes):
        ratio = (total_samples / size)
        new_sr = zounds.SampleRate(sr.frequency * ratio, sr.duration * ratio)

        if size == total_samples:
            # the full-resolution band stops 20 hz short of nyquist
            freq_band = zounds.FrequencyBand(current_low_freq,
                                             new_sr.nyquist - 20)
        else:
            freq_band = zounds.FrequencyBand(current_low_freq, new_sr.nyquist)

        # FIR band-pass kernel whose cutoffs are a quarter of the sample rate
        # and one less than half of the sample rate
        bandpass = firwin(n_taps, [int(new_sr) // 4, (int(new_sr) // 2) - 1],
                          fs=int(new_sr),
                          pass_zero=False).astype(np.float32)
        bandpass = torch.from_numpy(bandpass).to(device).view(1, 1, n_taps)

        scale = zounds.GeometricScale(freq_band.start_hz, freq_band.stop_hz,
                                      0.05, n_bands[i])
        bank = zounds.learn.FilterBank(
            new_sr,
            n_taps,
            scale,
            # values close to zero get good frequency resolution.  Values close
            # to one get good time resolution
            0.25,
            normalize_filters=False,
            a_weighting=False).to(device)

        # the next band picks up where this one ended
        current_low_freq = freq_band.stop_hz
        yield bank, bandpass
import featureflow as ff
import numpy as np
import zounds
from torch import nn
from torch import optim
import argparse
from multiprocessing.pool import ThreadPool, cpu_count

# Base processing graph: audio is resampled to ``samplerate`` and the fft
# feature is stored
samplerate = zounds.SR11025()
BaseModel = zounds.stft(
    resample_to=samplerate,
    store_fft=True,
)

# Geometric frequency scale with 64 bands whose center frequencies run from
# 300 hz to 3040 hz
scale = zounds.GeometricScale(
    start_center_hz=300,
    stop_center_hz=3040,
    bandwidth_ratio=0.07496,
    n_bands=64,
)
scale.ensure_overlap_ratio(0.5)


@zounds.simple_lmdb_settings('speeches', map_size=1e10, user_supplied_id=True)
class Sound(BaseModel):
    """
    An audio processing pipeline that computes a frequency domain representation
    of the sound that follows a geometric scale
    """
    bark = zounds.ArrayWithUnitsFeature(zounds.BarkBands,
                                        samplerate=samplerate,
                                        stop_freq_hz=samplerate.nyquist,
                                        needs=BaseModel.fft,
                                        store=True)

    long_windowed = zounds.ArrayWithUnitsFeature(
Beispiel #3
0
    start = zounds.Milliseconds(500)
    end = start + zounds.Seconds(2)
    snippet = sound.weighted[start:end, :]

    # grab a subset of frequency information for the duration of the sound
    freq_band = slice(zounds.Hertz(400), zounds.Hertz(500))
    a440 = sound.mdct[:, freq_band]

    # produce a new set of coefficients where only the 440hz sine wave is
    # present
    filtered = sound.mdct.zeros_like()
    filtered[:, freq_band] = a440

    # build a geometric scale, which more closely matches human pitch
    # perception, and apply it to the linear frequency axis
    scale = zounds.GeometricScale(50, 4000, 0.05, 100)
    log_coeffs = scale.apply(sound.mdct, zounds.HanningWindowingFunc())

    # reconstruct audio from the MDCT coefficients
    mdct_synth = zounds.MDCTSynthesizer()
    reconstructed = mdct_synth.synthesize(sound.mdct)
    filtered_reconstruction = mdct_synth.synthesize(filtered)

    # start an in-browser REPL that will allow you to listen to and visualize
    # the variables defined above (and any new ones you create in the session)
    app = zounds.ZoundsApp(model=Sound,
                           audio_feature=Sound.ogg,
                           visualization_feature=Sound.weighted,
                           globals=globals(),
                           locals=locals())
    app.start(9999)
"""

import numpy as np
import zounds
from zounds.spectral import apply_scale

# Base processing graph: audio is resampled to ``samplerate`` and the
# resampled signal is stored
samplerate = zounds.SR11025()
BaseModel = zounds.resampled(resample_to=samplerate, store_resampled=True)

# number of bands in the geometric scale below
scale_bands = 96
# used both for ``anchor_slice`` and to derive ``spectrogram_sample_rate``
spectrogram_duration = 64

anchor_slice = slice(spectrogram_duration, spectrogram_duration * 2)

scale = zounds.GeometricScale(start_center_hz=50,
                              stop_center_hz=samplerate.nyquist,
                              bandwidth_ratio=0.115,
                              n_bands=scale_bands)
scale.ensure_overlap_ratio()

windowing_scheme = zounds.HalfLapped()
# the step between spectrograms is half of each spectrogram's duration
spectrogram_sample_rate = zounds.SampleRate(
    frequency=windowing_scheme.frequency * (spectrogram_duration // 2),
    duration=windowing_scheme.frequency * spectrogram_duration)


def spectrogram(x):
    x = apply_scale(np.abs(x.real),
                    scale,
                    window=zounds.OggVorbisWindowingFunc())
Beispiel #5
0
See section 3.3 Setting MDCT Sizes for information about what we're fudging/
glossing over in this implementation.  We instead use the DCT2 transform, which
makes inversion easier, at the cost of more redundancy.
"""

from __future__ import division
import zounds
import scipy

# Base processing graph: audio is resampled to ``samplerate``
samplerate = zounds.SR11025()
BaseModel = zounds.stft(resample_to=samplerate)

windowing_func = zounds.OggVorbisWindowingFunc()

# Geometric frequency scale with 100 bands whose center frequencies run from
# 300 hz to 3030 hz
scale = zounds.GeometricScale(
    start_center_hz=300,
    stop_center_hz=3030,
    bandwidth_ratio=0.05,
    n_bands=100)


@zounds.simple_in_memory_settings
class Document(BaseModel):
    bark = zounds.ArrayWithUnitsFeature(zounds.BarkBands,
                                        samplerate=samplerate,
                                        stop_freq_hz=samplerate.nyquist,
                                        needs=BaseModel.fft,
                                        store=True)

    long_windowed = zounds.ArrayWithUnitsFeature(
        zounds.SlidingWindow,
        wscheme=zounds.SampleRate(frequency=zounds.Milliseconds(500),
                                  duration=zounds.Seconds(1)),
        wfunc=windowing_func,
Beispiel #6
0
    @classmethod
    def from_audio(cls, samples, samplerate):
        """
        Alternate constructor that builds an instance from audio samples.

        The samples are passed through ``cls.batch_stft``, reduced to their
        magnitudes, projected with ``cls._embed``, rearranged so that the
        last two axes are swapped, log-scaled and finally handed to
        ``cls._postprocess_coeffs`` before being wrapped in ``cls``.
        """
        magnitudes = np.abs(cls.batch_stft(samples))
        embedded = cls._embed(magnitudes).transpose((0, 2, 1))
        # the small offset keeps the logarithm finite for zero magnitudes
        log_coeffs = np.log(embedded + 1e-12)
        return cls(cls._postprocess_coeffs(log_coeffs), samplerate)


sr = zounds.SR11025()
n_bands = 256

# both non-linear scales start at 20 hz and stop 20 hz below nyquist
mel_scale = zounds.MelScale(
    zounds.FrequencyBand(20, sr.nyquist - 20),
    n_bands)
geom_scale = zounds.GeometricScale(
    start_center_hz=20,
    stop_center_hz=sr.nyquist - 20,
    bandwidth_ratio=0.05,
    n_bands=n_bands)

# linear scale with 513 bands covering 0 hz to nyquist, used as the source
# axis for both bases below
linear_scale = zounds.LinearScale(zounds.FrequencyBand(0, sr.nyquist), 513)

mel_scale_basis = mel_scale._basis(
    linear_scale, zounds.HanningWindowingFunc())
geom_scale_basis = geom_scale._basis(
    linear_scale, zounds.HanningWindowingFunc())


class MelScalePhaseRecover(BasePhaseRecovery):
    """
    ``BasePhaseRecovery`` subclass whose ``basis`` is ``mel_scale_basis``.
    """
    # class-level basis taken from the module-level ``mel_scale_basis``
    basis = mel_scale_basis

    def __init__(self, data, samplerate):
        # no extra state; construction is delegated entirely to the base class
        super().__init__(data, samplerate)


class GeometricScalePhaseRecover(BasePhaseRecovery):
    basis = geom_scale_basis