Example #1
class Document(BaseModel):
    bark = zounds.ArrayWithUnitsFeature(zounds.BarkBands,
                                        samplerate=samplerate,
                                        stop_freq_hz=samplerate.nyquist,
                                        needs=BaseModel.fft,
                                        store=True)

    long_windowed = zounds.ArrayWithUnitsFeature(
        zounds.SlidingWindow,
        wscheme=zounds.SampleRate(frequency=zounds.Milliseconds(500),
                                  duration=zounds.Seconds(1)),
        wfunc=windowing_func,
        needs=BaseModel.resampled,
        store=True)

    dct = zounds.ArrayWithUnitsFeature(zounds.DCT,
                                       scale_always_even=True,
                                       needs=long_windowed,
                                       store=True)

    mdct = zounds.FrequencyAdaptiveFeature(zounds.FrequencyAdaptiveTransform,
                                           transform=scipy.fftpack.idct,
                                           scale=scale,
                                           needs=dct,
                                           store=True)
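
Example #1 references several module-level names (samplerate, windowing_func, scale, BaseModel) that the snippet does not define. A plausible, hypothetical set of definitions, assembled from the other examples on this page:

import zounds

samplerate = zounds.SR11025()
BaseModel = zounds.resampled(resample_to=samplerate, store_resampled=True)
windowing_func = zounds.OggVorbisWindowingFunc()
# a geometric frequency scale like the one defined in Example #10
scale = zounds.GeometricScale(start_center_hz=50,
                              stop_center_hz=samplerate.nyquist,
                              bandwidth_ratio=0.115,
                              n_bands=96)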
Example #2
class Sound(BaseModel):
    """
    An audio processing pipeline that computes a frequency-domain
    representation of the sound that follows a geometric scale
    """
    bark = zounds.ArrayWithUnitsFeature(zounds.BarkBands,
                                        samplerate=samplerate,
                                        stop_freq_hz=samplerate.nyquist,
                                        needs=BaseModel.fft,
                                        store=True)

    long_windowed = zounds.ArrayWithUnitsFeature(
        zounds.SlidingWindow,
        wscheme=zounds.SampleRate(frequency=zounds.Milliseconds(358),
                                  duration=zounds.Milliseconds(716)),
        wfunc=zounds.OggVorbisWindowingFunc(),
        needs=BaseModel.resampled,
        store=True)

    long_fft = zounds.ArrayWithUnitsFeature(zounds.FFT,
                                            needs=long_windowed,
                                            store=True)

    freq_adaptive = zounds.FrequencyAdaptiveFeature(
        zounds.FrequencyAdaptiveTransform,
        transform=np.fft.irfft,
        scale=scale,
        window_func=np.hanning,
        needs=long_fft,
        store=False)

    rasterized = zounds.ArrayWithUnitsFeature(lambda fa: fa.rasterize(64),
                                              needs=freq_adaptive,
                                              store=False)
Example #3
class SoundWithNoSettings(BaseModel):
    short_windowed = zounds.ArrayWithUnitsFeature(
        zounds.SlidingWindow,
        wscheme=windowing_scheme,
        wfunc=zounds.OggVorbisWindowingFunc(),
        needs=BaseModel.resampled)

    fft = zounds.ArrayWithUnitsFeature(
        zounds.FFT,
        needs=short_windowed)

    geom = zounds.ArrayWithUnitsFeature(
        spectrogram,
        needs=fft,
        store=True)

    log_spectrogram = zounds.ArrayWithUnitsFeature(
        zounds.SlidingWindow,
        wscheme=zounds.SampleRate(
            frequency=windowing_scheme.frequency * (spectrogram_duration // 2),
            duration=windowing_scheme.frequency * spectrogram_duration * 3),
        needs=geom)

    ls = zounds.ArrayWithUnitsFeature(
        zounds.SlidingWindow,
        wscheme=spectrogram_sample_rate,
        needs=geom)
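
The module-level names used here (windowing_scheme, spectrogram, spectrogram_duration, spectrogram_sample_rate) come from the same source file; Example #10 below shows their definitions:

windowing_scheme = zounds.HalfLapped()
spectrogram_duration = 64
spectrogram_sample_rate = zounds.SampleRate(
    frequency=windowing_scheme.frequency * (spectrogram_duration // 2),
    duration=windowing_scheme.frequency * spectrogram_duration)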
Example #4
import zounds
from log import module_logger  # logging helper, as in Example #11

logger = module_logger(__file__)


def compute_embedding(samples, network):
    # TODO: resampling can fail for some odd sampling rates
    samples = zounds.soundfile.resample(samples, zounds.SR11025())
    freq = samples.frequency * 8192
    windowed = samples.sliding_window(
        zounds.SampleRate(frequency=freq, duration=freq))
    dims = windowed.dimensions
    output = zounds.learn.apply_network(network, windowed, chunksize=8)
    logger.info(output.shape)
    output = zounds.ArrayWithUnits(
        output, [dims[0], zounds.IdentityDimension()])
    return output
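
A hypothetical call site; samples is assumed to be a zounds.AudioSamples instance and network a trained PyTorch model accepted by zounds.learn.apply_network:

# each embedding row corresponds to one non-overlapping 8192-sample
# window (about 0.74 seconds at 11025 Hz)
embedding = compute_embedding(samples, network)
print(embedding.shape)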
Example #5
class Sound(BaseModel):
    windowed = zounds.ArrayWithUnitsFeature(
        zounds.SlidingWindow,
        wscheme=zounds.SampleRate(frequency=samplerate.frequency *
                                  (SAMPLE_SIZE // 2),
                                  duration=samplerate.frequency * SAMPLE_SIZE),
        needs=BaseModel.resampled,
        store=False)

    perceptual = zounds.ArrayWithUnitsFeature(perceptual, needs=windowed)

    decomposed = zounds.ArrayWithUnitsFeature(
        lambda x: FrequencyDecomposition(x, bands).as_frequency_adaptive(),
        needs=windowed)
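
This class depends on module-level names (samplerate, SAMPLE_SIZE, perceptual, FrequencyDecomposition, bands) that the snippet omits. A minimal, hypothetical sketch of the windowing settings, which produce half-overlapping SAMPLE_SIZE-sample windows:

samplerate = zounds.SR11025()  # assumption; matches the other examples here
SAMPLE_SIZE = 8192             # assumption; matches window_size in Example #9
wscheme = zounds.SampleRate(
    frequency=samplerate.frequency * (SAMPLE_SIZE // 2),  # hop: half a window
    duration=samplerate.frequency * SAMPLE_SIZE)          # window length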
Example #6
    def _process_samples(self, samples):
        samples = samples.mono
        samples = zounds.soundfile.resample(samples, SAMPLE_RATE)
        windowing_sample_rate = zounds.SampleRate(
            frequency=(FILTER_BANK_KERNEL_SIZE // 2) * SAMPLE_RATE.frequency,
            duration=FILTER_BANK_KERNEL_SIZE * SAMPLE_RATE.frequency)
        windowed = samples.sliding_window(windowing_sample_rate)
        windowed = np.asarray(windowed)
        spec = np.dot(FILTER_BANK, windowed.T).T
        spec = np.abs(spec)
        spec = 20 * np.log10(spec + 1)
        spec = np.ascontiguousarray(spec).astype(np.float32)
        spec = zounds.ArrayWithUnits(spec, [
            zounds.TimeDimension(*windowing_sample_rate),
            zounds.FrequencyDimension(scale)
        ])

        binary_data = BinaryData(spec)
        return binary_data
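
A minimal sketch of the module-level names this method assumes; SAMPLE_RATE and FILTER_BANK_KERNEL_SIZE match Example #11, while scale and FILTER_BANK below are illustrative stand-ins:

import numpy as np
import zounds

SAMPLE_RATE = zounds.SR11025()
FILTER_BANK_KERNEL_SIZE = 512
scale = zounds.GeometricScale(20, SAMPLE_RATE.nyquist, 0.05, 128)
# stand-in: any (n_bands, kernel_size) real-valued array of FIR filters
# covering the scale would work here
FILTER_BANK = np.random.randn(len(scale), FILTER_BANK_KERNEL_SIZE)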
Example #7
import numpy as np
import torch
import zounds
from scipy.signal import firwin


def generate_filter_banks(band_sizes):
    band_sizes = sorted(band_sizes)
    total_samples = band_sizes[-1]
    n_bands = [128] * 5

    n_taps = 256
    current_low_freq = 20

    for i, size in enumerate(band_sizes):
        ratio = (total_samples / size)
        new_sr = zounds.SampleRate(sr.frequency * ratio, sr.duration * ratio)

        if size == total_samples:
            freq_band = zounds.FrequencyBand(current_low_freq,
                                             new_sr.nyquist - 20)
        else:
            freq_band = zounds.FrequencyBand(current_low_freq, new_sr.nyquist)

        bandpass = firwin(n_taps, [int(new_sr) // 4, (int(new_sr) // 2) - 1],
                          fs=int(new_sr),
                          pass_zero=False).astype(np.float32)
        bandpass = torch.from_numpy(bandpass).to(device).view(1, 1, n_taps)

        scale = zounds.GeometricScale(freq_band.start_hz, freq_band.stop_hz,
                                      0.05, n_bands[i])
        bank = zounds.learn.FilterBank(
            new_sr,
            n_taps,
            scale,
            # values close to zero get good frequency resolution.  Values close
            # to one get good time resolution
            0.25,
            normalize_filters=False,
            a_weighting=False).to(device)

        current_low_freq = freq_band.stop_hz
        yield bank, bandpass
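
A hypothetical driver for the generator above; sr and device are module-level globals in the original file and are assumed here:

sr = zounds.SR11025()         # assumption; the original sr is not shown
device = torch.device('cpu')

# five band sizes, matching n_bands = [128] * 5
for bank, bandpass in generate_filter_banks([1024, 2048, 4096, 8192, 16384]):
    print(bank, bandpass.shape)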
Example #8
    # the opening of this example was truncated in the original listing;
    # a minimal, hypothetical reconstruction of the argument parsing
    parser = argparse.ArgumentParser()
    parser.add_argument('--sound-uri', type=str, required=True)
    args = parser.parse_args()

    _id = Sound.process(meta=args.sound_uri)
    snd = Sound(_id)

    original = snd.resampled
    slow = zounds.AudioSamples(time_stretch(original, 0.75).squeeze(), sr)
    fast = zounds.AudioSamples(time_stretch(original, 1.25).squeeze(), sr)

    higher = zounds.AudioSamples(pitch_shift(original, 1.0).squeeze(), sr)
    lower = zounds.AudioSamples(pitch_shift(original, -1.0).squeeze(), sr)

    # apply a sliding window to demonstrate time stretch and pitch shift in
    # batch mode
    windowing_sr = zounds.SampleRate(frequency=zounds.Seconds(5),
                                     duration=zounds.Seconds(10))

    windowed = snd.resampled.sliding_window(windowing_sr)
    windowed = zounds.ArrayWithUnits(
        windowed, [zounds.IdentityDimension(), windowed.dimensions[1]])

    def samples(x):
        return zounds.AudioSamples(x, sr)

    batch_slow = map(samples, time_stretch(windowed, 0.75))
    batch_fast = map(samples, time_stretch(windowed, 1.25))

    batch_higher = map(samples, pitch_shift(windowed, 1.0))
    batch_lower = map(samples, pitch_shift(windowed, -1.0))

    # the original listing was truncated here; a minimal completion of the
    # in-browser REPL setup that zounds examples typically use
    app = zounds.ZoundsApp(model=Sound,
                           globals=globals(),
                           locals=locals())
    app.start(8888)
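
The imports for this example were truncated by the scrape; the names it uses are assumed to resolve roughly like this:

# assumed imports for Example #8 (not shown in the original listing)
import argparse
import zounds
from zounds.spectral import time_stretch, pitch_shift

sr = zounds.SR11025()  # assumption; any zounds audio samplerate works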
Example #9
import featureflow as ff
import numpy as np
from torch import nn
from torch.nn import functional as F
from torch.optim import Adam

import zounds
from zounds.learn import Conv1d, ConvTranspose1d, to_var, from_var
from zounds.timeseries import categorical, inverse_categorical

samplerate = zounds.SR11025()
BaseModel = zounds.resampled(resample_to=samplerate, store_resampled=True)

window_size = 8192
wscheme = zounds.SampleRate(frequency=samplerate.frequency *
                            (window_size // 2),
                            duration=samplerate.frequency * window_size)


@zounds.simple_lmdb_settings('ae', map_size=1e10, user_supplied_id=True)
class Sound(BaseModel):
    windowed = zounds.ArrayWithUnitsFeature(zounds.SlidingWindow,
                                            wscheme=wscheme,
                                            needs=BaseModel.resampled)

    mu_law = zounds.ArrayWithUnitsFeature(zounds.mu_law, needs=windowed)

    categorical = zounds.ArrayWithUnitsFeature(categorical, needs=windowed)


Example #10
# TODO: Factor out the part of the pipeline that starts with samples and
scale_bands = 96
spectrogram_duration = 64

anchor_slice = slice(spectrogram_duration, spectrogram_duration * 2)

scale = zounds.GeometricScale(start_center_hz=50,
                              stop_center_hz=samplerate.nyquist,
                              bandwidth_ratio=0.115,
                              n_bands=scale_bands)
scale.ensure_overlap_ratio()


windowing_scheme = zounds.HalfLapped()
spectrogram_sample_rate = zounds.SampleRate(
    frequency=windowing_scheme.frequency * (spectrogram_duration // 2),
    duration=windowing_scheme.frequency * spectrogram_duration)


def spectrogram(x):
    x = apply_scale(np.abs(x.real),
                    scale,
                    window=zounds.OggVorbisWindowingFunc())
    x = zounds.log_modulus(x * 100)
    return x * zounds.AWeighting()


@zounds.simple_lmdb_settings('spectrogram_embedding',
                             map_size=1e11,
                             user_supplied_id=True)
class Sound(BaseModel):
    # the feature definitions were truncated in the original listing; the
    # module-level names above (windowing_scheme, spectrogram,
    # spectrogram_sample_rate) are the ones Example #3's features consume
    ...
Example #11
import requests
import zounds
from io import BytesIO
from bot_helper import BinaryData, main, SoundListener
import numpy as np
from log import module_logger

logger = module_logger(__file__)

SAMPLE_RATE = zounds.SR11025()
FILTER_BANK_KERNEL_SIZE = 512

windowing_sample_rate = zounds.SampleRate(
    frequency=(FILTER_BANK_KERNEL_SIZE // 2) * SAMPLE_RATE.frequency,
    duration=FILTER_BANK_KERNEL_SIZE * SAMPLE_RATE.frequency)


class FFTListener(SoundListener):
    def __init__(self, client, s3_client, page_size=3, logger=None):
        super().__init__(client, s3_client, page_size, logger)

    def _process_samples(self, samples):
        samples = samples.mono
        samples = zounds.soundfile.resample(samples, SAMPLE_RATE)

        spec = zounds.spectral.stft(samples, windowing_sample_rate)
        dims = spec.dimensions
        spec = np.abs(spec)
        spec = spec.astype(np.float32)
        spec = zounds.ArrayWithUnits(spec, dims)
        binary_data = BinaryData(spec)
        return binary_data
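
A quick check of the windowing arithmetic above: at 11025 Hz, a 512-sample kernel with a half-kernel hop yields roughly 46 ms windows every 23 ms.

hop_seconds = (FILTER_BANK_KERNEL_SIZE // 2) / int(SAMPLE_RATE)  # ~0.023 s
window_seconds = FILTER_BANK_KERNEL_SIZE / int(SAMPLE_RATE)      # ~0.046 s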