class Document(BaseModel):
    """
    Feature graph that derives Bark-band and frequency-adaptive features
    from the features exposed by ``BaseModel``.

    Every feature declared here is persisted (``store=True``). The
    attributes are declared in dependency order because each ``needs=``
    argument refers to a feature defined above it.
    """

    # zounds.BarkBands applied to BaseModel.fft, with the upper band limit
    # set to the Nyquist frequency of the module-level `samplerate`.
    bark = zounds.ArrayWithUnitsFeature(
        zounds.BarkBands,
        samplerate=samplerate,
        stop_freq_hz=samplerate.nyquist,
        needs=BaseModel.fft,
        store=True)

    # Sliding window over the resampled audio using a scheme with
    # frequency=500 ms and duration=1 s, i.e. presumably one-second windows
    # that advance by half their length (assumes `frequency` is the hop --
    # TODO confirm). `windowing_func` is defined outside this class.
    long_windowed = zounds.ArrayWithUnitsFeature(
        zounds.SlidingWindow,
        wscheme=zounds.SampleRate(
            frequency=zounds.Milliseconds(500),
            duration=zounds.Seconds(1)),
        wfunc=windowing_func,
        needs=BaseModel.resampled,
        store=True)

    # zounds.DCT applied to each long window.
    dct = zounds.ArrayWithUnitsFeature(
        zounds.DCT,
        scale_always_even=True,
        needs=long_windowed,
        store=True)

    # Frequency-adaptive transform of the DCT coefficients, using
    # scipy.fftpack.idct as the per-band transform and the module-level
    # `scale` to define the bands.
    mdct = zounds.FrequencyAdaptiveFeature(
        zounds.FrequencyAdaptiveTransform,
        transform=scipy.fftpack.idct,
        scale=scale,
        needs=dct,
        store=True)
class Sound(BaseModel):
    """
    An audio processing pipeline that computes a frequency domain
    representation of the sound that follows a geometric scale.

    The three intermediate features derived from the raw audio (``bark``,
    ``long_windowed`` and ``long_fft``) are stored; the frequency-adaptive
    representation and its rasterized form are declared with
    ``store=False``.
    """

    # zounds.BarkBands applied to BaseModel.fft, with the upper band limit
    # set to the Nyquist frequency of the module-level `samplerate`.
    bark = zounds.ArrayWithUnitsFeature(
        zounds.BarkBands,
        samplerate=samplerate,
        stop_freq_hz=samplerate.nyquist,
        needs=BaseModel.fft,
        store=True)

    # Sliding window over the resampled audio: frequency=358 ms and
    # duration=716 ms (duration is exactly twice the frequency), tapered
    # with zounds.OggVorbisWindowingFunc.
    long_windowed = zounds.ArrayWithUnitsFeature(
        zounds.SlidingWindow,
        wscheme=zounds.SampleRate(
            frequency=zounds.Milliseconds(358),
            duration=zounds.Milliseconds(716)),
        wfunc=zounds.OggVorbisWindowingFunc(),
        needs=BaseModel.resampled,
        store=True)

    # zounds.FFT of each long window.
    long_fft = zounds.ArrayWithUnitsFeature(
        zounds.FFT,
        needs=long_windowed,
        store=True)

    # Frequency-adaptive transform of the long FFT: np.fft.irfft is the
    # per-band transform, np.hanning the window function, and the
    # module-level `scale` defines the bands.
    freq_adaptive = zounds.FrequencyAdaptiveFeature(
        zounds.FrequencyAdaptiveTransform,
        transform=np.fft.irfft,
        scale=scale,
        window_func=np.hanning,
        needs=long_fft,
        store=False)

    # Calls `rasterize(64)` on the frequency-adaptive feature's value.
    rasterized = zounds.ArrayWithUnitsFeature(
        lambda fa: fa.rasterize(64),
        needs=freq_adaptive,
        store=False)
class SoundWithNoSettings(BaseModel):
    """
    Spectrogram feature graph built on ``BaseModel.resampled``.

    Only ``geom`` is declared with ``store=True``; the other features do
    not pass a ``store`` argument. ``windowing_scheme``, ``spectrogram``,
    ``spectrogram_duration`` and ``spectrogram_sample_rate`` are defined
    outside this class.
    """

    # Short analysis windows over the resampled audio, tapered with
    # zounds.OggVorbisWindowingFunc.
    short_windowed = zounds.ArrayWithUnitsFeature(
        zounds.SlidingWindow,
        wscheme=windowing_scheme,
        wfunc=zounds.OggVorbisWindowingFunc(),
        needs=BaseModel.resampled)

    # zounds.FFT of each short window.
    fft = zounds.ArrayWithUnitsFeature(
        zounds.FFT,
        needs=short_windowed)

    # Output of the `spectrogram` callable applied to the FFT frames.
    geom = zounds.ArrayWithUnitsFeature(
        spectrogram,
        needs=fft,
        store=True)

    # Sliding window over `geom`. The scheme's frequency is
    # (spectrogram_duration // 2) frame intervals and its duration is
    # spectrogram_duration * 3 frame intervals, so each window lasts three
    # times as long as the ones produced by `ls` below (assuming
    # `spectrogram_sample_rate` is defined as it is elsewhere in this
    # project -- TODO confirm).
    log_spectrogram = zounds.ArrayWithUnitsFeature(
        zounds.SlidingWindow,
        wscheme=zounds.SampleRate(
            frequency=windowing_scheme.frequency * (spectrogram_duration // 2),
            duration=windowing_scheme.frequency * spectrogram_duration * 3),
        needs=geom)

    # Sliding window over `geom` using `spectrogram_sample_rate`.
    ls = zounds.ArrayWithUnitsFeature(
        zounds.SlidingWindow,
        wscheme=spectrogram_sample_rate,
        needs=geom)
def compute_embedding(samples, network):
    """
    Run ``network`` over fixed-size windows of ``samples`` and return the
    result as an array with units.

    The audio is resampled to ``zounds.SR11025()``, cut into
    non-overlapping windows spanning 8192 sample periods each (the window
    scheme's frequency and duration are equal), and passed through
    ``zounds.learn.apply_network`` in chunks of 8. The shape of the raw
    network output is logged at INFO level.

    Args:
        samples: audio samples accepted by ``zounds.soundfile.resample``.
        network: the model handed to ``zounds.learn.apply_network``.

    Returns:
        A ``zounds.ArrayWithUnits`` whose first dimension is the first
        dimension of the windowed audio and whose second dimension is a
        ``zounds.IdentityDimension``.
    """
    # TODO: resampling can fail for some odd sampling rates
    resampled = zounds.soundfile.resample(samples, zounds.SR11025())

    window_span = resampled.frequency * 8192
    window_scheme = zounds.SampleRate(
        frequency=window_span, duration=window_span)
    chunks = resampled.sliding_window(window_scheme)
    time_dimension = chunks.dimensions[0]

    raw_embedding = zounds.learn.apply_network(network, chunks, chunksize=8)
    logger.info(raw_embedding.shape)

    return zounds.ArrayWithUnits(
        raw_embedding, [time_dimension, zounds.IdentityDimension()])
class Sound(BaseModel):
    """
    Feature graph that windows the resampled audio and derives two
    representations from each window.

    ``samplerate``, ``SAMPLE_SIZE``, ``perceptual``, ``bands`` and
    ``FrequencyDecomposition`` are defined outside this class.
    """

    # Sliding window over the resampled audio. The scheme's duration is
    # SAMPLE_SIZE sample intervals and its frequency is half of that
    # (integer division). Declared with store=False.
    windowed = zounds.ArrayWithUnitsFeature(
        zounds.SlidingWindow,
        wscheme=zounds.SampleRate(
            frequency=samplerate.frequency * (SAMPLE_SIZE // 2),
            duration=samplerate.frequency * SAMPLE_SIZE),
        needs=BaseModel.resampled,
        store=False)

    # The `perceptual` on the right-hand side is looked up outside the
    # class, because the class attribute of the same name is not bound
    # until this assignment completes.
    perceptual = zounds.ArrayWithUnitsFeature(perceptual, needs=windowed)

    # Wraps each window in a FrequencyDecomposition over `bands` and
    # converts it with `as_frequency_adaptive()`.
    decomposed = zounds.ArrayWithUnitsFeature(
        lambda x: FrequencyDecomposition(x, bands).as_frequency_adaptive(),
        needs=windowed)
def _process_samples(self, samples):
    """
    Convert audio samples into a log-magnitude filter-bank spectrogram
    wrapped in ``BinaryData``.

    The samples are mixed to mono, resampled to ``SAMPLE_RATE`` and cut
    into windows spanning ``FILTER_BANK_KERNEL_SIZE`` sample intervals,
    advancing by half of that. Each window is projected onto
    ``FILTER_BANK``; the magnitudes are mapped through
    ``20 * log10(x + 1)`` and stored as contiguous ``float32``.

    Args:
        samples: audio samples exposing ``.mono`` and accepted by
            ``zounds.soundfile.resample``.

    Returns:
        ``BinaryData`` wrapping a ``zounds.ArrayWithUnits`` with a time
        dimension built from the window scheme and a frequency dimension
        over the module-level ``scale``.
    """
    mono = zounds.soundfile.resample(samples.mono, SAMPLE_RATE)

    sample_interval = SAMPLE_RATE.frequency
    window_scheme = zounds.SampleRate(
        frequency=(FILTER_BANK_KERNEL_SIZE // 2) * sample_interval,
        duration=FILTER_BANK_KERNEL_SIZE * sample_interval)
    frames = np.asarray(mono.sliding_window(window_scheme))

    # Project every frame onto the filter bank, then keep magnitudes only.
    magnitudes = np.abs(np.dot(FILTER_BANK, frames.T).T)
    # The +1 keeps the logarithm finite (and non-negative) for silent frames.
    log_magnitudes = 20 * np.log10(magnitudes + 1)
    packed = np.ascontiguousarray(log_magnitudes).astype(np.float32)

    dimensions = [
        zounds.TimeDimension(*window_scheme),
        zounds.FrequencyDimension(scale),
    ]
    return BinaryData(zounds.ArrayWithUnits(packed, dimensions))
def generate_filter_banks(band_sizes):
    """
    Yield one ``(filter_bank, bandpass_kernel)`` pair per band size.

    Band sizes are processed in ascending order and the largest one is
    taken as the reference (``total_samples``). For each size, a sample
    rate is derived from the module-level ``sr`` by multiplying both its
    frequency and its duration by ``total_samples / size``.

    The frequency bands are contiguous: the first starts at 20 Hz and each
    later band starts where the previous one stopped. A band stops at the
    Nyquist frequency of its derived sample rate, except for the band
    belonging to the largest size, which stops 20 Hz below Nyquist.

    Args:
        band_sizes: iterable of band sizes in samples. It must not be
            empty; any number of sizes is supported, each getting 128
            filters.

    Yields:
        ``(bank, bandpass)`` where ``bank`` is a ``zounds.learn.FilterBank``
        over a geometric scale for the band, moved to ``device``, and
        ``bandpass`` is a ``float32`` tensor of shape ``(1, 1, n_taps)`` on
        ``device`` holding FIR coefficients from ``firwin``.

    Raises:
        IndexError: on first iteration, if ``band_sizes`` is empty.
    """
    band_sizes = sorted(band_sizes)
    total_samples = band_sizes[-1]
    # One filter count per band. This used to be a fixed five-element list,
    # which raised IndexError as soon as a sixth band size was supplied.
    n_bands = [128] * len(band_sizes)
    n_taps = 256
    current_low_freq = 20

    for i, size in enumerate(band_sizes):
        ratio = total_samples / size
        new_sr = zounds.SampleRate(sr.frequency * ratio, sr.duration * ratio)

        if size == total_samples:
            # Keep the top band slightly below Nyquist.
            freq_band = zounds.FrequencyBand(
                current_low_freq, new_sr.nyquist - 20)
        else:
            freq_band = zounds.FrequencyBand(current_low_freq, new_sr.nyquist)

        # Band-pass FIR kernel covering the upper half of this band's
        # spectrum: from a quarter of the sample rate to just below half.
        samples_per_second = int(new_sr)
        bandpass = firwin(
            n_taps,
            [samples_per_second // 4, (samples_per_second // 2) - 1],
            fs=samples_per_second,
            pass_zero=False).astype(np.float32)
        bandpass = torch.from_numpy(bandpass).to(device).view(1, 1, n_taps)

        scale = zounds.GeometricScale(
            freq_band.start_hz, freq_band.stop_hz, 0.05, n_bands[i])
        bank = zounds.learn.FilterBank(
            new_sr,
            n_taps,
            scale,
            # values close to zero get good frequency resolution. Values
            # close to one get good time resolution
            0.25,
            normalize_filters=False,
            a_weighting=False).to(device)

        # The next band begins where this one ends.
        current_low_freq = freq_band.stop_hz
        yield bank, bandpass
)
args = parser.parse_args()

# Process the sound named on the command line, then load the stored document.
_id = Sound.process(meta=args.sound_uri)
snd = Sound(_id)

# Single-signal variants of the resampled audio: two time stretches (factors
# 0.75 and 1.25) and two pitch shifts (+1.0 and -1.0). `.squeeze()` is applied
# to each result before it is wrapped as AudioSamples at rate `sr`.
original = snd.resampled
slow = zounds.AudioSamples(time_stretch(original, 0.75).squeeze(), sr)
fast = zounds.AudioSamples(time_stretch(original, 1.25).squeeze(), sr)
higher = zounds.AudioSamples(pitch_shift(original, 1.0).squeeze(), sr)
lower = zounds.AudioSamples(pitch_shift(original, -1.0).squeeze(), sr)

# apply a sliding window to demonstrate time stretch and pitch shift in
# batch mode
windowing_sr = zounds.SampleRate(
    frequency=zounds.Seconds(5),
    duration=zounds.Seconds(10))
windowed = snd.resampled.sliding_window(windowing_sr)
# Replace the first dimension with an IdentityDimension, keeping the
# per-window dimension unchanged.
windowed = zounds.ArrayWithUnits(
    windowed, [zounds.IdentityDimension(), windowed.dimensions[1]])


def samples(x):
    # Wrap one batch item as AudioSamples at the module-level rate `sr`.
    return zounds.AudioSamples(x, sr)


# NOTE(review): under Python 3 `map` returns a lazy, single-use iterator, so
# each of these can be consumed only once -- confirm the consumers do not
# iterate them repeatedly.
batch_slow = map(samples, time_stretch(windowed, 0.75))
batch_fast = map(samples, time_stretch(windowed, 1.25))
batch_higher = map(samples, pitch_shift(windowed, 1.0))
batch_lower = map(samples, pitch_shift(windowed, -1.0))

app = zounds.ZoundsApp(model=Sound,
import featureflow as ff
import numpy as np
from torch import nn
from torch.nn import functional as F
from torch.optim import Adam

import zounds
from zounds.learn import Conv1d, ConvTranspose1d, to_var, from_var
from zounds.timeseries import categorical, inverse_categorical

# All audio is resampled to this rate, and the resampled signal is stored.
samplerate = zounds.SR11025()
BaseModel = zounds.resampled(resample_to=samplerate, store_resampled=True)

# Window scheme: duration of `window_size` sample intervals, with a frequency
# of half that many (integer division).
window_size = 8192
wscheme = zounds.SampleRate(
    frequency=samplerate.frequency * (window_size // 2),
    duration=samplerate.frequency * window_size)


@zounds.simple_lmdb_settings('ae', map_size=1e10, user_supplied_id=True)
class Sound(BaseModel):
    """
    Feature graph that windows the resampled audio and derives mu-law and
    categorical encodings of each window.

    Storage is configured by the ``simple_lmdb_settings`` decorator with the
    name ``'ae'`` and caller-supplied ids.
    """

    # Sliding window over the resampled audio using the module-level `wscheme`.
    windowed = zounds.ArrayWithUnitsFeature(
        zounds.SlidingWindow,
        wscheme=wscheme,
        needs=BaseModel.resampled)

    # zounds.mu_law applied to each window.
    mu_law = zounds.ArrayWithUnitsFeature(zounds.mu_law, needs=windowed)

    # The `categorical` on the right-hand side is the function imported from
    # zounds.timeseries; the class attribute of the same name is not bound
    # until this assignment completes.
    categorical = zounds.ArrayWithUnitsFeature(categorical, needs=windowed)

# TODO: Factor out the part of the pipeline that starts with samples and
# Number of bands in the geometric frequency scale.
scale_bands = 96
# Length of one spectrogram excerpt, measured in analysis frames.
spectrogram_duration = 64
# Selects the second `spectrogram_duration`-sized span of a sequence.
anchor_slice = slice(spectrogram_duration, spectrogram_duration * 2)

# Geometric scale from 50 Hz up to the Nyquist frequency of the module-level
# `samplerate`.
scale = zounds.GeometricScale(
    start_center_hz=50,
    stop_center_hz=samplerate.nyquist,
    bandwidth_ratio=0.115,
    n_bands=scale_bands)
# Presumably validates that adjacent bands overlap -- TODO confirm against
# the zounds documentation.
scale.ensure_overlap_ratio()

# NOTE(review): redundant re-assignment; `spectrogram_duration` already holds
# 64 from the assignment near the top of this section.
spectrogram_duration = 64

windowing_scheme = zounds.HalfLapped()
# Window scheme for spectrogram excerpts: duration of `spectrogram_duration`
# frame intervals, with a frequency of half that many.
spectrogram_sample_rate = zounds.SampleRate(
    frequency=windowing_scheme.frequency * (spectrogram_duration // 2),
    duration=windowing_scheme.frequency * spectrogram_duration)


def spectrogram(x):
    """
    Map ``x`` onto the module-level ``scale`` and compress its dynamics.

    The absolute value of the real part of ``x`` is passed through
    ``apply_scale`` with an Ogg Vorbis window, scaled by 100, passed through
    ``zounds.log_modulus`` and finally multiplied by ``zounds.AWeighting()``.

    Args:
        x: array exposing ``.real``; presumably FFT coefficients -- verify
            against the caller.

    Returns:
        The weighted result of the steps above.
    """
    x = apply_scale(
        np.abs(x.real), scale, window=zounds.OggVorbisWindowingFunc())
    x = zounds.log_modulus(x * 100)
    return x * zounds.AWeighting()


@zounds.simple_lmdb_settings('spectrogram_embedding', map_size=1e11, user_supplied_id=True)
class Sound(BaseModel):
import requests
import zounds
from io import BytesIO
from bot_helper import BinaryData, main, SoundListener
import numpy as np
from log import module_logger

logger = module_logger(__file__)

# Incoming audio is resampled to this rate before analysis.
SAMPLE_RATE = zounds.SR11025()
FILTER_BANK_KERNEL_SIZE = 512
# Window scheme: duration of FILTER_BANK_KERNEL_SIZE sample intervals, with a
# frequency of half that many (integer division).
windowing_sample_rate = zounds.SampleRate(
    frequency=(FILTER_BANK_KERNEL_SIZE // 2) * SAMPLE_RATE.frequency,
    duration=FILTER_BANK_KERNEL_SIZE * SAMPLE_RATE.frequency)


class FFTListener(SoundListener):
    """Sound listener that turns audio into an STFT magnitude spectrogram."""

    def __init__(self, client, s3_client, page_size=3, logger=None):
        # Forwards every argument to SoundListener unchanged. Inside this
        # method the `logger` parameter shadows the module-level `logger`.
        super().__init__(client, s3_client, page_size, logger)

    def _process_samples(self, samples):
        # Mix to mono and resample to SAMPLE_RATE.
        samples = samples.mono
        samples = zounds.soundfile.resample(samples, SAMPLE_RATE)
        # Short-time Fourier transform using the module-level window scheme.
        spec = zounds.spectral.stft(samples, windowing_sample_rate)
        # Keep the dimensions so they can be re-attached after the numpy
        # operations below.
        dims = spec.dimensions
        spec = np.abs(spec)
        spec = spec.astype(np.float32)
        spec = zounds.ArrayWithUnits(spec, dims)
        binary_data = BinaryData(spec)