"""Use a triplet loss to learn a similarity metric between short spectrograms. Reference: Unsupervised Learning of Semantic Audio Representations, https://arxiv.org/pdf/1711.02209.pdf """ import numpy as np import zounds from zounds.spectral import apply_scale samplerate = zounds.SR11025() BaseModel = zounds.resampled(resample_to=samplerate, store_resampled=True) scale_bands = 96 spectrogram_duration = 64 anchor_slice = slice(spectrogram_duration, spectrogram_duration * 2) scale = zounds.GeometricScale( start_center_hz=50, stop_center_hz=samplerate.nyquist, bandwidth_ratio=0.115, n_bands=scale_bands) scale.ensure_overlap_ratio() spectrogram_duration = 64 windowing_scheme = zounds.HalfLapped() spectrogram_sample_rate = zounds.SampleRate( frequency=windowing_scheme.frequency * (spectrogram_duration // 2),
import zounds

# Base document class produced by zounds.resampled(); audio is resampled to
# the rate given by zounds.SR11025().
Resampled = zounds.resampled(resample_to=zounds.SR11025())


@zounds.simple_in_memory_settings
class Sound(Resampled):
    """
    Minimal feature pipeline: sliding windows -> MDCT -> A-weighted MDCT.

    Feature data is "persisted" in an in-memory store, as configured by the
    ``simple_in_memory_settings`` decorator.
    """

    # Half-lapped sliding windows over the resampled audio, shaped with the
    # Ogg Vorbis windowing function. This is the only feature declared with
    # store=True.
    windowed = zounds.ArrayWithUnitsFeature(
        zounds.SlidingWindow,
        wscheme=zounds.HalfLapped(),
        wfunc=zounds.OggVorbisWindowingFunc(),
        needs=Resampled.resampled,
        store=True)

    # Modified discrete cosine transform computed from the windowed feature.
    mdct = zounds.ArrayWithUnitsFeature(
        zounds.MDCT,
        needs=windowed)

    # MDCT coefficients multiplied by an A-weighting, i.e. the perceptually
    # weighted version of the transform.
    weighted = zounds.ArrayWithUnitsFeature(
        lambda x: x * zounds.AWeighting(),
        needs=mdct)


if __name__ == '__main__':
    # produce some audio to test our pipeline, and encode it as FLAC
    synth = zounds.SineSynthesizer(zounds.SR44100())
import zounds

# Base document class produced by zounds.resampled(); audio is resampled to
# the rate given by zounds.SR11025().
Resampled = zounds.resampled(resample_to=zounds.SR11025())


@zounds.simple_in_memory_settings
class Sound(Resampled):
    """
    Minimal feature pipeline: sliding windows -> MDCT -> A-weighted MDCT.

    Feature data is "persisted" in an in-memory store, as configured by the
    ``simple_in_memory_settings`` decorator.
    """

    # Half-lapped sliding windows over the resampled audio, shaped with the
    # Ogg Vorbis windowing function. This is the only feature declared with
    # store=True.
    windowed = zounds.ArrayWithUnitsFeature(
        zounds.SlidingWindow,
        wscheme=zounds.HalfLapped(),
        wfunc=zounds.OggVorbisWindowingFunc(),
        needs=Resampled.resampled,
        store=True)

    # Modified discrete cosine transform computed from the windowed feature.
    mdct = zounds.ArrayWithUnitsFeature(
        zounds.MDCT,
        needs=windowed)

    # MDCT coefficients multiplied by an A-weighting, i.e. the perceptually
    # weighted version of the transform.
    weighted = zounds.ArrayWithUnitsFeature(
        lambda x: x * zounds.AWeighting(),
        needs=mdct)


if __name__ == '__main__':
    # produce some audio to test our pipeline, and encode it as FLAC
    synth = zounds.SineSynthesizer(zounds.SR44100())

    # Synthesize zounds.Seconds(5) of audio from the three listed
    # frequencies (220, 440 and 880).
    samples = synth.synthesize(
        zounds.Seconds(5),
        [220., 440., 880.])

    # Encode the synthesized samples using the 'FLAC' format.
    encoded = samples.encode(fmt='FLAC')
import argparse
from random import choice

import featureflow as ff
import numpy as np
from torch import nn
from torch.nn import functional as F
from torch.optim import Adam

import zounds
from zounds.learn import Conv1d, ConvTranspose1d, to_var, from_var
from zounds.timeseries import categorical, inverse_categorical

# Sample rate used for all audio in this script, and the base document class
# that resamples to it (store_resampled=True is passed through to zounds).
samplerate = zounds.SR11025()
BaseModel = zounds.resampled(resample_to=samplerate, store_resampled=True)

# Windowing scheme expressed in multiples of samplerate.frequency:
# frequency spans window_size // 2 of them and duration spans window_size.
# NOTE(review): presumably samplerate.frequency is the period of one sample,
# which would make these 8192-sample windows with 50% overlap - confirm.
window_size = 8192
wscheme = zounds.SampleRate(
    frequency=samplerate.frequency * (window_size // 2),
    duration=samplerate.frequency * window_size)


@zounds.simple_lmdb_settings('ae', map_size=1e10, user_supplied_id=True)
class Sound(BaseModel):
    """
    Document class whose features are derived from windowed, resampled audio.

    Only the features visible in this fragment are described; the class body
    may continue beyond it.
    """

    # Sliding windows over the resampled audio, using the module-level
    # windowing scheme.
    windowed = zounds.ArrayWithUnitsFeature(
        zounds.SlidingWindow,
        wscheme=wscheme,
        needs=BaseModel.resampled)

    # zounds.mu_law applied to the windowed audio.
    mu_law = zounds.ArrayWithUnitsFeature(
        zounds.mu_law,
        needs=windowed)

    # The `categorical` passed as the extractor is the function imported from
    # zounds.timeseries: the class attribute of the same name is only bound
    # after the right-hand side has been evaluated. Like mu_law, this feature
    # is computed from `windowed`.
    categorical = zounds.ArrayWithUnitsFeature(
        categorical,
        needs=windowed)
import numpy as np
import zounds
from zounds.spectral import apply_scale

# Sample rate used for all audio, and the base document class that resamples
# to it.
samplerate = zounds.SR11025()
BaseModel = zounds.resampled(resample_to=samplerate)

scale_bands = 96
spectrogram_duration = 64

# Selects the second run of `spectrogram_duration` elements, i.e. indices
# [spectrogram_duration, 2 * spectrogram_duration).
anchor_slice = slice(spectrogram_duration, spectrogram_duration * 2)

# Geometric frequency scale with `scale_bands` bands, from a 50 Hz centre up
# to the Nyquist frequency of `samplerate`.
scale = zounds.GeometricScale(
    start_center_hz=50,
    stop_center_hz=samplerate.nyquist,
    bandwidth_ratio=0.115,
    n_bands=scale_bands)
scale.ensure_overlap_ratio()

# Assigned a second time with the same value in the original script; kept so
# that module-level execution is unchanged.
spectrogram_duration = 64

# Sample rate of whole spectrograms, expressed in multiples of the half-lapped
# windowing scheme's frequency: duration spans `spectrogram_duration` of them
# and frequency spans half that many.
windowing_scheme = zounds.HalfLapped()
spectrogram_sample_rate = zounds.SampleRate(
    frequency=windowing_scheme.frequency * (spectrogram_duration // 2),
    duration=windowing_scheme.frequency * spectrogram_duration)


def spectrogram(x):
    """
    Map the magnitude of the real part of `x` onto `scale`, then pass the
    result, multiplied by 100, through `zounds.log_modulus`.

    NOTE(review): the visible source ends after the log-modulus step and
    shows no return statement; the final value is left bound to `x` -
    confirm against the complete file.
    """
    magnitudes = np.abs(x.real)
    x = apply_scale(
        magnitudes,
        scale,
        window=zounds.OggVorbisWindowingFunc())
    x = zounds.log_modulus(x * 100)