def listen():
    """Run a batch of samples drawn from walk2 through the trained
    pipeline and stitch the decoded segments into a single playable
    clip, separating segments with 250 ms of silence.

    Returns a zounds.AudioSamples at 11.025 kHz.
    """
    gap = zounds.Milliseconds(250)
    batch = np.concatenate(list(walk2(1000)))
    decoded = p.pipeline.transform(batch).data.squeeze()
    padded_segments = [
        zounds.AudioSamples(segment, samplerate).pad_with_silence(gap)
        for segment in decoded
    ]
    return zounds.AudioSamples(np.concatenate(padded_segments), zounds.SR11025())
def compute_embedding(samples, network):
    """Resample audio to 11.025 kHz, cut it into non-overlapping
    8192-sample windows, and push the windows through `network`.

    Returns a zounds.ArrayWithUnits whose first dimension is the time
    dimension of the windowed input and whose second is an identity
    (embedding-channel) dimension.
    """
    # TODO: resampling can fail for some odd sampling rates
    resampled = zounds.soundfile.resample(samples, zounds.SR11025())

    # frequency == duration -> adjacent, non-overlapping windows
    step = resampled.frequency * 8192
    windows = resampled.sliding_window(
        zounds.SampleRate(frequency=step, duration=step))
    time_dim = windows.dimensions[0]

    embeddings = zounds.learn.apply_network(network, windows, chunksize=8)
    logger.info(embeddings.shape)
    return zounds.ArrayWithUnits(
        embeddings, [time_dim, zounds.IdentityDimension()])
def test_spectral_filtering():
    """Shape white noise with time-varying spectral envelopes and rebuild
    audio via windowed overlap-add; returns the result padded with
    trailing silence for listening.
    """
    n_samples = 16384
    win = 32
    hop = 16

    # per-frame filter magnitudes; original noted shape (1, 129, 64)
    envelopes = get_filter_coeffs(win, n_samples // hop)

    noise = np.random.uniform(-1, 1, n_samples)
    # pad so the final hop still yields a full window
    noise = np.pad(noise, ((0, hop), ), mode='constant')
    frames = zounds.sliding_window(noise, win, hop)

    # forward transform of each frame; original noted shape (1, 64, 256)
    spectra = np.fft.rfft(frames, axis=-1, norm='ortho')
    # align envelopes to (batch, frames, bins) before multiplying
    shaped = envelopes.transpose((0, 2, 1)) * spectra
    time_domain = np.fft.irfft(shaped, axis=-1, norm='ortho')

    rebuilt = np_overlap_add(time_domain[:, None, :, :], apply_window=True)
    rebuilt = rebuilt.squeeze()[:n_samples]
    return zounds.AudioSamples(rebuilt, zounds.SR11025()).pad_with_silence()
def __init__(self):
    """Build the embedding network: a learned mel filter-bank front end,
    four identical conv/pool stages, and a stack of linear projections
    ending in a 3-dimensional output.
    """
    super(EmbeddingNetwork, self).__init__()

    n_bands = 128
    sr = zounds.SR11025()
    band = zounds.FrequencyBand.audible_range(sr)
    scale = zounds.MelScale(band, n_bands)

    self.bank = zounds.learn.FilterBank(
        samplerate=sr,
        kernel_size=512,
        scale=scale,
        scaling_factors=np.linspace(0.1, 1.0, len(scale)),
        normalize_filters=True,
        a_weighting=True)

    # four identical conv/pool stages; only the first conv changes the
    # channel count (1 -> n_bands)
    stages = []
    in_channels = 1
    for _ in range(4):
        stages.append(nn.Conv2d(
            in_channels, n_bands, (13, 3), padding=(7, 1), bias=False))
        stages.append(nn.MaxPool2d((2, 2), (2, 2), padding=(1, 1)))
        in_channels = n_bands
    self.main = nn.Sequential(*stages)

    self.final = nn.Sequential(
        nn.Linear(128, 64, bias=False),
        nn.Linear(64, 32, bias=False),
        nn.Linear(32, 16, bias=False),
        nn.Linear(16, 8, bias=False),
    )

    self.linear = nn.Linear(8, 3, bias=False)
windowing = zounds.HalfLapped() # Segment audio files ######################################################### class Settings(ff.PersistenceSettings): id_provider = ff.UserSpecifiedIdProvider(key='_id') key_builder = ff.StringDelimitedKeyBuilder(seperator='|') database = ff.LmdbDatabase(path='onsetdata', key_builder=key_builder) event_log = ff.EventLog( path='onsetdataevents', channel=ff.InMemoryChannel()) STFT = zounds.stft( resample_to=zounds.SR11025(), wscheme=windowing) class WithOnsets(STFT, Settings): bark = zounds.ArrayWithUnitsFeature( zounds.BarkBands, needs=STFT.fft, store=True) transience = zounds.ArrayWithUnitsFeature( zounds.MeasureOfTransience, needs=STFT.fft, store=True) sliding_detection = zounds.ArrayWithUnitsFeature(
import featureflow as ff
import numpy as np
import zounds
from torch import nn
from torch import optim
import argparse
# NOTE(review): in Python 3, cpu_count does not appear to be re-exported
# from multiprocessing.pool (it was in Python 2) — confirm this import works
from multiprocessing.pool import ThreadPool, cpu_count

# Shared sample rate and base STFT processing graph for the Sound model.
samplerate = zounds.SR11025()
BaseModel = zounds.stft(resample_to=samplerate, store_fft=True)

# Geometric frequency scale spanning 300 Hz - 3040 Hz over 64 bands,
# widened so adjacent bands overlap by at least half.
scale = zounds.GeometricScale(
    start_center_hz=300,
    stop_center_hz=3040,
    bandwidth_ratio=0.07496,
    n_bands=64)
scale.ensure_overlap_ratio(0.5)


@zounds.simple_lmdb_settings('speeches', map_size=1e10, user_supplied_id=True)
class Sound(BaseModel):
    """
    An audio processing pipeline that computes a frequency domain
    representation of the sound that follows a geometric scale
    """
    # Bark-band representation derived from the base model's FFT
    bark = zounds.ArrayWithUnitsFeature(
        zounds.BarkBands,
        samplerate=samplerate,
        stop_freq_hz=samplerate.nyquist,
        needs=BaseModel.fft,
        store=True)

    # NOTE(review): definition truncated in the visible chunk
    long_windowed = zounds.ArrayWithUnitsFeature(
import requests
import zounds
import numpy as np
from bot_helper import BinaryData, main, AnnotationListener
from log import module_logger
from stft_bot import windowing_sample_rate

logger = module_logger(__file__)

# Fixed processing sample rate; chroma bins cover 20 Hz up to Nyquist.
SAMPLE_RATE = zounds.SR11025()
frequency_band = zounds.FrequencyBand(20, SAMPLE_RATE.nyquist)
CHROMA_SCALE = zounds.ChromaScale(frequency_band)


class ChromaListener(AnnotationListener):
    """Annotation listener that consumes 'stft_bot' annotations and
    (per get_metadata) emits float32 chroma features."""

    def __init__(self, client, s3_client, page_size=3, logger=None):
        # subscribes to annotations produced by the 'stft_bot' channel
        super().__init__(
            'stft_bot', client, s3_client, page_size, logger=logger)
        # dtype of the feature arrays this listener produces
        self.dtype = np.float32().dtype

    def get_metadata(self):
        """Describe the emitted feature: dtype, (variable, n_bands) shape,
        and its dimensions."""
        return {
            'type': str(self.dtype),
            'shape': ('variable', CHROMA_SCALE.n_bands),
            # NOTE(review): truncated in the visible chunk
            'dimensions': [{
                'type': 'time',
"""
Demonstrate how to download and process sounds from https://freesound.org
"""
import zounds
import argparse

# Base processing graph: resample everything to 11.025 kHz and STFT it.
BaseModel = zounds.stft(resample_to=zounds.SR11025())


@zounds.simple_lmdb_settings('freesound', map_size=1e10, user_supplied_id=True)
class Sound(BaseModel):
    """STFT pipeline extended with Bark-band and chroma features, both
    computed from the base model's FFT and persisted to LMDB."""
    bark = zounds.ArrayWithUnitsFeature(
        zounds.BarkBands, needs=BaseModel.fft, store=True)

    chroma = zounds.ArrayWithUnitsFeature(
        zounds.Chroma, needs=BaseModel.fft, store=True)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--api-key',
        help='your Freesound API key (http://freesound.org/apiv2/apply/)',
        type=str,
        required=True)
    # NOTE(review): argument definition truncated in the visible chunk
    parser.add_argument(
        '--query',
        help='the text query to run against freesound',
        type=str,
import zounds
from zounds.spectral import time_stretch, pitch_shift
from zounds.ui import AppSettings
import argparse

# Process everything at 11.025 kHz and keep the resampled audio around
# so it can be stretched/shifted below.
sr = zounds.SR11025()
BaseModel = zounds.stft(resample_to=sr, store_resampled=True)


@zounds.simple_in_memory_settings
class Sound(BaseModel):
    # No extra features needed; only the stored resampled audio is used.
    pass


if __name__ == '__main__':
    parser = argparse.ArgumentParser(parents=[AppSettings()])
    parser.add_argument(
        '--sound-uri',
        default='https://archive.org/download/LucaBrasi2/06-Kevin_Gates-Out_The_Mud_Prod_By_The_Runners_The_Monarch.ogg')
    args = parser.parse_args()

    # download/ingest the sound, then fetch its resampled audio
    _id = Sound.process(meta=args.sound_uri)
    snd = Sound(_id)

    original = snd.resampled
    # time_stretch < 1 slows playback; > 1 speeds it up
    slow = zounds.AudioSamples(time_stretch(original, 0.75).squeeze(), sr)
    fast = zounds.AudioSamples(time_stretch(original, 1.25).squeeze(), sr)
    # NOTE(review): chunk may be truncated after this line
    higher = zounds.AudioSamples(pitch_shift(original, 1.0).squeeze(), sr)
import zounds

# Base graph: resample incoming audio to 11.025 kHz.
Resampled = zounds.resampled(resample_to=zounds.SR11025())


@zounds.simple_in_memory_settings
class Sound(Resampled):
    """
    A simple pipeline that computes a perceptually weighted modified discrete
    cosine transform, and "persists" feature data in an in-memory store.
    """
    # Half-lapped windows with an Ogg Vorbis windowing function applied
    windowed = zounds.ArrayWithUnitsFeature(
        zounds.SlidingWindow,
        needs=Resampled.resampled,
        wscheme=zounds.HalfLapped(),
        wfunc=zounds.OggVorbisWindowingFunc(),
        store=True)

    # MDCT of each window
    mdct = zounds.ArrayWithUnitsFeature(
        zounds.MDCT,
        needs=windowed)

    # A-weighting emphasizes perceptually prominent frequencies
    weighted = zounds.ArrayWithUnitsFeature(
        lambda x: x * zounds.AWeighting(),
        needs=mdct)


if __name__ == '__main__':
    # produce some audio to test our pipeline, and encode it as FLAC
    synth = zounds.SineSynthesizer(zounds.SR44100())
    samples = synth.synthesize(zounds.Seconds(5), [220., 440., 880.])
    # NOTE(review): chunk may be truncated after this line
    encoded = samples.encode(fmt='FLAC')
def __init__(self, generator, discriminator, learning_rate, feature_size,
             audio_repr_class, generator_loss, discriminator_loss,
             g_init=weights_init, d_init=weights_init, feature_funcs=None,
             total_samples=16384, feature_channels=256,
             inference_sequence_factor=4, samplerate=zounds.SR11025(),
             sub_disc_loss=hinge_discriminator_loss,
             sub_gen_loss=hinge_generator_loss):
    """Wire up a GAN experiment: apply weight initializers to both
    networks, build one Adam optimizer and one trainer per network, and
    record the feature/audio configuration used during training.

    Raises ValueError when feature_funcs is missing or when either
    network still defines the deprecated initialize_weights() hook.
    """
    super().__init__()

    self.sub_gen_loss = sub_gen_loss
    self.sub_disc_loss = sub_disc_loss
    # how much longer than the test sequence the inference sequence is
    self.inference_sequence_factor = inference_sequence_factor

    if feature_funcs is None:
        raise ValueError('You must provide feature funcs')

    self.discriminator_init = d_init
    self.generator_init = g_init

    # the per-network initialize_weights() hook is no longer supported;
    # initializers are passed in via g_init / d_init instead
    if hasattr(generator, 'initialize_weights'):
        raise ValueError(
            'initialize_weights() method on generators is deprecated')
    if hasattr(discriminator, 'initialize_weights'):
        raise ValueError(
            'initialize_weights() method on discriminators is deprecated')

    def _make_optim(network):
        # both networks share the same Adam hyperparameters
        return Adam(network.parameters(), lr=learning_rate, betas=(0.5, 0.9))

    self.__g = generator
    self.__g.apply(g_init)
    self.__g_optim = _make_optim(self.__g)

    self.__d = discriminator
    self.__d.apply(d_init)
    self.__d_optim = _make_optim(self.__d)

    self.__g_trainer = GeneratorTrainer(
        self.__g, self.__g_optim, self.__d, self.__d_optim,
        generator_loss, self.sub_gen_loss)
    self.__d_trainer = DiscriminatorTrainer(
        self.__g, self.__g_optim, self.__d, self.__d_optim,
        discriminator_loss, self.sub_disc_loss)

    self.__feature_size = feature_size
    self.__audio_repr_class = audio_repr_class
    self.__anchor_feature = 'spectrogram'
    self.__feature_funcs = feature_funcs

    # alternate: one discriminator step, then one generator step
    self.training_steps = cycle(
        [self.discriminator_trainer, self.generator_trainer])

    self.samplerate = samplerate
    self.total_samples = total_samples
    self.feature_channels = feature_channels
import os
from torch import nn
from featuresynth.data import TrainingData
from featuresynth.generator import DDSPGenerator
from featuresynth.generator.ddsp import np_overlap_add
from featuresynth.util import device
from featuresynth.feature import \
    sr, total_samples, frequency_recomposition, feature_channels, band_sizes, \
    filter_banks, bandpass_filters, slices, compute_features
import numpy as np
from torch.optim import Adam
from featuresynth.discriminator import Discriminator
# NOTE(review): duplicate import (os was already imported above)
import os
from random import choice

# NOTE(review): this shadows the `sr` imported from featuresynth.feature
# above — confirm that is intentional. `zounds` is used here but not
# imported in the visible chunk.
sr = zounds.SR11025()
scale = zounds.MelScale(zounds.FrequencyBand(20, sr.nyquist), 128)
# Learned filter bank over a 128-band mel scale, moved to the compute device.
fb = zounds.learn.FilterBank(
    sr, 128, scale, np.linspace(0.25, 0.5, len(scale)),
    normalize_filters=False, a_weighting=False).to(device)
# attenuate the filters; presumably a loudness calibration — TODO confirm
fb.filter_bank = fb.filter_bank / 10


def perceptual(x, window=512, log_mag=False):
    """STFT-based feature used as a perceptual comparison space.

    NOTE(review): `F` and `torch` are not imported in the visible chunk,
    and this definition is truncated here.
    """
    # pad so the final hop yields a full frame
    x = F.pad(x, (0, window // 2))
    x = torch.stft(x, window, window // 2, normalized=True)
    if log_mag:
        # log of the real component's magnitude, floored to avoid log(0)
        x = torch.log(1e-12 + torch.abs(x[:, :, 0]))