Example #1
def listen():
    padding = zounds.Milliseconds(250)
    z = np.concatenate(list(walk2(1000)))
    result = p.pipeline.transform(z).data.squeeze()
    x = np.concatenate([
        zounds.AudioSamples(j, samplerate).pad_with_silence(padding)
        for j in result
    ])
    return zounds.AudioSamples(x, zounds.SR11025())
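Here walk2, p, and samplerate are defined elsewhere in the surrounding script. A minimal, self-contained sketch of the same concatenate-and-pad idiom, substituting a synthesizer for the pipeline output:

import numpy as np
import zounds

samplerate = zounds.SR11025()
synth = zounds.SineSynthesizer(samplerate)
padding = zounds.Milliseconds(250)

# three short tones, each followed by 250ms of silence
tones = [synth.synthesize(zounds.Seconds(1), [hz]) for hz in (220., 440., 880.)]
x = np.concatenate([t.pad_with_silence(padding) for t in tones])
audio = zounds.AudioSamples(x, samplerate)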
Example #2
def compute_embedding(samples, network):
    # TODO: resampling can fail for some odd sampling rates
    samples = zounds.soundfile.resample(samples, zounds.SR11025())
    freq = samples.frequency * 8192
    windowed = samples.sliding_window(
        zounds.SampleRate(frequency=freq, duration=freq))
    dims = windowed.dimensions
    output = zounds.learn.apply_network(network, windowed, chunksize=8)
    logger.info(output.shape)
    output = zounds.ArrayWithUnits(
        output, [dims[0], zounds.IdentityDimension()])
    return output
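Passing a SampleRate whose frequency equals its duration produces adjacent, non-overlapping 8192-sample windows. A small sketch of just that windowing step, using a synthesized tone as stand-in input:

import zounds

samplerate = zounds.SR11025()
synth = zounds.SineSynthesizer(samplerate)
samples = synth.synthesize(zounds.Seconds(2), [440.])

# frequency == duration means the hop equals the window size: no overlap
freq = samples.frequency * 8192
windowed = samples.sliding_window(zounds.SampleRate(frequency=freq, duration=freq))
print(windowed.shape)  # (n_windows, 8192)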
Example #3
def test_spectral_filtering():
    total_samples = 16384
    window_size = 32
    hop_size = 16

    coeffs = get_filter_coeffs(window_size, total_samples // hop_size)

    noise = np.random.uniform(-1, 1, total_samples)
    noise = np.pad(noise, ((0, hop_size), ), mode='constant')
    windowed = zounds.sliding_window(noise, window_size, hop_size)
    noise_coeffs = np.fft.rfft(windowed, axis=-1, norm='ortho')

    filtered = coeffs.transpose((0, 2, 1)) * noise_coeffs
    recovered = np.fft.irfft(filtered, axis=-1, norm='ortho')
    samples = np_overlap_add(recovered[:, None, :, :], apply_window=True)
    samples = samples.squeeze()[:total_samples]
    return zounds.AudioSamples(samples, zounds.SR11025()).pad_with_silence()
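The reconstruction above depends on rfft and irfft with norm='ortho' being exact inverses of one another; a tiny standalone check of that property:

import numpy as np

frame = np.random.uniform(-1, 1, 32)
spec = np.fft.rfft(frame, norm='ortho')   # 17 complex coefficients for 32 samples
back = np.fft.irfft(spec, norm='ortho')   # back to 32 real samples
assert np.allclose(frame, back)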
Example #4
    def __init__(self):
        super(EmbeddingNetwork, self).__init__()

        frequency_channels = 128
        channels = frequency_channels

        sr = zounds.SR11025()
        interval = zounds.FrequencyBand.audible_range(sr)
        scale = zounds.MelScale(interval, frequency_channels)
        self.bank = zounds.learn.FilterBank(
            samplerate=sr,
            kernel_size=512,
            scale=scale,
            scaling_factors=np.linspace(0.1, 1.0, len(scale)),
            normalize_filters=True,
            a_weighting=True)

        self.main = nn.Sequential(
            nn.Conv2d(1, channels, (13, 3), padding=(7, 1), bias=False),
            nn.MaxPool2d((2, 2), (2, 2), padding=(1, 1)),
            nn.Conv2d(channels, channels, (13, 3), padding=(7, 1), bias=False),
            nn.MaxPool2d((2, 2), (2, 2), padding=(1, 1)),
            nn.Conv2d(channels, channels, (13, 3), padding=(7, 1), bias=False),
            nn.MaxPool2d((2, 2), (2, 2), padding=(1, 1)),
            nn.Conv2d(channels, channels, (13, 3), padding=(7, 1), bias=False),
            nn.MaxPool2d((2, 2), (2, 2), padding=(1, 1)),
        )

        self.final = nn.Sequential(
            nn.Linear(128, 64, bias=False),
            nn.Linear(64, 32, bias=False),
            nn.Linear(32, 16, bias=False),
            nn.Linear(16, 8, bias=False),
        )

        self.linear = nn.Linear(8, 3, bias=False)
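Each Conv2d above uses a (13, 3) kernel with (7, 1) padding, which grows the frequency axis by two rows per layer, while each MaxPool2d roughly halves both axes. A quick shape check for a single stage, assuming a (batch, 1, bands, frames) input like the one FilterBank produces:

import torch
from torch import nn

stage = nn.Sequential(
    nn.Conv2d(1, 128, (13, 3), padding=(7, 1), bias=False),
    nn.MaxPool2d((2, 2), (2, 2), padding=(1, 1)))

x = torch.randn(1, 1, 128, 64)   # (batch, channels, mel bands, frames)
print(stage(x).shape)            # torch.Size([1, 128, 66, 33])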
Example #5
windowing = zounds.HalfLapped()


# Segment audio files #########################################################

class Settings(ff.PersistenceSettings):
    id_provider = ff.UserSpecifiedIdProvider(key='_id')
    key_builder = ff.StringDelimitedKeyBuilder(seperator='|')
    database = ff.LmdbDatabase(path='onsetdata', key_builder=key_builder)
    event_log = ff.EventLog(
        path='onsetdataevents', channel=ff.InMemoryChannel())


STFT = zounds.stft(
    resample_to=zounds.SR11025(),
    wscheme=windowing)


class WithOnsets(STFT, Settings):
    bark = zounds.ArrayWithUnitsFeature(
        zounds.BarkBands,
        needs=STFT.fft,
        store=True)

    transience = zounds.ArrayWithUnitsFeature(
        zounds.MeasureOfTransience,
        needs=STFT.fft,
        store=True)

    sliding_detection = zounds.ArrayWithUnitsFeature(
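The listing is cut off mid-declaration here. For orientation, a sketch of how a document class composed this way is typically driven, assuming the featureflow process pattern used in the later examples (the synthesized input and the 'test-sound' id are illustrative):

import zounds

synth = zounds.SineSynthesizer(zounds.SR11025())
samples = synth.synthesize(zounds.Seconds(2), [440.])

# ingest a test signal, then read computed features back by id
_id = WithOnsets.process(meta=samples.encode(fmt='FLAC'), _id='test-sound')
doc = WithOnsets(_id)
print(doc.bark.shape)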
Example #6
import featureflow as ff
import numpy as np
import zounds
from torch import nn
from torch import optim
import argparse
from multiprocessing import cpu_count
from multiprocessing.pool import ThreadPool

samplerate = zounds.SR11025()
BaseModel = zounds.stft(resample_to=samplerate, store_fft=True)

scale = zounds.GeometricScale(start_center_hz=300,
                              stop_center_hz=3040,
                              bandwidth_ratio=0.07496,
                              n_bands=64)
scale.ensure_overlap_ratio(0.5)


@zounds.simple_lmdb_settings('speeches', map_size=1e10, user_supplied_id=True)
class Sound(BaseModel):
    """
    An audio processing pipeline that computes a frequency domain representation
    of the sound that follows a geometric scale
    """
    bark = zounds.ArrayWithUnitsFeature(zounds.BarkBands,
                                        samplerate=samplerate,
                                        stop_freq_hz=samplerate.nyquist,
                                        needs=BaseModel.fft,
                                        store=True)

    long_windowed = zounds.ArrayWithUnitsFeature(
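The long_windowed declaration is truncated above. Independently of it, the geometric scale defined at the top can be inspected directly; ensure_overlap_ratio(0.5) appears to validate that adjacent bands overlap by at least half a bandwidth:

import zounds

scale = zounds.GeometricScale(start_center_hz=300,
                              stop_center_hz=3040,
                              bandwidth_ratio=0.07496,
                              n_bands=64)
scale.ensure_overlap_ratio(0.5)

# each entry is a FrequencyBand whose center follows a geometric progression
for band in list(scale)[:3]:
    print(band.center_frequency, band.bandwidth)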
Example #7
import requests
import zounds
import numpy as np
from bot_helper import BinaryData, main, AnnotationListener
from log import module_logger
from stft_bot import windowing_sample_rate

logger = module_logger(__file__)

SAMPLE_RATE = zounds.SR11025()
frequency_band = zounds.FrequencyBand(20, SAMPLE_RATE.nyquist)
CHROMA_SCALE = zounds.ChromaScale(frequency_band)


class ChromaListener(AnnotationListener):
    def __init__(self, client, s3_client, page_size=3, logger=None):
        super().__init__('stft_bot',
                         client,
                         s3_client,
                         page_size,
                         logger=logger)
        self.dtype = np.float32().dtype

    def get_metadata(self):
        return {
            'type': str(self.dtype),
            'shape': ('variable', CHROMA_SCALE.n_bands),
            'dimensions': [{
                'type': 'time',
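ChromaScale folds all octaves onto the twelve pitch classes, which is where the fixed second entry in the metadata's shape comes from. A quick check:

import zounds

sr = zounds.SR11025()
band = zounds.FrequencyBand(20, sr.nyquist)
print(zounds.ChromaScale(band).n_bands)  # 12 pitch classes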
Example #8
"""
Demonstrate how to download and process sounds from https://freesound.org
"""

import zounds
import argparse

BaseModel = zounds.stft(resample_to=zounds.SR11025())


@zounds.simple_lmdb_settings('freesound', map_size=1e10, user_supplied_id=True)
class Sound(BaseModel):
    bark = zounds.ArrayWithUnitsFeature(zounds.BarkBands,
                                        needs=BaseModel.fft,
                                        store=True)

    chroma = zounds.ArrayWithUnitsFeature(zounds.Chroma,
                                          needs=BaseModel.fft,
                                          store=True)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--api-key',
        help='your Freesound API key (http://freesound.org/apiv2/apply/)',
        type=str,
        required=True)
    parser.add_argument('--query',
                        help='the text query to run against freesound',
                        type=str,
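The argument parsing is cut off above. Once sounds have been ingested, features are computed on demand and read back by id; a sketch using a synthesized tone in place of a Freesound download (the 'test-tone' id is illustrative, and the _id keyword assumes the user_supplied_id=True setting above):

import zounds

synth = zounds.SineSynthesizer(zounds.SR11025())
samples = synth.synthesize(zounds.Seconds(2), [220., 440.])

_id = Sound.process(meta=samples.encode(fmt='FLAC'), _id='test-tone')
snd = Sound(_id)
print(snd.bark.shape, snd.chroma.shape)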
Example #9
import zounds
from zounds.spectral import time_stretch, pitch_shift
from zounds.ui import AppSettings
import argparse

sr = zounds.SR11025()
BaseModel = zounds.stft(resample_to=sr, store_resampled=True)


@zounds.simple_in_memory_settings
class Sound(BaseModel):
    pass


if __name__ == '__main__':
    parser = argparse.ArgumentParser(parents=[AppSettings()])
    parser.add_argument(
        '--sound-uri',
        default='https://archive.org/download/LucaBrasi2/06-Kevin_Gates-Out_The_Mud_Prod_By_The_Runners_The_Monarch.ogg')
    args = parser.parse_args()

    _id = Sound.process(meta=args.sound_uri)
    snd = Sound(_id)

    original = snd.resampled
    slow = zounds.AudioSamples(time_stretch(original, 0.75).squeeze(), sr)
    fast = zounds.AudioSamples(time_stretch(original, 1.25).squeeze(), sr)

    higher = zounds.AudioSamples(pitch_shift(original, 1.0).squeeze(), sr)
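time_stretch changes duration without changing pitch (factors below 1.0 slow the audio down, as the variable names above indicate), and pitch_shift takes a shift in semitones, judging by the 1.0 passed for 'higher'. A standalone sketch on a synthesized tone:

import zounds
from zounds.spectral import time_stretch, pitch_shift

sr = zounds.SR11025()
synth = zounds.SineSynthesizer(sr)
tone = synth.synthesize(zounds.Seconds(2), [440.])

slower = zounds.AudioSamples(time_stretch(tone, 0.5).squeeze(), sr)  # ~twice as long
lower = zounds.AudioSamples(pitch_shift(tone, -1.0).squeeze(), sr)   # shifted down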
Example #10
import zounds

Resampled = zounds.resampled(resample_to=zounds.SR11025())


@zounds.simple_in_memory_settings
class Sound(Resampled):
    """
    A simple pipeline that computes a perceptually weighted modified discrete
    cosine transform, and "persists" feature data in an in-memory store.
    """

    windowed = zounds.ArrayWithUnitsFeature(
        zounds.SlidingWindow,
        needs=Resampled.resampled,
        wscheme=zounds.HalfLapped(),
        wfunc=zounds.OggVorbisWindowingFunc(),
        store=True)

    mdct = zounds.ArrayWithUnitsFeature(zounds.MDCT, needs=windowed)

    weighted = zounds.ArrayWithUnitsFeature(lambda x: x * zounds.AWeighting(),
                                            needs=mdct)


if __name__ == '__main__':

    # produce some audio to test our pipeline, and encode it as FLAC
    synth = zounds.SineSynthesizer(zounds.SR44100())
    samples = synth.synthesize(zounds.Seconds(5), [220., 440., 880.])
    encoded = samples.encode(fmt='FLAC')
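Presumably the script goes on to run the encoded FLAC through the pipeline; a sketch of that next step, continuing inside the __main__ block (snd.weighted is the A-weighted MDCT feature declared above):

    _id = Sound.process(meta=encoded)
    snd = Sound(_id)

    # A-weighted MDCT coefficients computed by the pipeline
    print(snd.weighted.shape)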
Example #11
    def __init__(self,
                 generator,
                 discriminator,
                 learning_rate,
                 feature_size,
                 audio_repr_class,
                 generator_loss,
                 discriminator_loss,
                 g_init=weights_init,
                 d_init=weights_init,
                 feature_funcs=None,
                 total_samples=16384,
                 feature_channels=256,
                 inference_sequence_factor=4,
                 samplerate=zounds.SR11025(),
                 sub_disc_loss=hinge_discriminator_loss,
                 sub_gen_loss=hinge_generator_loss):

        super().__init__()

        self.sub_gen_loss = sub_gen_loss
        self.sub_disc_loss = sub_disc_loss
        # how much longer than the test sequence should the inference
        # sequence be?
        self.inference_sequence_factor = inference_sequence_factor
        if feature_funcs is None:
            raise ValueError('You must provide feature funcs')

        self.discriminator_init = d_init
        self.generator_init = g_init

        if hasattr(generator, 'initialize_weights'):
            raise ValueError(
                'initialize_weights() method on generators is deprecated')

        if hasattr(discriminator, 'initialize_weights'):
            raise ValueError(
                'initialize_weights() method on discriminators is deprecated')

        self.__g = generator
        self.__g.apply(g_init)
        self.__g_optim = Adam(self.__g.parameters(),
                              lr=learning_rate,
                              betas=(0.5, 0.9))

        self.__d = discriminator
        self.__d.apply(d_init)
        self.__d_optim = Adam(self.__d.parameters(),
                              lr=learning_rate,
                              betas=(0.5, 0.9))

        self.__g_trainer = GeneratorTrainer(self.__g, self.__g_optim, self.__d,
                                            self.__d_optim, generator_loss,
                                            self.sub_gen_loss)

        self.__d_trainer = DiscriminatorTrainer(self.__g, self.__g_optim,
                                                self.__d, self.__d_optim,
                                                discriminator_loss,
                                                self.sub_disc_loss)

        self.__feature_size = feature_size
        self.__audio_repr_class = audio_repr_class

        self.__anchor_feature = 'spectrogram'
        self.__feature_funcs = feature_funcs

        self.training_steps = cycle(
            [self.discriminator_trainer, self.generator_trainer])

        self.samplerate = samplerate
        self.total_samples = total_samples
        self.feature_channels = feature_channels
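The training_steps attribute alternates indefinitely between the two trainers; cycle here is presumably itertools.cycle, so each next() call yields the other trainer. The pattern in isolation:

from itertools import cycle

steps = cycle(['discriminator_step', 'generator_step'])
for _ in range(4):
    print(next(steps))  # discriminator_step, generator_step, ...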
Example #12
import os
from random import choice

import numpy as np
import torch
import torch.nn.functional as F
import zounds
from torch import nn
from torch.optim import Adam

from featuresynth.data import TrainingData
from featuresynth.discriminator import Discriminator
from featuresynth.feature import \
    sr, total_samples, frequency_recomposition, feature_channels, band_sizes, \
    filter_banks, bandpass_filters, slices, compute_features
from featuresynth.generator import DDSPGenerator
from featuresynth.generator.ddsp import np_overlap_add
from featuresynth.util import device

sr = zounds.SR11025()
scale = zounds.MelScale(zounds.FrequencyBand(20, sr.nyquist), 128)
fb = zounds.learn.FilterBank(sr,
                             128,
                             scale,
                             np.linspace(0.25, 0.5, len(scale)),
                             normalize_filters=False,
                             a_weighting=False).to(device)
fb.filter_bank = fb.filter_bank / 10


def perceptual(x, window=512, log_mag=False):
    x = F.pad(x, (0, window // 2))
    x = torch.stft(x, window, window // 2, normalized=True)
    if log_mag:
        x = torch.log(1e-12 + torch.abs(x[:, :, 0]))
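The function is cut off above. For the torch.stft call, a standalone shape sketch with the same window and hop; return_complex=True is the modern spelling, while the x[:, :, 0] indexing above suggests an older torch that stacked real and imaginary parts in a trailing axis:

import torch
import torch.nn.functional as F

x = torch.randn(2, 16384)
x = F.pad(x, (0, 256))  # pad by half a window, as above
spec = torch.stft(x, n_fft=512, hop_length=256,
                  normalized=True, return_complex=True)
print(spec.shape)  # torch.Size([2, 257, 66])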