Esempio n. 1
0
    def __init__(self, samples, sizes, window=None):
        """
        Decompose `samples` into one band per entry in `sizes`.

        Bands are extracted in ascending size order.  After each band is
        extracted it is resampled back to ``self.size`` and subtracted from
        a running residual, so every later band only sees what the earlier
        bands left behind.  The results are stored in ``self.bands`` and the
        matching ``zounds.FrequencyBand`` instances in
        ``self.frequency_bands``.

        Args:
            samples: audio data supporting ``.copy()`` and in-place
                subtraction.  The object passed in is never modified.
            sizes: iterable of band lengths
            window: stored as ``self.window``; not used in this method

        NOTE(review): ``self.size`` and ``samplerate`` are read here but not
        defined in this method - presumably a property on the class and a
        module-level sample rate respectively; confirm.
        """
        self.window = window
        self.sizes = sorted(sizes)
        self.samples = samples

        # work on a copy so the caller's samples stay untouched
        residual = self.samples.copy()
        self.bands = []
        self.frequency_bands = []
        start_hz = 0

        # Iterate the *sorted* sizes: start_hz/stop_hz are chained from one
        # band to the next, which is only meaningful in ascending order.
        # This also keeps one-shot iterables working, since sorted() has
        # already consumed the original argument.
        for size in self.sizes:

            # extract a frequency band
            if size != self.size:
                band = self._resample(residual, size)
            else:
                # Copy, so the in-place subtraction below cannot alter the
                # band that was just stored.
                band = residual.copy()

            self.bands.append(band)
            residual -= self._resample(band, self.size)

            stop_hz = samplerate.nyquist * (size / self.size)
            self.frequency_bands.append(
                zounds.FrequencyBand(start_hz, stop_hz))
            start_hz = stop_hz
Esempio n. 2
0
 def make_filter_bank(cls, samplerate):
     """
     Build a 128-band, linearly spaced ``zounds.learn.FilterBank``.

     The bands span 20 Hz up to 20 Hz below ``samplerate.nyquist``, and each
     filter has 511 taps.

     Args:
         samplerate: object exposing ``nyquist``; also passed straight to
             the ``FilterBank`` constructor

     Returns:
         The newly constructed filter bank.
     """
     n_bands = 128
     n_taps = 511
     span = zounds.FrequencyBand(20, samplerate.nyquist - 20)
     linear_scale = zounds.LinearScale(span, n_bands)
     return zounds.learn.FilterBank(
         samplerate,
         n_taps,
         linear_scale,
         0.9,
         normalize_filters=True,
         a_weighting=False)
Esempio n. 3
0
    def __init__(self):
        """
        Configure the experiment: a ``ResidualStackFilterBankGenerator``
        paired with a ``FilterBankDiscriminator``, each with its own
        linear-scale filter bank, and forward everything to the base class
        initializer.
        """
        n_mels = 128
        feature_size = 32
        sr = zounds.SR22050()
        total_samples = 8192

        freq_band = zounds.FrequencyBand(20, sr.nyquist - 20)
        n_filters = 128
        filter_taps = 511

        def build_filter_bank():
            # A fresh scale and bank on every call, so the generator and the
            # discriminator never share a FilterBank instance.
            scale = zounds.LinearScale(freq_band, n_filters)
            return zounds.learn.FilterBank(sr,
                                           filter_taps,
                                           scale,
                                           0.9,
                                           normalize_filters=True,
                                           a_weighting=False)

        gen_filter_bank = build_filter_bank()
        disc_filter_bank = build_filter_bank()

        super().__init__(generator=ResidualStackFilterBankGenerator(
            gen_filter_bank,
            feature_size,
            total_samples,
            n_mels,
            add_weight_norm=True),
                         discriminator=FilterBankDiscriminator(
                             disc_filter_bank,
                             total_samples,
                             conditioning_channels=n_mels),
                         learning_rate=1e-4,
                         feature_size=feature_size,
                         audio_repr_class=RawAudio,
                         generator_loss=mel_gan_gen_loss,
                         sub_gen_loss=least_squares_generator_loss,
                         discriminator_loss=mel_gan_disc_loss,
                         sub_disc_loss=least_squares_disc_loss,
                         g_init=weights_init,
                         d_init=weights_init,
                         feature_funcs={
                             'audio': (audio, (sr, )),
                             'spectrogram': (spectrogram, (sr, ))
                         },
                         total_samples=total_samples,
                         feature_channels=n_mels,
                         samplerate=sr,
                         inference_sequence_factor=4)
Esempio n. 4
0
def generate_filter_banks(band_sizes):
    """
    Lazily yield a ``(filter_bank, bandpass)`` pair for every band size.

    Sizes are processed in ascending order.  For each one a sample rate is
    derived from the module-level ``sr``, scaled by
    ``largest_size / size``.  The frequency range of each bank starts where
    the previous one stopped (the first starts at 20) and ends at that
    band's nyquist, or 20 below nyquist for the largest band.

    Relies on the module-level names ``sr`` and ``device``.

    Args:
        band_sizes: iterable of band lengths; any number of bands is
            accepted

    Yields:
        tuple: ``(bank, bandpass)`` where ``bank`` is a
        ``zounds.learn.FilterBank`` moved to ``device`` and ``bandpass`` is
        a float32 torch tensor of FIR coefficients viewed as
        ``(1, 1, n_taps)``.
    """
    band_sizes = sorted(band_sizes)
    total_samples = band_sizes[-1]
    # Every band uses the same number of filters.  This was previously a
    # five-element list indexed by band position, which raised IndexError as
    # soon as more than five sizes were supplied.
    n_bands = 128

    n_taps = 256
    current_low_freq = 20

    for size in band_sizes:
        ratio = (total_samples / size)
        new_sr = zounds.SampleRate(sr.frequency * ratio, sr.duration * ratio)
        # integer form of the sample rate, used for the FIR design below
        sr_int = int(new_sr)

        if size == total_samples:
            freq_band = zounds.FrequencyBand(current_low_freq,
                                             new_sr.nyquist - 20)
        else:
            freq_band = zounds.FrequencyBand(current_low_freq, new_sr.nyquist)

        # FIR band-pass covering the upper half of this band's spectrum:
        # from a quarter of the sample rate to just under half of it
        bandpass = firwin(n_taps, [sr_int // 4, (sr_int // 2) - 1],
                          fs=sr_int,
                          pass_zero=False).astype(np.float32)
        bandpass = torch.from_numpy(bandpass).to(device).view(1, 1, n_taps)

        scale = zounds.GeometricScale(freq_band.start_hz, freq_band.stop_hz,
                                      0.05, n_bands)
        bank = zounds.learn.FilterBank(
            new_sr,
            n_taps,
            scale,
            # values close to zero get good frequency resolution.  Values close
            # to one get good time resolution
            0.25,
            normalize_filters=False,
            a_weighting=False).to(device)

        # the next band picks up where this one ended
        current_low_freq = freq_band.stop_hz
        yield bank, bandpass
Esempio n. 5
0
    def __init__(self):
        """
        Configure the experiment: a ``DDSPGenerator`` with ``n_osc``
        oscillators laid out on a mel scale, paired with a
        ``MultiScaleMultiResDiscriminator``, and forward everything to the
        base class initializer.
        """
        n_mels = 128
        samplerate = zounds.SR22050()
        feature_size = 32
        total_samples = 8192

        n_osc = 128
        scale = zounds.MelScale(
            zounds.FrequencyBand(20, samplerate.nyquist - 20), n_osc)

        # NOTE: a zounds.learn.FilterBank used to be constructed here but was
        # never passed to anything, so it has been dropped along with the
        # unused n_fft/hop locals.

        super().__init__(generator=DDSPGenerator(n_osc=n_osc,
                                                 input_size=feature_size,
                                                 in_channels=n_mels,
                                                 output_size=total_samples,
                                                 scale=scale,
                                                 samplerate=samplerate),
                         discriminator=MultiScaleMultiResDiscriminator(
                             total_samples,
                             flatten_multiscale_features=False,
                             decompose=True,
                             channel_judgements=True,
                             conditioning_channels=n_mels),
                         learning_rate=1e-4,
                         feature_size=feature_size,
                         audio_repr_class=RawAudio,
                         generator_loss=mel_gan_gen_loss,
                         sub_gen_loss=least_squares_generator_loss,
                         discriminator_loss=mel_gan_disc_loss,
                         sub_disc_loss=least_squares_disc_loss,
                         g_init=weights_init,
                         d_init=weights_init,
                         feature_funcs={
                             'audio': (audio, (samplerate, )),
                             'spectrogram': (spectrogram, (samplerate, ))
                         },
                         total_samples=total_samples,
                         feature_channels=n_mels,
                         inference_sequence_factor=4,
                         samplerate=samplerate)
def make_filter_banks(taps, bands, sr, size, full_size=8192):
    """
    Build one linear-scale ``zounds.learn.FilterBank`` per (tap, band) pair.

    ``taps`` and ``bands`` are walked in lockstep.  After every pair, `size`
    is halved (integer division) and `sr` is multiplied by two, so
    successive banks are keyed by successively smaller sizes.

    Args:
        taps: iterable of filter lengths, one per bank
        bands: iterable of band counts, one per bank
        sr: sample rate for the first bank; must expose ``nyquist`` and
            support ``sr * 2``
        size: size used as the dictionary key for the first bank
        full_size: the size whose bank should start at 0 instead of at
            half of nyquist.  Defaults to 8192, the value that used to be
            hard-coded.

    Returns:
        dict: maps each size to its filter bank.
    """
    out = {}
    for tap, band in zip(taps, bands):
        # only the full-size band reaches all the way down to zero; every
        # other band covers the upper half of its own spectrum
        start = 0 if size == full_size else sr.nyquist // 2
        stop = sr.nyquist
        fb = zounds.FrequencyBand(start, stop)
        out[size] = zounds.learn.FilterBank(sr,
                                            tap,
                                            zounds.LinearScale(fb, band),
                                            0.05,
                                            normalize_filters=True,
                                            a_weighting=False)
        print(size, sr, out[size].scale)
        size = size // 2
        sr = sr * 2

    return out
Esempio n. 7
0
# Sample rate every processed sound is resampled to (see resample_to below)
samplerate = zounds.SR22050()
# Base model class produced by zounds.stft; store_fft=True is requested
# because the script reads the `fft` feature further down
BaseModel = zounds.stft(resample_to=samplerate, store_fft=True)


# Concrete model; it adds nothing to BaseModel beyond the settings applied by
# the decorator
@zounds.simple_in_memory_settings
class Sound(BaseModel):
    pass


if __name__ == '__main__':
    # Process a single remote ogg file and load the resulting document
    url = 'https://ia802606.us.archive.org/9/items/AOC11B/onclassical_luisi_bach_partita_e-minor_bwv-830_3.ogg'
    _id = Sound.process(meta=url)
    snd = Sound(_id)

    # Three different scales over the same band: 50 up to nyquist
    band = zounds.FrequencyBand(50, samplerate.nyquist)
    bark_scale = zounds.BarkScale(band, 100)
    mel_scale = zounds.MelScale(band, 100)
    chroma_scale = zounds.ChromaScale(band)

    # Apply each scale to the stored fft.  The results are not used again in
    # this script, but they are handed to the app through globals()/locals()
    # below, so these names should not be renamed or removed casually.
    bark_bands = bark_scale.apply(snd.fft, zounds.HanningWindowingFunc())
    mel_bands = mel_scale.apply(snd.fft, zounds.HanningWindowingFunc())
    chroma_bands = chroma_scale.apply(snd.fft, zounds.HanningWindowingFunc())

    # Start the app with access to this module's namespace
    app = zounds.ZoundsApp(
        model=Sound,
        visualization_feature=Sound.fft,
        audio_feature=Sound.ogg,
        globals=globals(),
        locals=locals())
    # NOTE(review): 9999 is presumably the port to listen on - confirm
    app.start(9999)
Esempio n. 8
0
import requests
import zounds
import numpy as np
from bot_helper import BinaryData, main, AnnotationListener
from log import module_logger
from stft_bot import windowing_sample_rate

# Module-level logger, named after this file
logger = module_logger(__file__)

# Sample rate used to derive the upper edge of the chroma frequency band
SAMPLE_RATE = zounds.SR11025()
# Band covered by the chroma scale: 20 up to nyquist
frequency_band = zounds.FrequencyBand(20, SAMPLE_RATE.nyquist)
# Shared chroma scale; its n_bands is reported in ChromaListener.get_metadata
CHROMA_SCALE = zounds.ChromaScale(frequency_band)


class ChromaListener(AnnotationListener):
    def __init__(self, client, s3_client, page_size=3, logger=None):
        """
        Initialise the listener.

        Forwards 'stft_bot' together with `client`, `s3_client`,
        `page_size` and `logger` to the ``AnnotationListener`` initializer,
        then records the float32 dtype that ``get_metadata`` reports.
        """
        super().__init__(
            'stft_bot', client, s3_client, page_size, logger=logger)
        # same dtype object as np.float32().dtype, without building a scalar
        self.dtype = np.dtype(np.float32)

    def get_metadata(self):
        return {
            'type':
            str(self.dtype),
            'shape': ('variable', CHROMA_SCALE.n_bands),
            'dimensions': [{
                'type':
                'time',
Esempio n. 9
0
    # Start the app in a background thread.  It is handed this scope's
    # globals()/locals(), so the variable names below should be kept stable.
    # NOTE(review): 9999 is presumably the port to listen on - confirm
    app = zounds.ZoundsApp(globals=globals(), locals=locals())
    app.start_in_thread(9999)

    sr = zounds.SR22050()
    # synth = zounds.SineSynthesizer(sr)
    # samples = synth.synthesize(
    #     zounds.Seconds(2), [110, 220, 440, 880]).astype(np.float32)
    # Load the first .wav file yielded by iter_files as float32 samples
    file_path = next(iter_files('/hdd/LJSpeech-1.1', '*.wav'))
    samples = zounds.AudioSamples.from_file(file_path).astype(np.float32)

    # Build the STFT representation from the samples, with two leading
    # singleton axes added
    r = ComplextSTFT.from_audio(samples[None, None, :], sr)
    phase = r.phase
    # Overwrite the phase in place with uniform random values in [-pi, pi)
    phase[:] = np.random.uniform(-np.pi, np.pi, phase.shape)
    # NOTE(review): listen() presumably resynthesises audio from the
    # representation, now with randomised phase - confirm
    recon = r.listen()

    # 256-band mel filter bank with 1024-tap filters
    scale = zounds.MelScale(zounds.FrequencyBand(20, sr.nyquist - 20), 256)
    filter_bank = zounds.learn.FilterBank(sr,
                                          1024,
                                          scale,
                                          0.5,
                                          normalize_filters=False,
                                          a_weighting=False)

    # Filter the original audio, clip negative values to zero, transpose and
    # keep rows 1024 to 2047
    result = filter_bank.convolve(torch.from_numpy(samples)[None, :])
    spec = np.clip(result.data.cpu().numpy().squeeze(), 0, np.inf).T[1024:2048]

    # Same processing for the randomised-phase reconstruction, for comparison
    phase_result = filter_bank.convolve(torch.from_numpy(recon)[None, :])
    phase_spec = np.clip(phase_result.data.cpu().numpy().squeeze(), 0,
                         np.inf).T[1024:2048]
    # Block here so the process (and the app thread) stays alive
    input('Waiting...')
Esempio n. 10
0
 def _scale(self, samplerate, bands, zero_start=False):
     """
     Return a ``zounds.LinearScale`` with `bands` bands ending at nyquist.

     The scale starts at 0 when `zero_start` is truthy, otherwise at half
     of ``samplerate.nyquist``.
     """
     if zero_start:
         low = 0
     else:
         low = samplerate.nyquist / 2
     high = samplerate.nyquist
     span = zounds.FrequencyBand(low, high)
     return zounds.LinearScale(span, bands)
Esempio n. 11
0
from torch import nn
from featuresynth.data import TrainingData
from featuresynth.generator import DDSPGenerator
from featuresynth.generator.ddsp import np_overlap_add
from featuresynth.util import device
from featuresynth.feature import \
    sr, total_samples, frequency_recomposition, feature_channels, band_sizes, \
    filter_banks, bandpass_filters, slices, compute_features
import numpy as np
from torch.optim import Adam
from featuresynth.discriminator import Discriminator
import os
from random import choice

# NOTE(review): this rebinds `sr`, replacing the `sr` imported from
# featuresynth.feature above - confirm that is intended
sr = zounds.SR11025()
# 128-band mel scale from 20 up to nyquist
scale = zounds.MelScale(zounds.FrequencyBand(20, sr.nyquist), 128)
# Filter bank with 128-tap filters, moved to `device`.  The fourth argument
# is one value per band, ramping linearly from 0.25 to 0.5.
fb = zounds.learn.FilterBank(sr,
                             128,
                             scale,
                             np.linspace(0.25, 0.5, len(scale)),
                             normalize_filters=False,
                             a_weighting=False).to(device)
# Scale the filters down by a factor of ten
fb.filter_bank = fb.filter_bank / 10


def perceptual(x, window=512, log_mag=False):
    """
    Return a flattened short-time Fourier representation of `x`.

    `x` is right-padded with ``window // 2`` zeros and transformed with
    ``torch.stft`` using an FFT size of `window`, a hop of ``window // 2``
    and ``normalized=True``.  Real and imaginary parts are kept as a
    trailing axis of size two.

    Args:
        x: real-valued tensor accepted by ``torch.stft``
        window: FFT size; the hop is half of it
        log_mag: when true, return ``log(1e-12 + abs(.))`` of index 0 along
            the third axis instead of the full transform

    Returns:
        A 2-d tensor: the leading axis is kept and everything else is
        flattened.
    """
    x = F.pad(x, (0, window // 2))
    # Current PyTorch refuses to run stft on real input unless
    # return_complex is given.  Request the complex result and convert it
    # back to the (..., 2) real/imag layout the rest of this function (and
    # the older API) works with.
    x = torch.stft(
        x, window, window // 2, normalized=True, return_complex=True)
    x = torch.view_as_real(x)
    if log_mag:
        # NOTE(review): for batched (N, T) input, index 0 of the third axis
        # selects the first frame rather than the real component - confirm
        # this is the intended slice.  Kept as-is.
        x = torch.log(1e-12 + torch.abs(x[:, :, 0]))
    return x.contiguous().view(x.shape[0], -1)
def freq_band(start, stop):
    """Shorthand for building a ``zounds.FrequencyBand`` from `start` to `stop`."""
    band = zounds.FrequencyBand(start, stop)
    return band
Esempio n. 13
0
        return coeffs

    @classmethod
    def from_audio(cls, samples, samplerate):
        """
        Alternate constructor: build an instance from raw audio.

        The pipeline is ``batch_stft`` -> magnitude -> ``_embed`` -> swap the
        last two axes -> natural log -> ``_postprocess_coeffs``.

        Args:
            samples: audio accepted by ``cls.batch_stft``
            samplerate: passed unchanged to the constructor

        Returns:
            ``cls(coefficients, samplerate)``
        """
        magnitudes = np.abs(cls.batch_stft(samples))
        embedded = cls._embed(magnitudes).transpose((0, 2, 1))
        # the small offset keeps log() finite where the embedding is zero
        log_coeffs = np.log(embedded + 1e-12)
        return cls(cls._postprocess_coeffs(log_coeffs), samplerate)


# Sample rate and band count shared by the scales below
sr = zounds.SR11025()
n_bands = 256
# Two 256-band scales over the same range: 20 up to 20 below nyquist
mel_scale = zounds.MelScale(zounds.FrequencyBand(20, sr.nyquist - 20), n_bands)
geom_scale = zounds.GeometricScale(20, sr.nyquist - 20, 0.05, n_bands)
# Linear reference scale from 0 to nyquist with 513 bands.
# NOTE(review): 513 presumably matches the STFT bin count (n_fft // 2 + 1 for
# n_fft = 1024) - confirm against batch_stft
linear_scale = zounds.LinearScale(zounds.FrequencyBand(0, sr.nyquist), 513)
# Bases relating each scale to the linear scale, computed once at import
# time.  Note that _basis is a private zounds method.
mel_scale_basis = mel_scale._basis(linear_scale, zounds.HanningWindowingFunc())
geom_scale_basis = geom_scale._basis(linear_scale,
                                     zounds.HanningWindowingFunc())


class MelScalePhaseRecover(BasePhaseRecovery):
    """``BasePhaseRecovery`` variant whose ``basis`` is the mel-scale basis."""

    # class-level basis, computed once at module import
    basis = mel_scale_basis

    def __init__(self, data, samplerate):
        # pure pass-through to the base class initializer
        super().__init__(data, samplerate)


class GeometricScalePhaseRecover(BasePhaseRecovery):