Example #1
0
    def __init__(self):
        """Assemble the DDSP GAN experiment: build the generator,
        discriminator and training hyper-parameters, then defer to the
        base experiment class.
        """
        # Core audio / feature geometry.
        n_mels = 128
        n_fft = 1024
        hop = 256
        samplerate = zounds.SR22050()
        feature_size = 32
        total_samples = 8192

        # One oscillator per mel band, spanning 20 Hz up to just below Nyquist.
        n_osc = 128
        band = zounds.FrequencyBand(20, samplerate.nyquist - 20)
        scale = zounds.MelScale(band, n_osc)

        # NOTE(review): filter_bank (and n_fft / hop above) are constructed
        # but never passed on — presumably leftovers; kept to avoid any
        # behavior change.
        filter_bank = zounds.learn.FilterBank(
            samplerate, 511, scale, 0.9,
            normalize_filters=True, a_weighting=False)

        generator = DDSPGenerator(
            n_osc=n_osc,
            input_size=feature_size,
            in_channels=n_mels,
            output_size=total_samples,
            scale=scale,
            samplerate=samplerate)

        discriminator = MultiScaleMultiResDiscriminator(
            total_samples,
            flatten_multiscale_features=False,
            decompose=True,
            channel_judgements=True,
            conditioning_channels=n_mels)

        super().__init__(
            generator=generator,
            discriminator=discriminator,
            learning_rate=1e-4,
            feature_size=feature_size,
            audio_repr_class=RawAudio,
            generator_loss=mel_gan_gen_loss,
            sub_gen_loss=least_squares_generator_loss,
            discriminator_loss=mel_gan_disc_loss,
            sub_disc_loss=least_squares_disc_loss,
            g_init=weights_init,
            d_init=weights_init,
            feature_funcs={
                'audio': (audio, (samplerate,)),
                'spectrogram': (spectrogram, (samplerate,))
            },
            total_samples=total_samples,
            feature_channels=n_mels,
            inference_sequence_factor=4,
            samplerate=samplerate)
Example #2
0
    def __init__(self):
        """Build the embedding network: a mel filter-bank front end, four
        conv/pool stages, a bias-free linear funnel down to 8 dimensions,
        and a final 3-dimensional projection.
        """
        super(EmbeddingNetwork, self).__init__()

        frequency_channels = 128
        channels = frequency_channels

        sample_rate = zounds.SR11025()
        audible = zounds.FrequencyBand.audible_range(sample_rate)
        mel_scale = zounds.MelScale(audible, frequency_channels)
        # Mel-spaced filter bank mapping raw audio to a time-frequency
        # representation; scaling factors grow linearly across the bands.
        self.bank = zounds.learn.FilterBank(
            samplerate=sample_rate,
            kernel_size=512,
            scale=mel_scale,
            scaling_factors=np.linspace(0.1, 1.0, len(mel_scale)),
            normalize_filters=True,
            a_weighting=True)

        # Four identical conv + max-pool stages; only the first conv's
        # input channel count (1) differs.
        stages = []
        in_channels = 1
        for _ in range(4):
            stages.append(nn.Conv2d(
                in_channels, channels, (13, 3), padding=(7, 1), bias=False))
            stages.append(nn.MaxPool2d((2, 2), (2, 2), padding=(1, 1)))
            in_channels = channels
        self.main = nn.Sequential(*stages)

        # Bias-free linear funnel: 128 -> 64 -> 32 -> 16 -> 8.
        widths = (128, 64, 32, 16, 8)
        self.final = nn.Sequential(*(
            nn.Linear(w_in, w_out, bias=False)
            for w_in, w_out in zip(widths, widths[1:])))

        self.linear = nn.Linear(8, 3, bias=False)
Example #3
0
# Module-wide sample rate and an STFT processing graph: incoming audio is
# resampled to `samplerate` and its FFT feature is persisted.
samplerate = zounds.SR22050()
BaseModel = zounds.stft(resample_to=samplerate, store_fft=True)


@zounds.simple_in_memory_settings
class Sound(BaseModel):
    """In-memory sound model exposing the STFT features defined by BaseModel."""
    pass


if __name__ == '__main__':
    # Ingest a remote recording (Bach partita) into the in-memory Sound model.
    url = 'https://ia802606.us.archive.org/9/items/AOC11B/onclassical_luisi_bach_partita_e-minor_bwv-830_3.ogg'
    _id = Sound.process(meta=url)
    snd = Sound(_id)

    # Three alternative frequency scales over the same 50 Hz..Nyquist band.
    band = zounds.FrequencyBand(50, samplerate.nyquist)
    bark_scale = zounds.BarkScale(band, 100)
    mel_scale = zounds.MelScale(band, 100)
    chroma_scale = zounds.ChromaScale(band)

    # Project the stored FFT frames onto each scale with a Hanning window.
    bark_bands = bark_scale.apply(snd.fft, zounds.HanningWindowingFunc())
    mel_bands = mel_scale.apply(snd.fft, zounds.HanningWindowingFunc())
    chroma_bands = chroma_scale.apply(snd.fft, zounds.HanningWindowingFunc())

    # Interactive in-browser explorer for the model's features.
    app = zounds.ZoundsApp(
        model=Sound,
        visualization_feature=Sound.fft,
        audio_feature=Sound.ogg,
        globals=globals(),
        locals=locals())
    app.start(9999)
    # NOTE(review): app.start(9999) above blocks, so this second app on the
    # same port looks like two separate snippets pasted together — confirm.
    app = zounds.ZoundsApp(globals=globals(), locals=locals())
    app.start_in_thread(9999)

    sr = zounds.SR22050()
    # synth = zounds.SineSynthesizer(sr)
    # samples = synth.synthesize(
    #     zounds.Seconds(2), [110, 220, 440, 880]).astype(np.float32)
    # Load the first LJSpeech wav file found on disk.
    file_path = next(iter_files('/hdd/LJSpeech-1.1', '*.wav'))
    samples = zounds.AudioSamples.from_file(file_path).astype(np.float32)

    # Randomize the phase of the complex STFT and resynthesize, to hear how
    # much of the signal survives with magnitude information only.
    r = ComplextSTFT.from_audio(samples[None, None, :], sr)
    phase = r.phase
    phase[:] = np.random.uniform(-np.pi, np.pi, phase.shape)
    recon = r.listen()

    # Mel filter bank spanning 20 Hz to just under Nyquist, 256 bands.
    scale = zounds.MelScale(zounds.FrequencyBand(20, sr.nyquist - 20), 256)
    filter_bank = zounds.learn.FilterBank(sr,
                                          1024,
                                          scale,
                                          0.5,
                                          normalize_filters=False,
                                          a_weighting=False)

    # Half-wave-rectified spectrogram slices (frames 1024..2048) for the
    # original samples and for the phase-randomized reconstruction.
    result = filter_bank.convolve(torch.from_numpy(samples)[None, :])
    spec = np.clip(result.data.cpu().numpy().squeeze(), 0, np.inf).T[1024:2048]

    phase_result = filter_bank.convolve(torch.from_numpy(recon)[None, :])
    phase_spec = np.clip(phase_result.data.cpu().numpy().squeeze(), 0,
                         np.inf).T[1024:2048]
    # Keep the process (and the in-browser app) alive until Enter is pressed.
    input('Waiting...')
from torch import nn
from featuresynth.data import TrainingData
from featuresynth.generator import DDSPGenerator
from featuresynth.generator.ddsp import np_overlap_add
from featuresynth.util import device
from featuresynth.feature import \
    sr, total_samples, frequency_recomposition, feature_channels, band_sizes, \
    filter_banks, bandpass_filters, slices, compute_features
import numpy as np
from torch.optim import Adam
from featuresynth.discriminator import Discriminator
import os
from random import choice

# Module-level mel filter bank (128 bands, 20 Hz..Nyquist) moved to the
# training device and shared by the code below.
sr = zounds.SR11025()
scale = zounds.MelScale(zounds.FrequencyBand(20, sr.nyquist), 128)
fb = zounds.learn.FilterBank(sr,
                             128,
                             scale,
                             np.linspace(0.25, 0.5, len(scale)),
                             normalize_filters=False,
                             a_weighting=False).to(device)
# Attenuate all filters by a factor of 10 — presumably to keep activations
# in a workable range; no comment in the original explains the constant.
fb.filter_bank = fb.filter_bank / 10


def perceptual(x, window=512, log_mag=False):
    """Project a batch of audio signals into an STFT feature space.

    Args:
        x: 2D tensor of audio, shape (batch, samples).
        window: FFT size; the hop length is window // 2.
        log_mag: when True, return log-compressed absolute values of the
            first-frame coefficients (see NOTE below); otherwise the raw
            real/imaginary STFT coefficients.

    Returns:
        2D tensor (batch, features): the STFT output flattened per item.
    """
    # Pad so the trailing partial frame is still analyzed.
    x = F.pad(x, (0, window // 2))
    # Fix: torch.stft without return_complex is rejected by modern torch.
    # return_complex=True followed by view_as_real reproduces the legacy
    # real-valued (batch, freq, frames, 2) layout exactly.
    x = torch.view_as_real(
        torch.stft(x, window, window // 2, normalized=True,
                   return_complex=True))
    if log_mag:
        # NOTE(review): x[:, :, 0] indexes dim 2 (frames), selecting the
        # FIRST FRAME's real/imag pairs — a magnitude over all frames
        # (torch.abs of the complex STFT, or x[..., 0] for the real part)
        # was probably intended. Kept as-is to preserve behavior; confirm.
        x = torch.log(1e-12 + torch.abs(x[:, :, 0]))
    return x.contiguous().view(x.shape[0], -1)
import requests
import zounds
from io import BytesIO
import numpy as np
from bot_helper import BinaryData, main, SoundListener
from log import module_logger

# Module-level logger for this listener module.
logger = module_logger(__file__)

# Spectral analysis configuration shared by SpectrogramListener below:
# a 512-band mel scale over 20 Hz..Nyquist at 11025 Hz.
N_FREQUENCY_BANDS = 512
SAMPLE_RATE = zounds.SR11025()
frequency_band = zounds.FrequencyBand(20, SAMPLE_RATE.nyquist)
scale = zounds.MelScale(frequency_band, N_FREQUENCY_BANDS)
FILTER_BANK_KERNEL_SIZE = 512
# Morlet wavelet filter bank over the mel scale; scaling factors grow
# linearly from 0.1 to 1.0 across the bands.
FILTER_BANK = zounds.spectral.morlet_filter_bank(SAMPLE_RATE,
                                                 FILTER_BANK_KERNEL_SIZE,
                                                 scale,
                                                 scaling_factor=np.linspace(
                                                     0.1, 1.0, len(scale)),
                                                 normalize=True)
# Apply perceptual A-weighting, then materialize as a plain ndarray.
FILTER_BANK *= zounds.AWeighting()
FILTER_BANK = np.array(FILTER_BANK)


class SpectrogramListener(SoundListener):
    """Sound listener that normalizes incoming audio to mono at the
    module-wide SAMPLE_RATE — presumably for spectrogram computation with
    the module-level FILTER_BANK; the processing body below appears
    truncated in this excerpt, so the rest is unverified.
    """

    def __init__(self, client, s3_client, page_size=3, logger=None):
        # Pure pass-through to the SoundListener base class.
        super().__init__(client, s3_client, page_size, logger)

    def _process_samples(self, samples):
        # Collapse to a single channel, then resample to SAMPLE_RATE.
        samples = samples.mono
        samples = zounds.soundfile.resample(samples, SAMPLE_RATE)
Example #7
0
        return coeffs

    @classmethod
    def from_audio(cls, samples, samplerate):
        """Alternate constructor: encode raw audio into log-compressed,
        embedded spectral coefficients and wrap them in a new instance.
        """
        spectral = cls.batch_stft(samples)
        magnitudes = np.abs(spectral)
        embedded = cls._embed(magnitudes)
        # Swap the last two axes: (0, 1, 2) -> (0, 2, 1).
        transposed = embedded.transpose((0, 2, 1))
        # Log compression with a small epsilon to avoid log(0).
        compressed = np.log(transposed + 1e-12)
        return cls(cls._postprocess_coeffs(compressed), samplerate)


# Shared spectral bases for the phase-recovery classes below: 256-band mel
# and geometric scales projected from a 513-bin linear (FFT) scale.
sr = zounds.SR11025()
n_bands = 256
mel_scale = zounds.MelScale(zounds.FrequencyBand(20, sr.nyquist - 20), n_bands)
geom_scale = zounds.GeometricScale(20, sr.nyquist - 20, 0.05, n_bands)
linear_scale = zounds.LinearScale(zounds.FrequencyBand(0, sr.nyquist), 513)
# NOTE(review): _basis is a private zounds API — verify it remains available
# in the pinned zounds version.
mel_scale_basis = mel_scale._basis(linear_scale, zounds.HanningWindowingFunc())
geom_scale_basis = geom_scale._basis(linear_scale,
                                     zounds.HanningWindowingFunc())


class MelScalePhaseRecover(BasePhaseRecovery):
    """Phase recovery specialized with the mel-scale spectral basis."""

    # Class-level basis consumed by BasePhaseRecovery machinery.
    basis = mel_scale_basis

    def __init__(self, data, samplerate):
        # Pure pass-through; only the class-level basis differs from the base.
        super().__init__(data, samplerate)


class GeometricScalePhaseRecover(BasePhaseRecovery):