def __init__(self):
    """Wire up a DDSP experiment: generator, multi-scale discriminator,
    losses and feature-extraction callables, then defer to the base class.

    NOTE(review): reconstructed from a collapsed one-liner; an execution
    trace is identical to the original, only the layout/naming differs.
    """
    # Core signal/feature parameters.
    n_mels = 128
    n_fft = 1024  # NOTE(review): not referenced again in this block
    hop = 256     # NOTE(review): not referenced again in this block
    samplerate = zounds.SR22050()
    feature_size = 32
    total_samples = 8192
    n_osc = 128

    # Mel-spaced oscillator scale covering the band 20 Hz .. nyquist-20.
    scale = zounds.MelScale(
        zounds.FrequencyBand(20, samplerate.nyquist - 20), n_osc)

    # NOTE(review): constructed but never passed to super().__init__ —
    # kept for parity with the original; confirm whether it is needed.
    filter_bank = zounds.learn.FilterBank(
        samplerate, 511, scale, 0.9,
        normalize_filters=True,
        a_weighting=False)

    generator = DDSPGenerator(
        n_osc=n_osc,
        input_size=feature_size,
        in_channels=n_mels,
        output_size=total_samples,
        scale=scale,
        samplerate=samplerate)

    discriminator = MultiScaleMultiResDiscriminator(
        total_samples,
        flatten_multiscale_features=False,
        decompose=True,
        channel_judgements=True,
        conditioning_channels=n_mels)

    # Callables (plus their argument tuples) used to derive training
    # features from raw audio.
    feature_funcs = {
        'audio': (audio, (samplerate,)),
        'spectrogram': (spectrogram, (samplerate,)),
    }

    super().__init__(
        generator=generator,
        discriminator=discriminator,
        learning_rate=1e-4,
        feature_size=feature_size,
        audio_repr_class=RawAudio,
        generator_loss=mel_gan_gen_loss,
        sub_gen_loss=least_squares_generator_loss,
        discriminator_loss=mel_gan_disc_loss,
        sub_disc_loss=least_squares_disc_loss,
        g_init=weights_init,
        d_init=weights_init,
        feature_funcs=feature_funcs,
        total_samples=total_samples,
        feature_channels=n_mels,
        inference_sequence_factor=4,
        samplerate=samplerate)
def __init__(self):
    """Build the embedding network: a zounds filter bank front-end, a
    four-stage 2-D conv/pool trunk, and a linear reduction to 3 dims.

    NOTE(review): reconstructed from a collapsed one-liner; the modules
    are registered in exactly the same order as the original.
    """
    super(EmbeddingNetwork, self).__init__()

    frequency_channels = 128
    channels = frequency_channels

    # Mel filter bank spanning the audible range at 11025 Hz.
    sr = zounds.SR11025()
    interval = zounds.FrequencyBand.audible_range(sr)
    scale = zounds.MelScale(interval, frequency_channels)
    self.bank = zounds.learn.FilterBank(
        samplerate=sr,
        kernel_size=512,
        scale=scale,
        scaling_factors=np.linspace(0.1, 1.0, len(scale)),
        normalize_filters=True,
        a_weighting=True)

    def _stage(in_channels):
        # One tall (13x3) conv followed by 2x2 max pooling.
        # NOTE(review): padding (7, 1) with a height-13 kernel grows the
        # height by 2 per conv — presumably intentional; confirm.
        return [
            nn.Conv2d(
                in_channels, channels, (13, 3), padding=(7, 1), bias=False),
            nn.MaxPool2d((2, 2), (2, 2), padding=(1, 1)),
        ]

    self.main = nn.Sequential(
        *(_stage(1) + _stage(channels) + _stage(channels) + _stage(channels)))

    # NOTE(review): no nonlinearities between these linear layers, so they
    # compose to a single linear map — kept as in the original.
    dims = [128, 64, 32, 16, 8]
    self.final = nn.Sequential(
        *[nn.Linear(i, o, bias=False) for i, o in zip(dims, dims[1:])])
    self.linear = nn.Linear(8, 3, bias=False)
samplerate = zounds.SR22050()

# Base processing graph: resample incoming audio to 22050 Hz and store the
# raw FFT frames so the scales below can be applied to them.
BaseModel = zounds.stft(resample_to=samplerate, store_fft=True)


@zounds.simple_in_memory_settings
class Sound(BaseModel):
    pass


if __name__ == '__main__':
    # Ingest a piece of audio from the Internet Archive and look it up again
    # by the id returned from processing.
    url = 'https://ia802606.us.archive.org/9/items/AOC11B/onclassical_luisi_bach_partita_e-minor_bwv-830_3.ogg'
    _id = Sound.process(meta=url)
    snd = Sound(_id)

    # Three different frequency scales over the same band (50 Hz .. nyquist).
    band = zounds.FrequencyBand(50, samplerate.nyquist)
    bark_scale = zounds.BarkScale(band, 100)
    mel_scale = zounds.MelScale(band, 100)
    chroma_scale = zounds.ChromaScale(band)

    # Apply each scale to the stored FFT frames with a Hanning window.
    bark_bands = bark_scale.apply(snd.fft, zounds.HanningWindowingFunc())
    mel_bands = mel_scale.apply(snd.fft, zounds.HanningWindowingFunc())
    chroma_bands = chroma_scale.apply(snd.fft, zounds.HanningWindowingFunc())

    # Interactive app: the module-level names above are exposed to the
    # in-browser REPL via globals()/locals(), so they must keep their names.
    app = zounds.ZoundsApp(
        model=Sound,
        visualization_feature=Sound.fft,
        audio_feature=Sound.ogg,
        globals=globals(),
        locals=locals())
    app.start(9999)
# Interactive exploration script: module-level names are exposed to the
# in-browser REPL via globals()/locals(), so they must keep their names.
app = zounds.ZoundsApp(globals=globals(), locals=locals())
app.start_in_thread(9999)

sr = zounds.SR22050()

# Alternative synthetic input, kept for reference:
# synth = zounds.SineSynthesizer(sr)
# samples = synth.synthesize(
#     zounds.Seconds(2), [110, 220, 440, 880]).astype(np.float32)

# Load the first LJSpeech wav found on disk.
file_path = next(iter_files('/hdd/LJSpeech-1.1', '*.wav'))
samples = zounds.AudioSamples.from_file(file_path).astype(np.float32)

# NOTE(review): "ComplextSTFT" looks like a typo for "ComplexSTFT", but it
# must match the project-defined class name — confirm before renaming.
r = ComplextSTFT.from_audio(samples[None, None, :], sr)

# Destroy the phase information, then resynthesize to hear the effect.
phase = r.phase
phase[:] = np.random.uniform(-np.pi, np.pi, phase.shape)
recon = r.listen()

# Mel filter bank used to render spectrograms of the original and the
# phase-randomized reconstruction (a 1024-frame slice of each).
scale = zounds.MelScale(zounds.FrequencyBand(20, sr.nyquist - 20), 256)
filter_bank = zounds.learn.FilterBank(
    sr, 1024, scale, 0.5, normalize_filters=False, a_weighting=False)

result = filter_bank.convolve(torch.from_numpy(samples)[None, :])
spec = np.clip(result.data.cpu().numpy().squeeze(), 0, np.inf).T[1024:2048]

phase_result = filter_bank.convolve(torch.from_numpy(recon)[None, :])
phase_spec = np.clip(
    phase_result.data.cpu().numpy().squeeze(), 0, np.inf).T[1024:2048]

# Keep the process (and the app thread) alive for interactive inspection.
input('Waiting...')
import os
from random import choice

import numpy as np
import torch  # NOTE(review): used below (torch.stft) but was never imported in this chunk
import torch.nn.functional as F  # NOTE(review): F was used below but never imported
import zounds  # NOTE(review): used below but never imported in this chunk — confirm
from torch import nn
from torch.optim import Adam

from featuresynth.data import TrainingData
from featuresynth.discriminator import Discriminator
from featuresynth.feature import \
    sr, total_samples, frequency_recomposition, feature_channels, band_sizes, \
    filter_banks, bandpass_filters, slices, compute_features
from featuresynth.generator import DDSPGenerator
from featuresynth.generator.ddsp import np_overlap_add
from featuresynth.util import device


# NOTE(review): this rebinding shadows the `sr` imported from
# featuresynth.feature above — confirm the override is intentional.
sr = zounds.SR11025()
scale = zounds.MelScale(zounds.FrequencyBand(20, sr.nyquist), 128)
fb = zounds.learn.FilterBank(
    sr, 128, scale,
    np.linspace(0.25, 0.5, len(scale)),
    normalize_filters=False,
    a_weighting=False).to(device)
# Attenuate the bank's filters in place.
fb.filter_bank = fb.filter_bank / 10


def perceptual(x, window=512, log_mag=False):
    """Flatten an STFT of *x* into one feature vector per batch item.

    Parameters
    ----------
    x : torch.Tensor
        Batch of signals; assumed shape (batch, time) — TODO confirm.
    window : int
        STFT window size; hop is window // 2.
    log_mag : bool
        When True, reduce to a log-magnitude slice before flattening.

    Returns
    -------
    torch.Tensor of shape (batch, -1).
    """
    x = F.pad(x, (0, window // 2))
    # return_complex=False keeps the legacy (..., 2) real/imag layout that
    # the indexing below relies on; modern torch raises without the flag.
    x = torch.stft(x, window, window // 2, normalized=True,
                   return_complex=False)
    if log_mag:
        # NOTE(review): x[:, :, 0] indexes the third axis of the
        # (batch, freq, frames, 2) output — i.e. the first frame, not the
        # real component. Preserved as-is; confirm the original intent.
        x = torch.log(1e-12 + torch.abs(x[:, :, 0]))
    return x.contiguous().view(x.shape[0], -1)
import requests
import zounds
from io import BytesIO
import numpy as np
from bot_helper import BinaryData, main, SoundListener
from log import module_logger

logger = module_logger(__file__)

N_FREQUENCY_BANDS = 512
SAMPLE_RATE = zounds.SR11025()

# Mel-spaced scale covering 20 Hz up to Nyquist at 11025 Hz.
frequency_band = zounds.FrequencyBand(20, SAMPLE_RATE.nyquist)
scale = zounds.MelScale(frequency_band, N_FREQUENCY_BANDS)

FILTER_BANK_KERNEL_SIZE = 512
FILTER_BANK = zounds.spectral.morlet_filter_bank(
    SAMPLE_RATE,
    FILTER_BANK_KERNEL_SIZE,
    scale,
    scaling_factor=np.linspace(
        0.1, 1.0, len(scale)),
    normalize=True)
# Apply A-weighting to the bank, then materialize it as a plain ndarray.
FILTER_BANK *= zounds.AWeighting()
FILTER_BANK = np.array(FILTER_BANK)


class SpectrogramListener(SoundListener):
    # Consumes sounds from the client and (presumably) renders spectrograms
    # with FILTER_BANK above — the processing tail is outside this view.

    def __init__(self, client, s3_client, page_size=3, logger=None):
        # NOTE(review): the `logger` parameter shadows the module-level
        # logger inside this method — confirm that is intended.
        super().__init__(client, s3_client, page_size, logger)

    def _process_samples(self, samples):
        # Collapse to mono and resample to the module's fixed sample rate.
        # NOTE(review): method truncated at this chunk boundary; the
        # remainder continues beyond the visible source.
        samples = samples.mono
        samples = zounds.soundfile.resample(samples, SAMPLE_RATE)
        # NOTE(review): tail of a method whose definition starts before this
        # chunk; indentation below is a best-guess reconstruction.
        return coeffs

    @classmethod
    def from_audio(cls, samples, samplerate):
        """Alternate constructor: build an instance from raw audio.

        Computes a batched STFT, embeds the magnitudes, log-compresses and
        post-processes the coefficients, then wraps them in a new instance.
        """
        coeffs = cls.batch_stft(samples)
        mag = np.abs(coeffs)
        coeffs = cls._embed(mag)
        # Swap the last two axes — presumably (batch, time, channels) ->
        # (batch, channels, time); TODO confirm against batch_stft/_embed.
        coeffs = coeffs.transpose((0, 2, 1))
        # Log-compress; the epsilon guards against log(0).
        coeffs = np.log(coeffs + 1e-12)
        coeffs = cls._postprocess_coeffs(coeffs)
        return cls(coeffs, samplerate)


sr = zounds.SR11025()
n_bands = 256

# Two competing analysis scales over (20 Hz .. nyquist-20), plus the linear
# scale their bases are computed against.
mel_scale = zounds.MelScale(zounds.FrequencyBand(20, sr.nyquist - 20), n_bands)
geom_scale = zounds.GeometricScale(20, sr.nyquist - 20, 0.05, n_bands)
linear_scale = zounds.LinearScale(zounds.FrequencyBand(0, sr.nyquist), 513)

# NOTE(review): _basis is a private zounds API — confirm there is no public
# equivalent before relying on it further.
mel_scale_basis = mel_scale._basis(linear_scale, zounds.HanningWindowingFunc())
geom_scale_basis = geom_scale._basis(linear_scale, zounds.HanningWindowingFunc())


class MelScalePhaseRecover(BasePhaseRecovery):
    # Phase recovery specialized with the mel-scale basis.
    basis = mel_scale_basis

    def __init__(self, data, samplerate):
        super().__init__(data, samplerate)


# NOTE(review): class body truncated at this chunk boundary; it continues
# beyond the visible source.
class GeometricScalePhaseRecover(BasePhaseRecovery):