def __init__(self, samples, sizes, window=None):
    self.window = window
    self.sizes = sorted(sizes)
    self.samples = samples

    original = self.samples.copy()
    self.bands = []
    self.frequency_bands = []
    start_hz = 0

    # iterate the sorted sizes so each pass peels off the next
    # frequency band in ascending order
    for size in self.sizes:
        # extract a frequency band
        if size != self.size:
            s = self._resample(original, size)
        else:
            s = original
        self.bands.append(s)

        # subtract this band's contribution from the residual
        original -= self._resample(s, self.size)

        # `samplerate` is assumed to come from the enclosing module
        stop_hz = samplerate.nyquist * (size / self.size)
        self.frequency_bands.append(
            zounds.FrequencyBand(start_hz, stop_hz))
        start_hz = stop_hz
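# A minimal sketch (not from the original source) of the band bookkeeping
# above, assuming the largest size is 8192 samples at zounds.SR22050()
# (nyquist = 11025 Hz): each ascending band size claims the frequency range
# between the previous stop and its own scaled nyquist.
import zounds

sr = zounds.SR22050()
total = 8192
start_hz = 0
for size in [1024, 2048, 4096, 8192]:
    stop_hz = sr.nyquist * (size / total)
    print(size, zounds.FrequencyBand(start_hz, stop_hz))
    start_hz = stop_hz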
@classmethod
def make_filter_bank(cls, samplerate):
    scale = zounds.LinearScale(
        zounds.FrequencyBand(20, samplerate.nyquist - 20), 128)
    filter_bank = zounds.learn.FilterBank(
        samplerate,
        511,
        scale,
        0.9,
        normalize_filters=True,
        a_weighting=False)
    return filter_bank
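# Hedged sketch (not from the source): the same bank built inline and applied
# to a test tone, mirroring the FilterBank.convolve usage found elsewhere in
# this codebase.
import numpy as np
import torch
import zounds

sr = zounds.SR22050()
scale = zounds.LinearScale(zounds.FrequencyBand(20, sr.nyquist - 20), 128)
bank = zounds.learn.FilterBank(
    sr, 511, scale, 0.9, normalize_filters=True, a_weighting=False)

synth = zounds.SineSynthesizer(sr)
samples = synth.synthesize(zounds.Seconds(1), [220, 440]).astype(np.float32)
spec = bank.convolve(torch.from_numpy(samples)[None, :])
print(spec.shape)  # expected: (1, 128, ~n_samples), one channel per filter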
def __init__(self):
    n_mels = 128
    feature_size = 32
    sr = zounds.SR22050()
    n_fft = 1024
    hop = 256
    total_samples = 8192

    freq_band = zounds.FrequencyBand(20, sr.nyquist - 20)
    n_filters = 128
    filter_taps = 511

    gen_scale = zounds.LinearScale(freq_band, n_filters)
    gen_filter_bank = zounds.learn.FilterBank(
        sr, filter_taps, gen_scale, 0.9,
        normalize_filters=True, a_weighting=False)

    disc_scale = zounds.LinearScale(freq_band, n_filters)
    disc_filter_bank = zounds.learn.FilterBank(
        sr, filter_taps, disc_scale, 0.9,
        normalize_filters=True, a_weighting=False)

    super().__init__(
        generator=ResidualStackFilterBankGenerator(
            gen_filter_bank,
            feature_size,
            total_samples,
            n_mels,
            add_weight_norm=True),
        discriminator=FilterBankDiscriminator(
            disc_filter_bank,
            total_samples,
            conditioning_channels=n_mels),
        learning_rate=1e-4,
        feature_size=feature_size,
        audio_repr_class=RawAudio,
        generator_loss=mel_gan_gen_loss,
        sub_gen_loss=least_squares_generator_loss,
        discriminator_loss=mel_gan_disc_loss,
        sub_disc_loss=least_squares_disc_loss,
        g_init=weights_init,
        d_init=weights_init,
        feature_funcs={
            'audio': (audio, (sr,)),
            'spectrogram': (spectrogram, (sr,))
        },
        total_samples=total_samples,
        feature_channels=n_mels,
        samplerate=sr,
        inference_sequence_factor=4)
def generate_filter_banks(band_sizes):
    band_sizes = sorted(band_sizes)
    total_samples = band_sizes[-1]
    n_bands = [128] * 5
    n_taps = 256
    current_low_freq = 20

    for i, size in enumerate(band_sizes):
        ratio = total_samples / size
        new_sr = zounds.SampleRate(
            sr.frequency * ratio, sr.duration * ratio)

        if size == total_samples:
            freq_band = zounds.FrequencyBand(
                current_low_freq, new_sr.nyquist - 20)
        else:
            freq_band = zounds.FrequencyBand(
                current_low_freq, new_sr.nyquist)

        bandpass = firwin(
            n_taps,
            [int(new_sr) // 4, (int(new_sr) // 2) - 1],
            fs=int(new_sr),
            pass_zero=False).astype(np.float32)
        bandpass = torch.from_numpy(bandpass).to(device).view(1, 1, n_taps)

        scale = zounds.GeometricScale(
            freq_band.start_hz, freq_band.stop_hz, 0.05, n_bands[i])
        bank = zounds.learn.FilterBank(
            new_sr,
            n_taps,
            scale,
            # values close to zero get good frequency resolution; values
            # close to one get good time resolution
            0.25,
            normalize_filters=False,
            a_weighting=False).to(device)

        current_low_freq = freq_band.stop_hz
        yield bank, bandpass
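# Hedged consumption sketch (not from the source): generate_filter_banks
# reads module-level `sr` and `device`, assumed here to be something like
# sr = zounds.SR11025() and device = torch.device('cpu'). A white-noise
# probe is band-limited with the FIR kernel, then run through the bank.
import torch
import torch.nn.functional as F

for bank, bandpass in generate_filter_banks([1024, 2048, 4096, 8192]):
    noise = torch.randn(1, 1, 4096, device=bandpass.device)
    band_limited = F.conv1d(
        noise, bandpass, padding=bandpass.shape[-1] // 2)
    spec = bank.convolve(band_limited.view(1, -1))
    print(bank.scale, spec.shape)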
def __init__(self):
    n_mels = 128
    n_fft = 1024
    hop = 256
    samplerate = zounds.SR22050()
    feature_size = 32
    total_samples = 8192
    n_osc = 128

    scale = zounds.MelScale(
        zounds.FrequencyBand(20, samplerate.nyquist - 20), n_osc)
    # constructed here, though only `scale` is passed to the generator below
    filter_bank = zounds.learn.FilterBank(
        samplerate, 511, scale, 0.9,
        normalize_filters=True, a_weighting=False)

    super().__init__(
        generator=DDSPGenerator(
            n_osc=n_osc,
            input_size=feature_size,
            in_channels=n_mels,
            output_size=total_samples,
            scale=scale,
            samplerate=samplerate),
        discriminator=MultiScaleMultiResDiscriminator(
            total_samples,
            flatten_multiscale_features=False,
            decompose=True,
            channel_judgements=True,
            conditioning_channels=n_mels),
        learning_rate=1e-4,
        feature_size=feature_size,
        audio_repr_class=RawAudio,
        generator_loss=mel_gan_gen_loss,
        sub_gen_loss=least_squares_generator_loss,
        discriminator_loss=mel_gan_disc_loss,
        sub_disc_loss=least_squares_disc_loss,
        g_init=weights_init,
        d_init=weights_init,
        feature_funcs={
            'audio': (audio, (samplerate,)),
            'spectrogram': (spectrogram, (samplerate,))
        },
        total_samples=total_samples,
        feature_channels=n_mels,
        inference_sequence_factor=4,
        samplerate=samplerate)
def make_filter_banks(taps, bands, sr, size):
    out = {}
    for tap, band in zip(taps, bands):
        # KLUDGE: get rid of this hard-coded value
        if size == 8192:
            start = 0
        else:
            start = sr.nyquist // 2
        stop = sr.nyquist
        fb = zounds.FrequencyBand(start, stop)

        out[size] = zounds.learn.FilterBank(
            sr,
            tap,
            zounds.LinearScale(fb, band),
            0.05,
            normalize_filters=True,
            a_weighting=False)
        print(size, sr, out[size].scale)

        size = size // 2
        sr = sr * 2
    return out
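# Hedged usage sketch (argument values are assumptions, not from the source):
# one bank per octave, halving the window size and sample rate at each step,
# keyed by the window size each bank was built for.
import zounds

banks = make_filter_banks(
    taps=[512, 256, 128, 64],
    bands=[128, 64, 32, 16],
    sr=zounds.SR22050(),
    size=8192)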
import zounds

samplerate = zounds.SR22050()
BaseModel = zounds.stft(resample_to=samplerate, store_fft=True)


@zounds.simple_in_memory_settings
class Sound(BaseModel):
    pass


if __name__ == '__main__':
    url = 'https://ia802606.us.archive.org/9/items/AOC11B/onclassical_luisi_bach_partita_e-minor_bwv-830_3.ogg'
    _id = Sound.process(meta=url)
    snd = Sound(_id)

    band = zounds.FrequencyBand(50, samplerate.nyquist)
    bark_scale = zounds.BarkScale(band, 100)
    mel_scale = zounds.MelScale(band, 100)
    chroma_scale = zounds.ChromaScale(band)

    bark_bands = bark_scale.apply(snd.fft, zounds.HanningWindowingFunc())
    mel_bands = mel_scale.apply(snd.fft, zounds.HanningWindowingFunc())
    chroma_bands = chroma_scale.apply(snd.fft, zounds.HanningWindowingFunc())

    app = zounds.ZoundsApp(
        model=Sound,
        visualization_feature=Sound.fft,
        audio_feature=Sound.ogg,
        globals=globals(),
        locals=locals())
    app.start(9999)
import requests
import zounds
import numpy as np

from bot_helper import BinaryData, main, AnnotationListener
from log import module_logger
from stft_bot import windowing_sample_rate

logger = module_logger(__file__)

SAMPLE_RATE = zounds.SR11025()
frequency_band = zounds.FrequencyBand(20, SAMPLE_RATE.nyquist)
CHROMA_SCALE = zounds.ChromaScale(frequency_band)


class ChromaListener(AnnotationListener):
    def __init__(self, client, s3_client, page_size=3, logger=None):
        super().__init__(
            'stft_bot', client, s3_client, page_size, logger=logger)
        self.dtype = np.float32().dtype

    def get_metadata(self):
        return {
            'type': str(self.dtype),
            'shape': ('variable', CHROMA_SCALE.n_bands),
            'dimensions': [{
                'type': 'time',
import numpy as np
import torch
import zounds

app = zounds.ZoundsApp(globals=globals(), locals=locals())
app.start_in_thread(9999)

sr = zounds.SR22050()

# synth = zounds.SineSynthesizer(sr)
# samples = synth.synthesize(
#     zounds.Seconds(2), [110, 220, 440, 880]).astype(np.float32)

# iter_files and ComplextSTFT are assumed to be imported elsewhere
file_path = next(iter_files('/hdd/LJSpeech-1.1', '*.wav'))
samples = zounds.AudioSamples.from_file(file_path).astype(np.float32)

# discard the original phase and replace it with uniform noise
r = ComplextSTFT.from_audio(samples[None, None, :], sr)
phase = r.phase
phase[:] = np.random.uniform(-np.pi, np.pi, phase.shape)
recon = r.listen()

scale = zounds.MelScale(zounds.FrequencyBand(20, sr.nyquist - 20), 256)
filter_bank = zounds.learn.FilterBank(
    sr, 1024, scale, 0.5,
    normalize_filters=False, a_weighting=False)

result = filter_bank.convolve(torch.from_numpy(samples)[None, :])
spec = np.clip(
    result.data.cpu().numpy().squeeze(), 0, np.inf).T[1024:2048]

phase_result = filter_bank.convolve(torch.from_numpy(recon)[None, :])
phase_spec = np.clip(
    phase_result.data.cpu().numpy().squeeze(), 0, np.inf).T[1024:2048]

input('Waiting...')
def _scale(self, samplerate, bands, zero_start=False):
    start = 0 if zero_start else samplerate.nyquist / 2
    end = samplerate.nyquist
    return zounds.LinearScale(zounds.FrequencyBand(start, end), bands)
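# Hedged sketch (not from the source): the two variants this helper yields at
# 22050 Hz. With zero_start=True the scale spans DC..nyquist; the default
# covers only the top octave (nyquist/2..nyquist), since lower content is
# presumably carried by lower-sample-rate bands.
import zounds

sr = zounds.SR22050()
full = zounds.LinearScale(zounds.FrequencyBand(0, sr.nyquist), 64)
top_octave = zounds.LinearScale(
    zounds.FrequencyBand(sr.nyquist / 2, sr.nyquist), 64)
print(full, top_octave)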
import os
from random import choice

import numpy as np
import torch
import torch.nn.functional as F
import zounds
from torch import nn
from torch.optim import Adam

from featuresynth.data import TrainingData
from featuresynth.discriminator import Discriminator
from featuresynth.feature import (
    sr, total_samples, frequency_recomposition, feature_channels,
    band_sizes, filter_banks, bandpass_filters, slices, compute_features)
from featuresynth.generator import DDSPGenerator
from featuresynth.generator.ddsp import np_overlap_add
from featuresynth.util import device

sr = zounds.SR11025()
scale = zounds.MelScale(zounds.FrequencyBand(20, sr.nyquist), 128)
fb = zounds.learn.FilterBank(
    sr,
    128,
    scale,
    # per-filter scaling factors ranging from 0.25 to 0.5
    np.linspace(0.25, 0.5, len(scale)),
    normalize_filters=False,
    a_weighting=False).to(device)
fb.filter_bank = fb.filter_bank / 10


def perceptual(x, window=512, log_mag=False):
    x = F.pad(x, (0, window // 2))
    x = torch.stft(x, window, window // 2, normalized=True)
    if log_mag:
        x = torch.log(1e-12 + torch.abs(x[:, :, 0]))
    return x.contiguous().view(x.shape[0], -1)
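# Hedged sketch (not from the source): push a noise probe through the scaled
# bank defined above; shapes assume fb lives on `device` as constructed.
probe = torch.randn(2, 8192, device=device)
features = fb.convolve(probe)
print(features.shape)  # expected (2, 128, ~8192): one channel per mel filter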
def freq_band(start, stop):
    return zounds.FrequencyBand(start, stop)
        return coeffs

    @classmethod
    def from_audio(cls, samples, samplerate):
        coeffs = cls.batch_stft(samples)
        mag = np.abs(coeffs)
        coeffs = cls._embed(mag)
        coeffs = coeffs.transpose((0, 2, 1))
        coeffs = np.log(coeffs + 1e-12)
        coeffs = cls._postprocess_coeffs(coeffs)
        return cls(coeffs, samplerate)


sr = zounds.SR11025()
n_bands = 256

mel_scale = zounds.MelScale(
    zounds.FrequencyBand(20, sr.nyquist - 20), n_bands)
geom_scale = zounds.GeometricScale(20, sr.nyquist - 20, 0.05, n_bands)
linear_scale = zounds.LinearScale(zounds.FrequencyBand(0, sr.nyquist), 513)

mel_scale_basis = mel_scale._basis(
    linear_scale, zounds.HanningWindowingFunc())
geom_scale_basis = geom_scale._basis(
    linear_scale, zounds.HanningWindowingFunc())


class MelScalePhaseRecover(BasePhaseRecovery):
    basis = mel_scale_basis

    def __init__(self, data, samplerate):
        super().__init__(data, samplerate)


class GeometricScalePhaseRecover(BasePhaseRecovery):
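# Hedged sketch (not from the source) of what the bases computed above do:
# each is assumed to be an (n_bands, 513) matrix, one row per mel or
# geometric band, that pools the 513 linear-frequency bins of a 1024-point
# rfft frame into 256 perceptually spaced bands.
import numpy as np

frame = np.abs(np.fft.rfft(np.random.randn(1024)))  # 513 linear bins
mel_frame = mel_scale_basis @ frame                 # 256 mel-spaced bands
geom_frame = geom_scale_basis @ frame               # 256 geometric bands
print(mel_frame.shape, geom_frame.shape)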