def __init__(self):
    feature_channels = 128
    feature_size = 32
    samplerate = zounds.SR22050()
    n_fft = 1024
    hop = 256
    total_samples = 8192
    super().__init__(
        generator=GroupedMDCTGenerator(feature_channels),
        discriminator=MDCTDiscriminator(
            MDCT.mdct_bins(),
            feature_size,
            conditioning_channels=feature_channels),
        learning_rate=1e-4,
        feature_size=feature_size,
        audio_repr_class=MDCT,
        generator_loss=mel_gan_gen_loss,
        sub_gen_loss=least_squares_generator_loss,
        discriminator_loss=mel_gan_disc_loss,
        sub_disc_loss=least_squares_disc_loss,
        g_init=weights_init,
        d_init=weights_init,
        feature_funcs={
            'audio': (audio, (samplerate,)),
            'spectrogram': (spectrogram, (samplerate,))
        },
        total_samples=total_samples,
        feature_channels=feature_channels,
        samplerate=samplerate,
        inference_sequence_factor=4)
def __init__(self):
    total_samples = 8192
    samplerate = zounds.SR22050()
    n_fft = 1024
    hop = 256
    n_mels = 128
    feature_size = total_samples // hop
    super().__init__(
        generator=MelGanGenerator(feature_size, n_mels),
        discriminator=MelGanDiscriminator(),
        learning_rate=1e-4,
        feature_size=feature_size,
        audio_repr_class=RawAudio,
        generator_loss=mel_gan_gen_loss,
        discriminator_loss=mel_gan_disc_loss,
        g_init=weights_init,
        d_init=weights_init,
        feature_funcs={
            'audio': (audio, (samplerate,)),
            'spectrogram': (spectrogram, (samplerate, n_fft, hop, n_mels))
        },
        total_samples=total_samples,
        feature_channels=n_mels,
        samplerate=samplerate)
def __init__(self):
    n_mels = 128
    size = 32
    samplerate = zounds.SR22050()
    n_fft = 1024
    hop = 256
    total_samples = 8192
    super().__init__(
        Generator(n_mels, size, n_residual_layers=3),
        Discriminator(
            num_D=3,
            ndf=16,
            n_layers=4,
            downsampling_factor=4),
        learning_rate=1e-4,
        feature_size=size,
        audio_repr_class=RawAudio,
        generator_loss=mel_gan_gen_loss,
        discriminator_loss=mel_gan_disc_loss,
        g_init=weights_init,
        d_init=weights_init,
        feature_funcs={
            'audio': (audio, (samplerate,)),
            'spectrogram': (spectrogram, (samplerate,))
        },
        total_samples=total_samples,
        feature_channels=n_mels,
        samplerate=samplerate,
        inference_sequence_factor=4)
def __init__(self):
    n_mels = 128
    feature_size = 32
    samplerate = zounds.SR22050()
    n_fft = 1024
    hop = 256
    total_samples = 8192
    super().__init__(
        generator=MultiScaleGenerator(
            n_mels,
            feature_size,
            total_samples,
            transposed_conv=True,
            recompose=True),
        discriminator=ComplextSTFTDiscriminator(
            n_fft,
            hop,
            n_mels,
            do_fft=True),
        learning_rate=1e-4,
        feature_size=feature_size,
        audio_repr_class=RawAudio,
        generator_loss=mel_gan_gen_loss,
        sub_gen_loss=least_squares_generator_loss,
        discriminator_loss=mel_gan_disc_loss,
        sub_disc_loss=least_squares_disc_loss,
        g_init=weights_init,
        d_init=weights_init,
        feature_funcs={
            'audio': (audio, (samplerate,)),
            'spectrogram': (spectrogram, (samplerate,))
        },
        total_samples=total_samples,
        feature_channels=n_mels,
        samplerate=samplerate,
        inference_sequence_factor=4)
def __init__(self):
    n_mels = 128
    feature_size = 32
    sr = zounds.SR22050()
    n_fft = 1024
    hop = 256
    total_samples = 8192
    freq_band = zounds.FrequencyBand(20, sr.nyquist - 20)
    n_filters = 128
    filter_taps = 511

    # generator and discriminator each get their own (identical) filter bank
    gen_scale = zounds.LinearScale(freq_band, n_filters)
    gen_filter_bank = zounds.learn.FilterBank(
        sr,
        filter_taps,
        gen_scale,
        0.9,
        normalize_filters=True,
        a_weighting=False)

    disc_scale = zounds.LinearScale(freq_band, n_filters)
    disc_filter_bank = zounds.learn.FilterBank(
        sr,
        filter_taps,
        disc_scale,
        0.9,
        normalize_filters=True,
        a_weighting=False)

    super().__init__(
        generator=ResidualStackFilterBankGenerator(
            gen_filter_bank,
            feature_size,
            total_samples,
            n_mels,
            add_weight_norm=True),
        discriminator=FilterBankDiscriminator(
            disc_filter_bank,
            total_samples,
            conditioning_channels=n_mels),
        learning_rate=1e-4,
        feature_size=feature_size,
        audio_repr_class=RawAudio,
        generator_loss=mel_gan_gen_loss,
        sub_gen_loss=least_squares_generator_loss,
        discriminator_loss=mel_gan_disc_loss,
        sub_disc_loss=least_squares_disc_loss,
        g_init=weights_init,
        d_init=weights_init,
        feature_funcs={
            'audio': (audio, (sr,)),
            'spectrogram': (spectrogram, (sr,))
        },
        total_samples=total_samples,
        feature_channels=n_mels,
        samplerate=sr,
        inference_sequence_factor=4)
def stream(batch_size=64):
    path = '/hdd/musicnet/train_data'
    pattern = '*.wav'
    samplerate = zounds.SR22050()
    feature_spec = {'spectrogram': (256, 128)}
    feature_funcs = {'spectrogram': (spectrogram, (samplerate,))}
    bs = batch_stream(
        path, pattern, batch_size, feature_spec, 'spectrogram', feature_funcs)
    return bs
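A minimal usage sketch (an illustration, not from the original file): `stream` returns the raw `batch_stream` generator, which elsewhere in this codebase is consumed as one-element tuples via `next`. The exact axis layout of a batch is an assumption here.

if __name__ == '__main__':
    # pull one batch of log-mel spectrograms from the stream; per-example
    # shape follows feature_spec above, i.e. (256, 128)
    spec_batch, = next(stream(batch_size=4))
    print(spec_batch.shape)  # e.g. (4, 256, 128), assuming batch-major layout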
def __init__(self):
    n_mels = 128
    n_fft = 1024
    hop = 256
    samplerate = zounds.SR22050()
    feature_size = 32
    total_samples = 8192
    n_osc = 128
    scale = zounds.MelScale(
        zounds.FrequencyBand(20, samplerate.nyquist - 20), n_osc)
    filter_bank = zounds.learn.FilterBank(
        samplerate,
        511,
        scale,
        0.9,
        normalize_filters=True,
        a_weighting=False)
    super().__init__(
        generator=DDSPGenerator(
            n_osc=n_osc,
            input_size=feature_size,
            in_channels=n_mels,
            output_size=total_samples,
            scale=scale,
            samplerate=samplerate),
        discriminator=MultiScaleMultiResDiscriminator(
            total_samples,
            flatten_multiscale_features=False,
            decompose=True,
            channel_judgements=True,
            conditioning_channels=n_mels),
        learning_rate=1e-4,
        feature_size=feature_size,
        audio_repr_class=RawAudio,
        generator_loss=mel_gan_gen_loss,
        sub_gen_loss=least_squares_generator_loss,
        discriminator_loss=mel_gan_disc_loss,
        sub_disc_loss=least_squares_disc_loss,
        g_init=weights_init,
        d_init=weights_init,
        feature_funcs={
            'audio': (audio, (samplerate,)),
            'spectrogram': (spectrogram, (samplerate,))
        },
        total_samples=total_samples,
        feature_channels=n_mels,
        inference_sequence_factor=4,
        samplerate=samplerate)
def stream(total_samples=8192, batch_size=32):
    path = '/hdd/musicnet/train_data'
    pattern = '*.wav'
    samplerate = zounds.SR22050()
    feature_spec = {'audio': (total_samples, 1)}
    feature_funcs = {'audio': (audio, (samplerate,))}
    bs = batch_stream(
        path, pattern, batch_size, feature_spec, 'audio', feature_funcs)
    for batch, in bs:
        transformed = IdentityPhaseReovery.from_audio(batch, samplerate)
        yield batch, transformed
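A corresponding usage sketch (an illustration, not from the original file): each iteration yields the raw audio batch alongside its phase-recovery representation. The batch's axis order depends on `batch_stream` and is an assumption here.

if __name__ == '__main__':
    # raw batch per feature_spec above, paired with its transformed version
    samples, transformed = next(stream(total_samples=8192, batch_size=2))
    print(samples.shape)  # e.g. (2, 8192, 1), assuming batch-major layout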
class FilterBankMultiscaleExperiment(Experiment):
    AUDIO_REPR_CLASS = MultiScale
    SAMPLERATE = zounds.SR22050()
    N_MELS = 128
    feature_size = 32
    total_samples = 8192

    @classmethod
    def make_generator(cls):
        return FilterBankMultiScaleGenerator(
            cls.SAMPLERATE,
            cls.N_MELS,
            cls.feature_size,
            cls.total_samples,
            recompose=False)

    def __init__(self):
        super().__init__(
            generator=self.make_generator(),
            discriminator=FilterBankMultiScaleDiscriminator(
                self.total_samples,
                self.SAMPLERATE,
                decompose=False,
                conditioning_channels=self.N_MELS),
            learning_rate=1e-4,
            feature_size=self.feature_size,
            audio_repr_class=self.AUDIO_REPR_CLASS,
            generator_loss=mel_gan_gen_loss,
            sub_gen_loss=least_squares_generator_loss,
            discriminator_loss=mel_gan_disc_loss,
            sub_disc_loss=least_squares_disc_loss,
            g_init=weights_init,
            d_init=weights_init,
            feature_funcs={
                'audio': (audio, (self.SAMPLERATE,)),
                'spectrogram': (spectrogram, (self.SAMPLERATE,))
            },
            total_samples=self.total_samples,
            feature_channels=self.N_MELS,
            samplerate=self.SAMPLERATE,
            inference_sequence_factor=4)
def __init__(self):
    noise_dim = 128
    samplerate = zounds.SR22050()
    repr_class = PcaRepresentation
    vocoder = DeterministicVocoder(repr_class, samplerate)
    n_features = repr_class.pca.n_components

    # adapters that discard features and apply plain least-squares GAN losses
    def gen_loss(r_features, f_features, r_score, f_score, gan_loss):
        return least_squares_generator_loss(f_score)

    def disc_loss(r_score, f_score, gan_loss):
        return least_squares_disc_loss(r_score, f_score)

    disc_channels = 256

    super().__init__(
        vocoder=vocoder,
        feature_generator=PredictiveGenerator(),
        generator_init=weights_init,
        generator_loss=gen_loss,
        feature_disc=SpectrogramFeatureDiscriminator(
            n_features, disc_channels),
        disc_init=weights_init,
        disc_loss=disc_loss,
        feature_funcs={
            'audio': (audio, (samplerate,))
        },
        feature_spec={
            'audio': (2 ** 16, 1)
        },
        audio_repr_class=PcaRepresentation,
        learning_rate=1e-4,
        condition_shape=(noise_dim, 1),
        samplerate=samplerate,
        anchor_feature='audio')
def __init__(self):
    n_mels = 128
    feature_size = 32
    samplerate = zounds.SR22050()
    n_fft = 1024
    hop = 256
    total_samples = 8192
    super().__init__(
        generator=MultiScaleGenerator(
            n_mels,
            feature_size,
            total_samples,
            transposed_conv=True,
            recompose=False,
            kernel_size=8),
        discriminator=MultiScaleMultiResDiscriminator(
            total_samples,
            flatten_multiscale_features=False,
            channel_judgements=True,
            conditioning_channels=n_mels,
            decompose=False,
            kernel_size=9),
        learning_rate=1e-4,
        feature_size=feature_size,
        audio_repr_class=MultiScale,
        generator_loss=mel_gan_gen_loss,
        sub_gen_loss=least_squares_generator_loss,
        discriminator_loss=mel_gan_disc_loss,
        sub_disc_loss=least_squares_disc_loss,
        g_init=weights_init,
        d_init=weights_init,
        feature_funcs={
            'audio': (audio, (samplerate,)),
            'spectrogram': (spectrogram, (samplerate,))
        },
        total_samples=total_samples,
        feature_channels=n_mels,
        samplerate=samplerate,
        inference_sequence_factor=4)
class ComplexSTFTExperiment(Experiment):
    N_MELS = 128
    FEATURE_SIZE = 32
    SAMPLERATE = zounds.SR22050()
    N_FFT = 1024
    HOP = 256
    TOTAL_SAMPLES = 8192
    AUDIO_REPR_CLASS = ComplextSTFT

    @classmethod
    def make_generator(cls):
        return ComplextSTFTGenerator(cls.N_MELS, cls.N_FFT, cls.HOP)

    def __init__(self):
        super().__init__(
            generator=self.make_generator(),
            discriminator=ComplextSTFTDiscriminator(
                window_size=self.N_FFT,
                hop=self.HOP,
                conditioning_channels=self.N_MELS),
            learning_rate=1e-4,
            feature_size=self.FEATURE_SIZE,
            audio_repr_class=self.AUDIO_REPR_CLASS,
            generator_loss=mel_gan_gen_loss,
            sub_gen_loss=least_squares_generator_loss,
            discriminator_loss=mel_gan_disc_loss,
            sub_disc_loss=least_squares_disc_loss,
            g_init=weights_init,
            d_init=weights_init,
            feature_funcs={
                'audio': (audio, (self.SAMPLERATE,)),
                'spectrogram': (spectrogram, (self.SAMPLERATE,))
            },
            total_samples=self.TOTAL_SAMPLES,
            feature_channels=self.N_MELS,
            samplerate=self.SAMPLERATE,
            inference_sequence_factor=4)
import argparse

import featureflow as ff
import zounds


class Settings(ff.PersistenceSettings):
    id_provider = ff.UuidProvider()
    key_builder = ff.StringDelimitedKeyBuilder()
    database = ff.LmdbDatabase(path='timbre', key_builder=key_builder)


windowing = zounds.HalfLapped()
STFT = zounds.stft(resample_to=zounds.SR22050(), wscheme=windowing)


class WithTimbre(STFT, Settings):
    bark = zounds.ConstantRateTimeSeriesFeature(
        zounds.BarkBands,
        needs=STFT.fft,
        store=True)

    bfcc = zounds.ConstantRateTimeSeriesFeature(
        zounds.BFCC,
        needs=bark,
        store=True)


@zounds.simple_settings
class BfccKmeans(ff.BaseModel):
    docs = ff.Feature(
        ff.IteratorNode,
from featuresynth.data import batch_stream
from featuresynth.feature import audio
from featuresynth.audio.transform import \
    fft_frequency_decompose, fft_resample
from featuresynth.audio import RawAudio
import zounds
import torch
import numpy as np
from matplotlib import pyplot as plt

path = '/hdd/musicnet/train_data'
pattern = '*.wav'
total_samples = 2 ** 17
samplerate = zounds.SR22050()
feature_spec = {'audio': (total_samples, 1)}
feature_funcs = {'audio': (audio, (samplerate,))}
batch_size = 1
bs = batch_stream(
    path, pattern, batch_size, feature_spec, 'audio', feature_funcs)

if __name__ == '__main__':
    # app = zounds.ZoundsApp(locals=locals(), globals=globals())
    # app.start_in_thread(9999)
    # samples, = next(bs)
    # samples = torch.from_numpy(samples)
    # min_size = 2 ** (np.log2(total_samples) - 4)
    # bands = fft_frequency_decompose(samples, min_size)
    # samples = zounds.AudioSamples(samples.squeeze(), samplerate)
    # input('Waiting...')
    # the entire body above is commented out; pass keeps the block
    # syntactically valid
    pass
        # the STFT output holds (real, imag) pairs in the last dimension
        real_part, imag_part = fft.unbind(-1)
        magnitude = torch.sqrt(real_part ** 2 + imag_part ** 2)
        # project the magnitude spectrum onto the mel filter bank, then
        # log-compress with a floor to avoid log(0)
        mel_output = torch.matmul(self.mel_basis, magnitude)
        log_mel_spec = torch.log10(torch.clamp(mel_output, min=1e-5))
        return log_mel_spec


data_cache = LmdbCollection('datacache')


@cache(data_cache)
def audio(file_chunk, samplerate):
    file_path, start, stop = file_chunk
    samples = zounds.AudioSamples.from_file(file_path).mono[start:stop]
    samples = librosa.resample(
        samples, int(samples.samplerate), int(samplerate))
    samples = librosa.util.normalize(samples, axis=-1) * 0.95
    return samples.astype(np.float32)


# n_fft, hop, window size, samplerate, n_mels
audio_to_mel_22050 = Audio2Mel(1024, 256, 1024, int(zounds.SR22050()), 128)


@cache(data_cache)
def spectrogram(file_chunk, samplerate):
    print(file_chunk)
    samples = audio(file_chunk, samplerate)[:]
    spec = audio_to_mel_22050(samples)
    # transpose to time-major frames: (time, n_mels)
    spec = spec.data.cpu().numpy().T.astype(np.float32)
    return spec
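Both feature functions take a `(file_path, start, stop)` chunk tuple and memoize their results in the LMDB-backed cache, which is why the rest of the codebase passes them around as `(func, args)` pairs in `feature_funcs`. A usage sketch; the wav path and sample offsets below are placeholders:

chunk = ('/hdd/musicnet/train_data/example.wav', 0, 8192)
sr = zounds.SR22050()
samples = audio(chunk, sr)[:]  # float32 samples, resampled to 22050Hz
spec = spectrogram(chunk, sr)  # float32 log-mel frames, time-major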
class FilterBankExperiment(Experiment):
    """
    This is probably the best audio quality yet.  The audio is relatively
    crisp, and the spectrograms are indistinguishable from those of real
    speech, although the speech itself is hard to understand.  There are
    definite phase issues here and there after 12 hours of training.
    Overall, the texture of the speech is more realistic than what's
    produced by the basic MelGAN setup.
    """
    N_MELS = 128
    FEATURE_SIZE = 32
    SAMPLERATE = zounds.SR22050()
    N_FFT = 1024
    HOP = 256
    TOTAL_SAMPLES = 8192
    AUDIO_REPR_CLASS = RawAudio

    @classmethod
    def make_filter_bank(cls, samplerate):
        scale = zounds.LinearScale(
            zounds.FrequencyBand(20, samplerate.nyquist - 20), 128)
        filter_bank = zounds.learn.FilterBank(
            samplerate,
            511,
            scale,
            0.9,
            normalize_filters=True,
            a_weighting=False)
        return filter_bank

    @classmethod
    def make_generator(cls, filter_bank=None):
        filter_bank = filter_bank or cls.make_filter_bank(cls.SAMPLERATE)
        return FilterBankGenerator(
            filter_bank,
            cls.FEATURE_SIZE,
            cls.TOTAL_SAMPLES,
            cls.N_MELS)

    def __init__(self):
        filter_bank = self.make_filter_bank(self.SAMPLERATE)
        super().__init__(
            generator=self.make_generator(),
            discriminator=FilterBankDiscriminator(
                filter_bank, self.TOTAL_SAMPLES),
            learning_rate=1e-4,
            feature_size=self.FEATURE_SIZE,
            audio_repr_class=self.AUDIO_REPR_CLASS,
            generator_loss=mel_gan_gen_loss,
            sub_gen_loss=least_squares_generator_loss,
            discriminator_loss=mel_gan_disc_loss,
            sub_disc_loss=least_squares_disc_loss,
            g_init=weights_init,
            d_init=weights_init,
            feature_funcs={
                'audio': (audio, (self.SAMPLERATE,)),
                'spectrogram': (spectrogram, (self.SAMPLERATE,))
            },
            total_samples=self.TOTAL_SAMPLES,
            feature_channels=self.N_MELS,
            samplerate=self.SAMPLERATE,
            inference_sequence_factor=4)
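Because `make_generator` rebuilds the generator without constructing the full experiment, a trained model can be restored for standalone inference. A hedged sketch; the checkpoint filename and its layout are assumptions, not part of this codebase:

import torch

generator = FilterBankExperiment.make_generator()
# hypothetical checkpoint path; substitute the real experiment checkpoint
state = torch.load('filter_bank_generator.pt', map_location='cpu')
generator.load_state_dict(state)
generator.eval()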
    dct = zounds.ArrayWithUnitsFeature(
        zounds.DCT,
        scale_always_even=True,
        needs=long_windowed,
        store=True)

    mdct = zounds.FrequencyAdaptiveFeature(
        zounds.FrequencyAdaptiveTransform,
        transform=scipy.fftpack.idct,
        scale=scale,
        needs=dct,
        store=True)


if __name__ == '__main__':
    # generate some audio
    synth = zounds.TickSynthesizer(zounds.SR22050())
    orig_audio = synth.synthesize(
        zounds.Seconds(5), zounds.Milliseconds(200))

    # analyze the audio
    _id = Document.process(meta=orig_audio.encode())
    doc = Document(_id)

    synth = zounds.FrequencyAdaptiveDCTSynthesizer(scale, samplerate)
    recon_audio = synth.synthesize(doc.mdct)

    # get a rasterized visualization of the representation
    img = doc.mdct.square(100, do_overlap_add=True)

    app = zounds.ZoundsApp(
        model=Document,
        audio_feature=Document.ogg,
        visualization_feature=Document.bark,
import librosa
# from featuresynth.data import DataStore
# from featuresynth.feature.spectrogram import FilterBankSpectrogram
# from featuresynth.audio import \
#     MelScalePhaseRecover, GeometricScalePhaseRecover
import time

from featuresynth.data.conjure import cache, LmdbCollection
from featuresynth.data.filesystem import iter_files
from featuresynth.audio import ComplextSTFT
import numpy as np  # used below; missing from the original imports
import zounds
import torch

if __name__ == '__main__':
    app = zounds.ZoundsApp(globals=globals(), locals=locals())
    app.start_in_thread(9999)

    sr = zounds.SR22050()

    # synth = zounds.SineSynthesizer(sr)
    # samples = synth.synthesize(
    #     zounds.Seconds(2), [110, 220, 440, 880]).astype(np.float32)

    file_path = next(iter_files('/hdd/LJSpeech-1.1', '*.wav'))
    samples = zounds.AudioSamples.from_file(file_path).astype(np.float32)

    # replace the phase of the complex STFT with uniform random noise and
    # listen to the reconstruction
    r = ComplextSTFT.from_audio(samples[None, None, :], sr)
    phase = r.phase
    phase[:] = np.random.uniform(-np.pi, np.pi, phase.shape)
    recon = r.listen()

    scale = zounds.MelScale(zounds.FrequencyBand(20, sr.nyquist - 20), 256)
    filter_bank = zounds.learn.FilterBank(sr, 1024, scale,