def listen():
    """Transform a batch drawn from ``walk2`` and return it as a single clip.

    The chunks yielded by ``walk2(1000)`` are concatenated, passed through
    ``p.pipeline.transform``, and every row of the squeezed result is wrapped
    as audio, followed by 250 ms of padding, before all rows are joined.
    """
    silence = zounds.Milliseconds(250)
    batch = np.concatenate(list(walk2(1000)))
    transformed = p.pipeline.transform(batch).data.squeeze()

    padded = []
    for row in transformed:
        clip = zounds.AudioSamples(row, samplerate)
        padded.append(clip.pad_with_silence(silence))

    joined = np.concatenate(padded)
    # NOTE(review): rows are wrapped with ``samplerate`` while the result is
    # tagged SR11025 -- confirm the two agree.
    return zounds.AudioSamples(joined, zounds.SR11025())
def view_band(index):
    """Return STFT magnitudes of ``bands[index]`` resampled to ``total_samples``."""
    from scipy.signal import resample

    stretched = resample(bands[index].squeeze(), total_samples)
    audio = zounds.AudioSamples(stretched, sr)
    return np.abs(zounds.spectral.stft(audio))
def preview(fake_batch):
    """Render the first item of ``fake_batch`` to silence-padded audio.

    The first item is cut into non-overlapping windows of
    ``audio_generator_input_size`` along axis 1, fed to ``audio_generator``,
    and the generated bands are recombined with ``frequency_recomposition``.
    """
    first = torch.from_numpy(fake_batch)[0]
    step = audio_generator_input_size

    # Window size equals the stride, so windows do not overlap; the window
    # axis is then moved to the front.
    # NOTE(review): the original author annotated (256, 512) -> (256, 8, 64)
    # -> (8, 256, 64) here -- confirm against the caller.
    frames = first.unfold(1, step, step).permute(1, 0, 2)

    generated = audio_generator(frames)

    flattened = []
    for band in generated.values():
        as_array = band.data.cpu().numpy().reshape(1, 1, -1)
        flattened.append(np.concatenate(as_array, axis=-1))

    recomposed = frequency_recomposition(flattened, total_feature_samples)
    return zounds.AudioSamples(recomposed.squeeze(), sr).pad_with_silence()
def g_sample():
    """Pick one recomposed example at random and return it with its spectrogram.

    Returns:
        A ``(fake_sample, coeffs)`` tuple: the chosen example as
        ``zounds.AudioSamples`` scaled by its maximum value, and the magnitude
        of its STFT.
    """
    recomposed = frequency_recomposition(bands, total_samples)
    index = np.random.randint(0, len(recomposed))
    fake_sample = zounds.AudioSamples(recomposed[index], sr)
    # The small epsilon keeps an all-zero (silent) example from turning into
    # NaNs through a division by zero.
    fake_sample /= fake_sample.max() + 1e-12
    coeffs = np.abs(zounds.spectral.stft(fake_sample))
    return fake_sample, coeffs
def hear_real_band(samples, index):
    """Return band ``index`` of ``samples`` as peak-scaled audio.

    The first item of the band is moved to a NumPy array, resampled to
    ``total_samples`` when its length differs, and divided by its maximum
    (plus a small epsilon).
    """
    from scipy.signal import resample

    signal = samples[index][0].data.cpu().numpy().squeeze()
    needs_resampling = len(signal) != total_samples
    if needs_resampling:
        signal = resample(signal, total_samples)

    audio = zounds.AudioSamples(signal, sr)
    peak = audio.max() + 1e-12
    audio /= peak
    return audio
def hear_band(index):
    """Return the first item of ``bands[index]`` as peak-scaled audio.

    The band is resampled to ``total_samples`` when its length differs and
    divided by its maximum (plus a small epsilon).
    """
    from scipy.signal import resample

    signal = bands[index][0].squeeze()
    needs_resampling = len(signal) != total_samples
    if needs_resampling:
        signal = resample(signal, total_samples)

    audio = zounds.AudioSamples(signal, sr)
    peak = audio.max() + 1e-12
    audio /= peak
    return audio
def to_audio(self):
    """Invert the stored (log-magnitude, phase-increment) pairs to audio.

    The last axis of ``self.data`` holds the log magnitude at index 0 and a
    phase term at index 1. The phase term is accumulated along axis 0, wrapped
    into ``[-pi, pi)``, combined with the exponentiated magnitude, and handed
    to ``self.proc.istft``. The result gains a leading axis of length one.
    """
    magnitude = np.exp(self.data[..., 0])
    accumulated = np.cumsum(self.data[..., 1], axis=0)
    wrapped = (accumulated + np.pi) % (2 * np.pi) - np.pi
    spectrum = magnitude * np.exp(1j * wrapped)
    audio = zounds.AudioSamples(self.proc.istft(spectrum), self.samplerate)
    return audio[None, :]
def view_real_band(samples, index):
    """Return STFT magnitudes for the first item of band ``index`` in ``samples``.

    The band is moved to a NumPy array and resampled to ``total_samples`` when
    its length differs.
    """
    from scipy.signal import resample

    signal = samples[index][0].data.cpu().numpy().squeeze()
    needs_resampling = len(signal) != total_samples
    if needs_resampling:
        signal = resample(signal, total_samples)

    audio = zounds.AudioSamples(signal, sr)
    return np.abs(zounds.spectral.stft(audio))
def test_synthetic(batch_size):
    """Round-trip a synthetic sine mixture through decomposition/recomposition.

    Returns:
        ``(samples, recomposed)``: the synthesized signal and the first
        recomposed batch item, wrapped as audio and padded with silence.
    """
    frequencies = [55, 110, 220, 440, 880, 1660, 1660 * 2]
    duration = sr.frequency * total_samples
    tone = zounds.SineSynthesizer(sr).synthesize(duration, frequencies)

    # Stack ``batch_size`` identical copies along a new leading axis.
    stacked = np.repeat(tone[None, :], batch_size, axis=0)

    decomposed = frequency_decomposition(stacked, band_sizes)
    rebuilt = frequency_recomposition(decomposed, total_samples)
    rebuilt_audio = zounds.AudioSamples(rebuilt[0], sr).pad_with_silence()
    return tone, rebuilt_audio
def test_filter_bank_recon(samples, return_spectral=False):
    """Reconstruct the first batch item through each band's filter bank.

    The first item of ``samples`` is peak-scaled, split into bands with
    ``frequency_decomposition``, and each band is passed through the matching
    entry of ``filter_banks`` (``convolve`` followed by
    ``transposed_convolve``). The processed bands are recombined with
    ``frequency_recomposition``.

    Args:
        samples: batch of signals; only the first item is used. The caller's
            array is left untouched.
        return_spectral: when true, also return the per-band ``convolve``
            outputs.

    Returns:
        ``(orig, final)`` as ``zounds.AudioSamples``, or
        ``(orig, final, spectral)`` when ``return_spectral`` is true.
    """
    first = samples[:1, ...]
    # ``first`` is a view of the caller's array, so an in-place ``/=`` would
    # silently rescale the caller's data. Divide out of place instead; the
    # epsilon protects against an all-zero input.
    first = first / (first.max() + 1e-12)

    bands = frequency_decomposition(first, band_sizes)

    new_bands = []
    spectral = []
    for band, fb in zip(bands, filter_banks):
        band = torch.from_numpy(band).float().to(device)
        sp = fb.convolve(band)
        spectral.append(sp.data.cpu().numpy())
        band = fb.transposed_convolve(sp)
        new_bands.append(band.data.cpu().numpy())

    final = frequency_recomposition(new_bands, total_samples)
    orig = zounds.AudioSamples(first.squeeze(), sr)
    final = zounds.AudioSamples(final.squeeze(), sr)
    final /= final.max() + 1e-12

    if return_spectral:
        return orig, final, spectral
    return orig, final
def load_and_play():
    """Yield playable audio and spectra for the most recently created ``*.npy`` file.

    The ``.npy`` files in the current directory are ordered by their ``ctime``
    and the last one is loaded.

    Yields:
        ``(raw, result, audio, spec)`` for each loaded example: the example
        twice (no synthesis step is currently applied), the example as
        peak-scaled audio padded with one second of silence, and the magnitude
        of the real FFT over sliding windows of the example.

    Raises:
        IndexError: if the current directory contains no ``.npy`` files.
    """
    # ``key=`` works on both Python 2 and 3 (``cmp=`` was removed in 3) and
    # compares full-precision timestamps instead of int-truncated differences.
    files = sorted(glob.glob('*.npy'), key=os.path.getctime)
    if not files:
        raise IndexError('no .npy files found in the current directory')

    most_recent = files[-1]
    # Single-argument form prints identically under Python 2 and Python 3.
    print('loading generated examples from {}'.format(most_recent))
    results = np.load(most_recent)

    # No synthesis step is applied; the loaded arrays are used directly.
    synthesized = results

    for raw, result in zip(results, synthesized):
        windowed = zounds.sliding_window(result, 512, 256)
        spec = np.abs(np.fft.rfft(windowed))
        audio_samples = zounds.AudioSamples(result, samplerate) \
            .pad_with_silence(zounds.Seconds(1))
        # The epsilon keeps a silent example from producing NaNs.
        peak = audio_samples.max() + 1e-12
        yield raw, result, audio_samples / peak, spec
def check_recon():
    """Quantize one streamed example with ``kmeans`` and resynthesize it.

    Each frame of the example is unit-normalized, replaced by its nearest
    cluster center scaled back by the frame's norm, and the resulting array
    is passed through ``generator`` and ``fft_frequency_recompose``.

    Returns:
        ``(spec, recon, audio)``: the normalized frames, the squeezed and
        transposed quantized reconstruction, and the synthesized audio.
    """
    (features,) = next(stream(batch_size=1))
    n_items, n_channels, n_frames = features.shape

    # One row per frame, one column per channel.
    frames = features.transpose((0, 2, 1)).reshape(
        (n_items * n_frames, n_channels))

    magnitudes = np.linalg.norm(frames, axis=-1, keepdims=True)
    frames /= magnitudes + 1e-12

    assignments = kmeans.predict(frames)
    quantized = kmeans.cluster_centers_[assignments]
    quantized *= magnitudes

    recon = quantized \
        .reshape((n_items, n_frames, n_channels)) \
        .transpose((0, 2, 1))

    generated = generator.forward(torch.from_numpy(recon))
    waveform = fft_frequency_recompose(generated, 256 * 256).data.cpu().numpy()
    audio = zounds.AudioSamples(waveform.squeeze(), samplerate)
    return frames, recon.squeeze().T, audio
def test_spectral_filtering():
    """Filter uniform noise frame by frame in the frequency domain.

    Noise is cut into overlapping frames, each frame's real FFT is multiplied
    by the coefficients from ``get_filter_coeffs``, and the inverse transforms
    are overlap-added back into a signal that is returned as silence-padded
    audio at ``zounds.SR11025``.
    """
    total_samples = 16384
    frame_size = 32
    hop = 16

    # NOTE(review): the transpose below swaps the last two axes of the filter
    # coefficients before the multiply -- confirm the shape
    # ``get_filter_coeffs`` returns.
    coeffs = get_filter_coeffs(frame_size, total_samples // hop)

    noise = np.random.uniform(-1, 1, total_samples)
    # Append one hop of zeros to the end before framing.
    noise = np.pad(noise, ((0, hop), ), mode='constant')
    frames = zounds.sliding_window(noise, frame_size, hop)

    spectrum = np.fft.rfft(frames, axis=-1, norm='ortho')
    shaped = coeffs.transpose((0, 2, 1)) * spectrum
    recovered = np.fft.irfft(shaped, axis=-1, norm='ortho')

    signal = np_overlap_add(recovered[:, None, :, :], apply_window=True)
    signal = signal.squeeze()[:total_samples]
    return zounds.AudioSamples(signal, zounds.SR11025()).pad_with_silence()
def display(self):
    """Return ``spectrogram`` of the first item produced by ``to_audio``."""
    first = self.to_audio()[0]
    return spectrogram(zounds.AudioSamples(first, self.samplerate))
resampled = fft_resample(band, desired_size, size == first_band) # if size != desired_size: # resampled = torch.zeros_like(resampled) bands.append(resampled) return sum(bands) if __name__ == '__main__': app = zounds.ZoundsApp(globals=globals(), locals=locals()) app.start_in_thread(9999) sr = zounds.SR11025() synth = zounds.NoiseSynthesizer(sr) noise = synth.synthesize(sr.frequency * 16385) signal = torch.from_numpy(noise).view(1, 1, 16384).float() rs = fft_resample(signal, 16384, is_lowest_band=False) rs = zounds.AudioSamples(rs.data.cpu().numpy().squeeze(), sr) bands = fft_frequency_decompose(signal, 512) recon = {} for k, v in bands.items(): print(k, v.shape) recon[k] = zounds.AudioSamples( fft_resample(v, 16384, k == 512).data.cpu().numpy().squeeze(), sr) r = fft_frequency_recompose(bands, 16384) r = zounds.AudioSamples(r.data.cpu().numpy().squeeze(), sr) input('waiting...')
def listen(self):
    """Return the first item from ``to_audio`` padded with one second of silence."""
    first = self.to_audio()[0]
    clip = zounds.AudioSamples(first, self.samplerate)
    return clip.pad_with_silence(zounds.Seconds(1))
def synthesize_iter(self):
    """Yield ``(sample, audio)`` pairs for every synthesized sample.

    The frequency-adaptive form of this object is passed to the class's
    ``synthesize_block``; each resulting sample is yielded together with an
    audio version padded with one second of silence.
    """
    adaptive = self.as_frequency_adaptive()
    for block in self.__class__.synthesize_block(adaptive):
        audio = zounds.AudioSamples(block, samplerate)
        yield block, audio.pad_with_silence(zounds.Seconds(1))
# noise = torch.FloatTensor(16384).uniform_(-1, 1) # windowed = noise.unfold(-1, 256, 256) # noise_coeffs = torch.rfft(windowed, 1, normalized=True) # noise_coeffs = noise_coeffs.view(1, 64, 129, 2) # # coeffs = coeffs.permute(0, 2, 1)[..., None] # # filtered = coeffs * noise_coeffs # recovered = torch.irfft(filtered, 1, normalized=True, signal_sizes=(256,)) # recovered = recovered.view(-1) # return zounds.AudioSamples( # recovered.data.cpu().numpy().squeeze(), # zounds.SR11025() # ).pad_with_silence() real_noise = zounds.AudioSamples(np.random.uniform(-1, 1, 16384), zounds.SR11025()).pad_with_silence() spec_test = test_spectral_filtering() # spec_test /= (spec_test.max() + 1e-12) # torch_spec_test = test_spectral_filtering_torch() # torch_spec_test /= (torch_spec_test.max() + 1e-12) if __name__ == '__main__': app = zounds.ZoundsApp(globals=globals(), locals=locals()) app.start_in_thread(8888) feature_size = 64 g = DDSPGenerator(feature_size, feature_channels, 128, None, None, None, None) \ .to(device) \ .initialize_weights()
if __name__ == '__main__': parser = argparse.ArgumentParser(parents=[AppSettings()]) parser.add_argument( '--sound-uri', default= 'https://archive.org/download/LucaBrasi2/06-Kevin_Gates-Out_The_Mud_Prod_By_The_Runners_The_Monarch.ogg' ) args = parser.parse_args() _id = Sound.process(meta=args.sound_uri) snd = Sound(_id) original = snd.resampled slow = zounds.AudioSamples(time_stretch(original, 0.75).squeeze(), sr) fast = zounds.AudioSamples(time_stretch(original, 1.25).squeeze(), sr) higher = zounds.AudioSamples(pitch_shift(original, 1.0).squeeze(), sr) lower = zounds.AudioSamples(pitch_shift(original, -1.0).squeeze(), sr) # apply a sliding window to demonstrate time stretch and pitch shift in # batch mode windowing_sr = zounds.SampleRate(frequency=zounds.Seconds(5), duration=zounds.Seconds(10)) windowed = snd.resampled.sliding_window(windowing_sr) windowed = zounds.ArrayWithUnits( windowed, [zounds.IdentityDimension(), windowed.dimensions[1]]) def samples(x):
def fake_audio():
    """Return the first item of ``generated`` as silence-padded audio."""
    first = generated[0].squeeze()
    return zounds.AudioSamples(first, sr).pad_with_silence()
def samples(x):
    """Wrap ``x`` as ``zounds.AudioSamples`` using the enclosing-scope ``sr``."""
    wrapped = zounds.AudioSamples(x, sr)
    return wrapped
samples = zounds.AudioSamples(generated[0].squeeze(), sr) return samples.pad_with_silence() def fake_spec(): return zounds.log_modulus(spectrogram(fake_audio()) * 100) def r_spec(): return zounds.log_modulus(spectrogram(real_audio) * 100) for samples, features in batch_stream: samples /= np.abs(samples).max(axis=-1, keepdims=True) + 1e-12 features /= features.max(axis=(1, 2), keepdims=True) + 1e-12 real_spec = features[0].T real_audio = zounds.AudioSamples(samples[0].squeeze(), sr).pad_with_silence() samples = torch.from_numpy(samples).to(device) # samples = normalize(samples) features = torch.from_numpy(features).to(device) # features = normalize(features) step = next(steps) data = step(samples, features) print({k: v for k, v in data.items() if 'loss' in k}) try: generated = data['fake'] except KeyError: pass batch_count += 1