def _transform_indices(self, indices):
    for index in indices:
        sound_id, time_slice = self.time_slices[index]
        start = time_slice.start / zounds.Seconds(1)
        duration = time_slice.duration / zounds.Seconds(1)
        data = {
            'created_by': self.user_uri,
            'sound': sound_id,
            'start_seconds': start,
            'duration_seconds': duration,
            'end_seconds': start + duration
        }
        yield data
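
# A minimal sketch of the duration arithmetic used above, with hypothetical
# slice values: dividing one numpy timedelta by zounds.Seconds(1) converts
# it to a plain float number of seconds
import zounds

ts = zounds.TimeSlice(
    start=zounds.Milliseconds(1500), duration=zounds.Seconds(2))
start = ts.start / zounds.Seconds(1)        # 1.5
duration = ts.duration / zounds.Seconds(1)  # 2.0
print(start, duration, start + duration)    # 1.5 2.0 3.5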
def get_metadata(self):
    return {
        'type': str(self.dtype),
        'shape': ('variable', CHROMA_SCALE.n_bands),
        'dimensions': [
            {
                'type': 'time',
                'sample_frequency_seconds':
                    windowing_sample_rate.frequency / zounds.Seconds(1),
                'sample_duration_seconds':
                    windowing_sample_rate.duration / zounds.Seconds(1)
            },
            {
                'type': 'identity'
            }
        ]
    }
class Document(BaseModel):
    bark = zounds.ArrayWithUnitsFeature(
        zounds.BarkBands,
        samplerate=samplerate,
        stop_freq_hz=samplerate.nyquist,
        needs=BaseModel.fft,
        store=True)

    long_windowed = zounds.ArrayWithUnitsFeature(
        zounds.SlidingWindow,
        wscheme=zounds.SampleRate(
            frequency=zounds.Milliseconds(500),
            duration=zounds.Seconds(1)),
        wfunc=windowing_func,
        needs=BaseModel.resampled,
        store=True)

    dct = zounds.ArrayWithUnitsFeature(
        zounds.DCT,
        scale_always_even=True,
        needs=long_windowed,
        store=True)

    mdct = zounds.FrequencyAdaptiveFeature(
        zounds.FrequencyAdaptiveTransform,
        transform=scipy.fftpack.idct,
        scale=scale,
        needs=dct,
        store=True)
def total_duration(doc, ts):
    try:
        duration = duration_cache[doc._id]
    except KeyError:
        duration = doc.geom.dimensions[0].end / zounds.Seconds(1)
        duration_cache[doc._id] = duration
    return duration
def synthetic():
    for i in range(100):
        duration = zounds.Seconds(np.random.randint(2, 20))
        root = np.random.randint(50, 400)

        # build a series of octaves above the root; zero octaves are added
        # here, so each example is a single sine tone
        hz = [root]
        for _ in range(0):
            hz.append(hz[-1] * 2)

        synth = zounds.SineSynthesizer(samplerate)
        s = synth.synthesize(duration, hz)
        yield s.encode()
def transform_search_result(result, req, nresults):
    _id, ts, extra_data = result
    ts = WebTimeSlice(ts)
    quoted_id = urllib.parse.quote(_id, safe='')
    qs = ts.to_query_string()
    start = ts.start / zounds.Seconds(1)
    duration = ts.duration / zounds.Seconds(1)
    end = start + duration
    return dict(
        _id=_id,
        start=start,
        duration=duration,
        end=end,
        search=str(SearchUri(
            quoted_id, req=req, timeslice=ts, nresults=nresults)),
        bark=str(FeatureUri(
            quoted_id=quoted_id,
            feature='geom',
            timeslice_query_string=qs,
            req=req)),
        hashed=str(FeatureUri(
            quoted_id=quoted_id,
            feature='hashed',
            timeslice_query_string=qs,
            req=req)),
        audio=str(FeatureUri(
            quoted_id=quoted_id,
            feature='ogg',
            timeslice_query_string=qs,
            req=req)),
        meta=str(FeatureUri(
            quoted_id=quoted_id,
            feature='meta',
            timeslice_query_string=qs,
            req=req)),
        **extra_data)
def main(user_name, bucket_name, email, about_me, info_url, listener_cls,
         page_size=100, logger=None):
    parser = argparse.ArgumentParser(parents=[DefaultArgumentParser()])
    args = parser.parse_args()

    client = Client(args.annotate_api_endpoint, logger=logger)

    object_storage_client = ObjectStorageClient(
        endpoint=args.s3_endpoint,
        region=args.s3_region,
        access_key=args.aws_access_key_id,
        secret=args.aws_secret_access_key,
        bucket=bucket_name)

    listener = listener_cls(
        client, object_storage_client, page_size, logger=logger)

    # get metadata describing feature shape and dimensions
    try:
        metadata = listener.get_metadata()
    except AttributeError:
        # the listener does not provide metadata explicitly, but in the case
        # of listeners that accept audio samples directly, we can infer the
        # metadata
        samples = zounds.AudioSamples.silence(
            zounds.SR44100(), zounds.Seconds(10))
        binary_data = listener._process_samples(samples)
        metadata = about_me_metadata(binary_data)

    try:
        with open(about_me, 'r') as f:
            about_me = f.read()
    except IOError:
        pass

    about_me = about_me.format(metadata=json.dumps(metadata, indent=4))

    object_storage_client.ensure_bucket_exists()

    # TODO: Some kind of structured information about transformation pipeline
    # in about me and/or info url
    client.upsert_featurebot(
        user_name, email, args.password, about_me, info_url)

    with listener.run():
        pass
def about_me_metadata(binary_data):
    dims = binary_data.arr.dimensions
    shape = list(binary_data.arr.shape)

    # The first time dimension should be displayed as variable since it
    # depends on the length of the audio input
    if isinstance(dims[0], zounds.TimeDimension):
        shape[0] = 'variable'

    metadata_dims = []
    for dim in dims:
        if isinstance(dim, zounds.TimeDimension):
            metadata_dims.append({
                'type': 'time',
                'sample_frequency_seconds': dim.frequency / zounds.Seconds(1),
                'sample_duration_seconds': dim.duration / zounds.Seconds(1)
            })
        elif isinstance(dim, zounds.FrequencyDimension):
            scale = dim.scale
            metadata_dims.append({
                'type': 'frequency',
                'start_hz': scale.start_hz,
                'stop_hz': scale.stop_hz,
                'n_bands': scale.n_bands,
                'scale_type': scale.__class__.__name__
            })
        else:
            metadata_dims.append({'type': 'identity'})

    return {
        'type': str(binary_data.arr.dtype),
        'shape': shape,
        'dimensions': metadata_dims
    }
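
# A minimal sketch of the metadata inference above, assuming binary_data
# simply wraps a zounds.ArrayWithUnits in its ``arr`` attribute; the
# namedtuple here is a hypothetical stand-in for that wrapper
from collections import namedtuple

import numpy as np
import zounds

BinaryData = namedtuple('BinaryData', 'arr')

sr = zounds.SR44100()
arr = zounds.ArrayWithUnits(
    np.zeros((100, 3), dtype=np.float32),
    [zounds.TimeDimension(frequency=sr.frequency, duration=sr.duration),
     zounds.IdentityDimension()])

metadata = about_me_metadata(BinaryData(arr))
# {'type': 'float32',
#  'shape': ['variable', 3],
#  'dimensions': [{'type': 'time', ...}, {'type': 'identity'}]}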
def time_generator():
    inp = torch.FloatTensor(
        args.time_generator_batch_size,
        feature_channels,
        feature_size).normal_(0, 1)

    total_audio_time = \
        (sr.frequency * total_samples * inp.shape[0]) / zounds.Seconds(1)

    start = time.time()
    bands = generator(inp)
    samples = frequency_recomposition(
        [b.data.cpu().numpy().squeeze() for b in bands.values()],
        total_samples)
    stop = time.time()
    wall_time = stop - start

    print(f'CPU Generated {total_audio_time} seconds '
          f'of audio in {wall_time} seconds')
    return samples
def load_and_play():
    # play back the most recently created batch of generated examples
    files = sorted(glob.glob('*.npy'), key=lambda x: os.stat(x).st_ctime)
    most_recent = files[-1]
    print('loading generated examples from', most_recent)
    results = np.load(most_recent)

    # synthesized = FrequencyDecomposition.synthesize_block(results)
    synthesized = results

    for raw, result in zip(results, synthesized):
        windowed = zounds.sliding_window(result, 512, 256)
        spec = np.abs(np.fft.rfft(windowed))
        audio_samples = zounds.AudioSamples(result, samplerate) \
            .pad_with_silence(zounds.Seconds(1))
        yield raw, result, audio_samples / audio_samples.max(), spec
def feature_hop_hz(self):
    return \
        zounds.Seconds(1) / (self.samplerate.frequency * self.feature_hop)
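
# The arithmetic above, spelled out with hypothetical values: at 44.1kHz
# with a hop of 512 samples, one hop lasts 512 / 44100 seconds, so hops
# occur at roughly 86Hz
import zounds

sr = zounds.SR44100()
hop = 512
hop_duration = sr.frequency * hop          # a numpy timedelta64
hop_hz = zounds.Seconds(1) / hop_duration  # timedelta / timedelta -> float
print(hop_hz)  # ~86.13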
def _get_code(self, sound, seconds):
    # self.frequency is the duration of one analysis window; dividing by
    # one second yields that duration as a plain float
    frequency = self.frequency / zounds.Seconds(1)
    offset = self.sound_offsets[sound]
    window_index = offset + int(seconds / frequency)
    return self.index[window_index]
args = parser.parse_args()

_id = Sound.process(meta=args.sound_uri)
snd = Sound(_id)

original = snd.resampled
slow = zounds.AudioSamples(time_stretch(original, 0.75).squeeze(), sr)
fast = zounds.AudioSamples(time_stretch(original, 1.25).squeeze(), sr)
higher = zounds.AudioSamples(pitch_shift(original, 1.0).squeeze(), sr)
lower = zounds.AudioSamples(pitch_shift(original, -1.0).squeeze(), sr)

# apply a sliding window to demonstrate time stretch and pitch shift in
# batch mode
windowing_sr = zounds.SampleRate(
    frequency=zounds.Seconds(5),
    duration=zounds.Seconds(10))

windowed = snd.resampled.sliding_window(windowing_sr)
windowed = zounds.ArrayWithUnits(
    windowed, [zounds.IdentityDimension(), windowed.dimensions[1]])


def samples(x):
    return zounds.AudioSamples(x, sr)


batch_slow = list(map(samples, time_stretch(windowed, 0.75)))
batch_fast = list(map(samples, time_stretch(windowed, 1.25)))
batch_higher = list(map(samples, pitch_shift(windowed, 1.0)))
batch_lower = list(map(samples, pitch_shift(windowed, -1.0)))
def feature_window_len(self):
    return \
        (self.samplerate.frequency * self.feature_window) / zounds.Seconds(1)
def training_sample_len(self):
    return (self.samplerate.frequency * self.training_sample_win) / \
        zounds.Seconds(1)
    loss,
    lambda model: Adam(model.parameters(), lr=0.0001),
    epochs=args.epochs,
    batch_size=batch_size,
    holdout_percent=0.25,
    data_preprocessor=data_preprocessor,
    label_preprocessor=label_preprocessor)

gen = (
    snd.windowed for snd in Sound
    if args.internet_archive_id in snd._id)
pipeline_cls.process(samples=gen, trainer=trainer)

# instantiate the trained pipeline
pipeline = pipeline_cls()

snds = [snd for snd in Sound if args.internet_archive_id in snd._id]
snd = choice(snds)
time_slice = zounds.TimeSlice(duration=zounds.Seconds(10))
encoded = pipeline.pipeline.transform(
    data_preprocessor(snd.windowed[time_slice]))
recon = encoded.inverse_transform()
samples = synthesize(recon)

# start up an in-browser REPL to interact with the results
app = zounds.ZoundsApp(
    model=Sound,
    audio_feature=Sound.ogg,
    visualization_feature=Sound.windowed,
    globals=globals(),
    locals=locals())
app.start(8888)
        code = urllib.parse.unquote(code)
        return Code(base64.urlsafe_b64decode(code))

    @classmethod
    def from_expanded_array(cls, arr):
        packed = arr.packbits(axis=1)
        for p in packed:
            yield Code(p)

    @classmethod
    def from_packed_array(cls, arr):
        for x in arr:
            yield Code(x)


ONE_SECOND = zounds.Seconds(1)


class WebTimeSlice(zounds.TimeSlice):
    def __init__(self, request_or_ts):
        if isinstance(request_or_ts, zounds.TimeSlice):
            ts = request_or_ts
            start = ts.start
            duration = ts.duration
        else:
            request = request_or_ts
            try:
                start = float(request.params['start'])
                start = zounds.Picoseconds(int(start * 1e12))
            except (KeyError, ValueError):
                start = zounds.Picoseconds(0)
def synthesize_iter(self):
    fa = self.as_frequency_adaptive()
    samples = self.__class__.synthesize_block(fa)
    for sample in samples:
        yield sample, zounds.AudioSamples(sample, samplerate) \
            .pad_with_silence(zounds.Seconds(1))
def listen(self):
    return zounds.AudioSamples(self.to_audio()[0], self.samplerate) \
        .pad_with_silence(zounds.Seconds(1))
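
# A quick check of the padding behavior assumed above: appending one second
# of silence to one second of noise doubles the total duration
import numpy as np
import zounds

sr = zounds.SR44100()
n = int(zounds.Seconds(1) / sr.frequency)  # samples in one second
noise = zounds.AudioSamples(np.random.uniform(-1, 1, n), sr)
padded = noise.pad_with_silence(zounds.Seconds(1))
print(noise.dimensions[0].end / zounds.Seconds(1))   # ~1.0
print(padded.dimensions[0].end / zounds.Seconds(1))  # ~2.0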
def judgement_hz(self):
    return [
        zounds.Seconds(1) / (self.samplerate.frequency * dsr)
        for dsr in self.downsampling_ratios]
        needs=Resampled.resampled,
        wscheme=zounds.HalfLapped(),
        wfunc=zounds.OggVorbisWindowingFunc(),
        store=True)

    mdct = zounds.ArrayWithUnitsFeature(
        zounds.MDCT,
        needs=windowed)

    weighted = zounds.ArrayWithUnitsFeature(
        lambda x: x * zounds.AWeighting(),
        needs=mdct)


if __name__ == '__main__':
    # produce some audio to test our pipeline, and encode it as FLAC
    synth = zounds.SineSynthesizer(zounds.SR44100())
    samples = synth.synthesize(zounds.Seconds(5), [220., 440., 880.])
    encoded = samples.encode(fmt='FLAC')

    # process the audio, and fetch features from our in-memory store
    _id = Sound.process(meta=encoded)
    sound = Sound(_id)

    # grab all the frequency information, for a subset of the duration
    start = zounds.Milliseconds(500)
    end = start + zounds.Seconds(2)
    snippet = sound.weighted[start:end, :]

    # grab a subset of frequency information for the duration of the sound
    freq_band = slice(zounds.Hertz(400), zounds.Hertz(500))
    a440 = sound.mdct[:, freq_band]
def total_duration(doc, ts):
    return doc.fake_hash.dimensions[0].end / zounds.Seconds(1)
app = zounds.ZoundsApp(globals=globals(), locals=locals())
app.start_in_thread(8888)

feature_size = 64

g = DDSPGenerator(
    feature_size, feature_channels, 128, None, None, None, None) \
    .to(device) \
    .initialize_weights()
g_optim = Adam(g.parameters(), lr=0.001, betas=(0, 0.9))

base_path = '/hdd/musicnet/train_data'
files = os.listdir(base_path)
file = choice(files)

samples = zounds.AudioSamples.from_file(
    os.path.join(base_path, file))[:zounds.Seconds(10)]
samples = zounds.soundfile.resample(samples, zounds.SR11025())

start = np.random.randint(0, len(samples) - 16384)
chunk = samples[start:start + 16384]
chunk /= (chunk.max() + 1e-12)
# chunk = spec_test[:16384].astype(np.float32)

orig = chunk.pad_with_silence()
target = torch.from_numpy(chunk).to(device).view(1, -1)
current = None

inp = compute_features(chunk)
inp = torch.from_numpy(inp).to(device)
cond = inp.data.cpu().numpy().squeeze().T
path = '/hdd/musicnet/train_data'
pattern = '*.wav'
total_samples = 2**17
samplerate = zounds.SR22050()
feature_spec = {'audio': (total_samples, 1)}
feature_funcs = {'audio': (audio, (samplerate,))}
batch_size = 1
bs = batch_stream(
    path, pattern, batch_size, feature_spec, 'audio', feature_funcs)

if __name__ == '__main__':
    # app = zounds.ZoundsApp(locals=locals(), globals=globals())
    # app.start_in_thread(9999)
    # samples, = next(bs)
    # samples = torch.from_numpy(samples)
    # min_size = 2 ** (np.log2(total_samples) - 4)
    # bands = fft_frequency_decompose(samples, min_size)
    # samples = zounds.AudioSamples(samples.squeeze(), samplerate)
    # input('Waiting...')

    n_bands = 5
    sr = samplerate
    for i in range(n_bands):
        start_hz = 0 if i == (n_bands - 1) else sr.nyquist / 2
        stop_hz = sr.nyquist
        n_samples = int(zounds.Seconds(1) / sr.frequency)
        print(n_samples, start_hz, stop_hz)
        sr *= 2
    dct = zounds.ArrayWithUnitsFeature(
        zounds.DCT,
        scale_always_even=True,
        needs=long_windowed,
        store=True)

    mdct = zounds.FrequencyAdaptiveFeature(
        zounds.FrequencyAdaptiveTransform,
        transform=scipy.fftpack.idct,
        scale=scale,
        needs=dct,
        store=True)


if __name__ == '__main__':
    # generate some audio
    synth = zounds.TickSynthesizer(zounds.SR22050())
    orig_audio = synth.synthesize(zounds.Seconds(5), zounds.Milliseconds(200))

    # analyze the audio
    _id = Document.process(meta=orig_audio.encode())
    doc = Document(_id)

    synth = zounds.FrequencyAdaptiveDCTSynthesizer(scale, samplerate)
    recon_audio = synth.synthesize(doc.mdct)

    # get a rasterized visualization of the representation
    img = doc.mdct.square(100, do_overlap_add=True)

    app = zounds.ZoundsApp(
        model=Document,
        audio_feature=Document.ogg,
        visualization_feature=Document.bark,
        globals=globals(),