def _process_annotation(self, annotation):
    """
    Fetch the FFT data referenced by *annotation*, derive a chroma feature
    from it, push the packed feature to S3 and register a new annotation
    pointing at the stored data.
    """
    # download and unpack the FFT feature referenced by the annotation
    response = requests.get(annotation['data_url'])
    fft_feature = BinaryData.unpack(response.content)

    # project the FFT frames onto the chroma scale, keeping the original
    # time dimension and replacing frequency with an identity dimension
    raw_chroma = CHROMA_SCALE.apply(fft_feature, zounds.HanningWindowingFunc())
    chroma = zounds.ArrayWithUnits(
        raw_chroma,
        [fft_feature.dimensions[0], zounds.IdentityDimension()]).astype(self.dtype)

    # pack the chroma data and create the resources
    packed = BinaryData(chroma)
    sound_id = self._sound_id_from_uri(annotation['sound'])

    # push output to s3
    data_url = self.s3_client.put_object(
        sound_id,
        packed.packed_file_like_object(),
        'application/octet-stream')
    logger.info(f'pushed binary data to {data_url}')

    # create annotation
    self.client.create_annotations(
        sound_id,
        {
            'start_seconds': annotation['start_seconds'],
            'duration_seconds': annotation['duration_seconds'],
            'data_url': data_url
        })
    logger.info('created annotation')
def produce_fake_hash(x):
    """
    Produce random, binary features, totally irrespective of the content
    of x, but in the same shape as x.
    """
    n_frames = x.shape[0]
    # one frame of 1024 random bits per input frame
    bits = np.random.binomial(1, 0.5, (n_frames, 1024))
    # pack each frame's bits into 64-bit words
    codes = np.packbits(bits, axis=-1).view(np.uint64)
    return zounds.ArrayWithUnits(
        codes, [x.dimensions[0], zounds.IdentityDimension()])
def compute_embedding(samples, network):
    """
    Resample *samples* to 11025 Hz, slice them into non-overlapping
    8192-sample windows, and run *network* over the windows, returning
    the embeddings with the windows' time dimension attached.
    """
    # TODO: resampling can fail for some odd sampling rates
    samples = zounds.soundfile.resample(samples, zounds.SR11025())

    # non-overlapping windows: duration equals the hop frequency
    window_freq = samples.frequency * 8192
    windowed = samples.sliding_window(
        zounds.SampleRate(frequency=window_freq, duration=window_freq))
    time_dim = windowed.dimensions[0]

    embeddings = zounds.learn.apply_network(network, windowed, chunksize=8)
    logger.info(embeddings.shape)
    return zounds.ArrayWithUnits(
        embeddings, [time_dim, zounds.IdentityDimension()])
def _process_annotation(self, annotation):
    """
    Fetch the FFT data referenced by *annotation*, compute MFCC features
    from it, push the packed feature to S3 and register a new annotation
    pointing at the stored data.
    """
    # download and unpack the FFT feature referenced by the annotation
    response = requests.get(annotation['data_url'])
    fft_feature = BinaryData.unpack(response.content)

    # mel spectrogram: project the FFT frames onto the mel scale, then
    # compress the dynamic range with a log
    mel = scale.apply(fft_feature, zounds.HanningWindowingFunc())
    mel = zounds.ArrayWithUnits(
        mel, [fft_feature.dimensions[0], zounds.FrequencyDimension(scale)])
    log_mel = 20 * np.log10(mel + 1)

    # MFCCs: cepstral coefficients 1-13 via a DCT along the frequency axis
    # (coefficient 0 is discarded)
    mfcc = np.abs(dct(log_mel, axis=1)[:, 1:14])
    mfcc = zounds.ArrayWithUnits(
        mfcc,
        [fft_feature.dimensions[0], zounds.IdentityDimension()]).astype(np.float32)

    # pack the MFCC data and create the resources
    packed = BinaryData(mfcc)
    sound_id = self._sound_id_from_uri(annotation['sound'])

    # push output to s3
    data_url = self.s3_client.put_object(
        sound_id,
        packed.packed_file_like_object(),
        'application/octet-stream')
    logger.info(f'pushed binary data to {data_url}')

    # create annotation
    self.client.create_annotations(
        sound_id,
        {
            'start_seconds': annotation['start_seconds'],
            'duration_seconds': annotation['duration_seconds'],
            'data_url': data_url
        })
    logger.info('created annotation')
original = snd.resampled slow = zounds.AudioSamples(time_stretch(original, 0.75).squeeze(), sr) fast = zounds.AudioSamples(time_stretch(original, 1.25).squeeze(), sr) higher = zounds.AudioSamples(pitch_shift(original, 1.0).squeeze(), sr) lower = zounds.AudioSamples(pitch_shift(original, -1.0).squeeze(), sr) # apply a sliding window to demonstrate time stretch and pitch shift in # batch mode windowing_sr = zounds.SampleRate(frequency=zounds.Seconds(5), duration=zounds.Seconds(10)) windowed = snd.resampled.sliding_window(windowing_sr) windowed = zounds.ArrayWithUnits( windowed, [zounds.IdentityDimension(), windowed.dimensions[1]]) def samples(x): return zounds.AudioSamples(x, sr) batch_slow = map(samples, time_stretch(windowed, 0.75)) batch_fast = map(samples, time_stretch(windowed, 1.25)) batch_higher = map(samples, pitch_shift(windowed, 1.0)) batch_lower = map(samples, pitch_shift(windowed, -1.0)) app = zounds.ZoundsApp(model=Sound, visualization_feature=Sound.fft, audio_feature=Sound.resampled, globals=globals(), locals=locals(),