def __init__(self, fs, hopsize_t):
    from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
    from madmom.audio.stft import ShortTimeFourierTransformProcessor
    from madmom.audio.filters import MelFilterbank
    from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                          LogarithmicSpectrogramProcessor)

    # define pre-processing chain
    sig = SignalProcessor(num_channels=1, sample_rate=fs)
    # note: the keyword is hop_size; 'hopsize' would be silently swallowed
    # by **kwargs and the default hop size used instead
    frames = FramedSignalProcessor(frame_size=2048,
                                   hop_size=int(fs * hopsize_t))
    stft = ShortTimeFourierTransformProcessor()  # caching FFT window
    filt = FilteredSpectrogramProcessor(filterbank=MelFilterbank,
                                        num_bands=80, fmin=27.5, fmax=16000,
                                        norm_filters=True,
                                        unique_filters=False)
    spec = LogarithmicSpectrogramProcessor(log=np.log, add=EPSILON)
    single = SequentialProcessor([frames, stft, filt, spec])
    pre_processor = SequentialProcessor([sig, single])
    super(MadmomMelbankProcessor, self).__init__([pre_processor])
def CreateProcesser(fps=100):
    # define pre-processing chain
    sig = SignalProcessor(num_channels=1, sample_rate=44100)
    # process the multi-resolution spec & diff in parallel
    multi = ParallelProcessor([])
    frame_sizes = [1024, 2048, 4096]
    num_bands = [3, 6, 12]
    # rename the loop variable so it does not shadow the num_bands list
    for frame_size, bands in zip(frame_sizes, num_bands):
        frames = FramedSignalProcessor(frame_size=frame_size, fps=fps)
        stft = ShortTimeFourierTransformProcessor()  # caching FFT window
        filt = FilteredSpectrogramProcessor(num_bands=bands, fmin=30,
                                            fmax=17000, norm_filters=True)
        spec = LogarithmicSpectrogramProcessor(mul=1, add=1)
        diff = SpectrogramDifferenceProcessor(diff_ratio=0.5,
                                              positive_diffs=True,
                                              stack_diffs=np.hstack)
        # process each frame size with spec and diff sequentially
        multi.append(SequentialProcessor((frames, stft, filt, spec, diff)))
    # stack the features and process everything sequentially
    pre_processor = SequentialProcessor((sig, multi, np.hstack))
    return pre_processor
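# Hedged usage sketch for the factory above (the madmom processor imports and
# numpy are assumed to be in scope; 'example.wav' is a placeholder path, not a
# file from this project): the returned SequentialProcessor is callable on an
# audio file and yields a stacked (num_frames, num_features) array.
pre_processor = CreateProcesser(fps=100)
features = pre_processor('example.wav')  # madmom processors are callable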
def __init__(self, spectrogram_path=None, version=1, test=False, dump=False,
             preprocessing=True, sample_rate=32000, silence_threshold=40):
    # ValueError is the appropriate exception for a bad argument value
    if version not in (1, 2):
        raise ValueError("version must be 1 or 2")
    self.version = version
    self.spectrogram_path = spectrogram_path
    self.sample_rate = sample_rate
    self.preprocessing = preprocessing
    self.test = test
    self.dump = dump
    self.silence_threshold = silence_threshold

    sig_proc = SignalProcessor(num_channels=1, sample_rate=self.sample_rate,
                               norm=True)
    fsig_proc = FramedSignalProcessor(frame_size=1024, hop_size=128,
                                      origin='future')
    spec_proc = SpectrogramProcessor(frame_size=1024)
    filt_proc = LogarithmicFilteredSpectrogramProcessor(
        filterbank=LogFilterbank, num_bands=26, fmin=20, fmax=14000)
    processor_pipeline = [sig_proc, fsig_proc, spec_proc, filt_proc]
    self.processor_version2 = SequentialProcessor(processor_pipeline)
def __init__(self, **kwargs):
    from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
    from madmom.audio.stft import ShortTimeFourierTransformProcessor
    from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                          LogarithmicSpectrogramProcessor,
                                          SpectrogramDifferenceProcessor)
    from madmom.processors import SequentialProcessor, ParallelProcessor

    # define pre-processing chain
    sig = SignalProcessor(num_channels=1, sample_rate=44100)
    # process the multi-resolution spec & diff in parallel
    multi = ParallelProcessor([])
    for frame_size in [4096]:
        frames = FramedSignalProcessor(frame_size=frame_size, fps=100)
        stft = ShortTimeFourierTransformProcessor(
            window=np.hamming(frame_size))  # caching FFT window
        filt = FilteredSpectrogramProcessor(num_bands=12, fmin=30,
                                            fmax=16000, norm_filters=True)
        spec = LogarithmicSpectrogramProcessor(mul=5, add=1)
        # diff = SpectrogramDifferenceProcessor(diff_ratio=0.5,
        #                                       positive_diffs=True,
        #                                       stack_diffs=np.hstack)
        # process each frame size with spec sequentially (diff disabled)
        multi.append(SequentialProcessor((frames, stft, filt, spec)))
        # multi.append(SequentialProcessor((frames, stft, filt)))
    # stack the features and process everything sequentially
    pre_processor = SequentialProcessor((sig, multi, np.hstack))
    super(PianoNoteProcessor, self).__init__(pre_processor)
def __init__(self, fs, hopsize_t):
    from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
    from madmom.audio.stft import ShortTimeFourierTransformProcessor
    from madmom.audio.filters import MelFilterbank
    from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                          LogarithmicSpectrogramProcessor)
    # from madmom.features.onsets import _cnn_onset_processor_pad

    # define pre-processing chain
    sig = SignalProcessor(num_channels=1, sample_rate=fs)
    # frame the signal (the keyword is hop_size, not hopsize)
    frames = FramedSignalProcessor(frame_size=2048,
                                   hop_size=int(fs * hopsize_t))
    stft = ShortTimeFourierTransformProcessor()  # caching FFT window
    filt = FilteredSpectrogramProcessor(filterbank=MelFilterbank,
                                        num_bands=80, fmin=27.5, fmax=16000,
                                        norm_filters=True,
                                        unique_filters=False)
    spec = LogarithmicSpectrogramProcessor(log=np.log, add=EPSILON)
    # process the framed signal with the filtered log spec sequentially
    single = SequentialProcessor([frames, stft, filt, spec])
    # pre-process everything sequentially
    pre_processor = SequentialProcessor([sig, single])
    # instantiate a SequentialProcessor
    super(MadmomMelbankProcessor, self).__init__([pre_processor])
def create_feature_extraction_pipeline(sr=44100,
                                       frame_sizes=(1024, 2048, 4096),
                                       fps_hz=100.):
    audio_loading = Pipeline([
        ("load_audio", FeatureExtractor(librosa.load, sr=sr, mono=True)),
        ("normalize", FeatureExtractor(librosa.util.normalize, norm=np.inf))
    ])

    sig = SignalProcessor(num_channels=1, sample_rate=sr)
    multi = ParallelProcessor([])
    for frame_size in frame_sizes:
        frames = FramedSignalProcessor(frame_size=frame_size, fps=fps_hz)
        stft = ShortTimeFourierTransformProcessor()  # caching FFT window
        filt = FilteredSpectrogramProcessor(filterbank=LogarithmicFilterbank,
                                            num_bands=12, fmin=30, fmax=17000,
                                            norm_filters=True,
                                            unique_filters=True)
        spec = LogarithmicSpectrogramProcessor(log=np.log10, mul=5, add=1)
        diff = SpectrogramDifferenceProcessor(diff_ratio=0.5,
                                              positive_diffs=True,
                                              stack_diffs=np.hstack)
        # process each frame size with spec and diff sequentially
        multi.append(SequentialProcessor([frames, stft, filt, spec, diff]))

    feature_extractor = FeatureExtractor(
        SequentialProcessor([sig, multi, np.hstack]))
    feature_extraction_pipeline = Pipeline([
        ("audio_loading", audio_loading),
        ("feature_extractor", feature_extractor)
    ])
    return feature_extraction_pipeline
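# Hedged usage sketch: FeatureExtractor is not defined in this snippet, so it
# is assumed to be a stateless sklearn-compatible transformer wrapper, and
# 'example.wav' is a placeholder path. Under those assumptions the composed
# pipeline would be driven roughly like this:
pipeline = create_feature_extraction_pipeline(sr=44100)
features = pipeline.fit_transform(["example.wav"])  # file list in, features out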
def spec_from_midi(midi_file):
    sig_proc = SignalProcessor(num_channels=1,
                               sample_rate=spec_params["sample_rate"])
    fsig_proc = FramedSignalProcessor(frame_size=spec_params["frame_size"],
                                      fps=spec_params["fps"])
    spec_proc = FilteredSpectrogramProcessor(filterbank=LogarithmicFilterbank,
                                             num_bands=12, fmin=60, fmax=6000,
                                             norm_filters=True,
                                             unique_filters=False)
    log_proc = LogarithmicSpectrogramProcessor()
    processor = SequentialProcessor([sig_proc, fsig_proc, spec_proc,
                                     log_proc])

    audio_path = midi_file.replace('.mid', '.wav')
    spec_path = midi_file.replace('.mid', '.spec.npy')
    # render the audio file from MIDI if it does not exist yet
    if not os.path.isfile(audio_path):
        render_audio(midi_file, sound_font=SOUND_FONT_PATH)
    # if the spectrogram does not exist yet, compute and store it
    if not os.path.isfile(spec_path):
        spec = processor.process(audio_path).T
        np.save(midi_file.replace('.mid', '.spec'), spec)
    else:
        spec = np.load(spec_path)
    return spec
def __init__(self, **kwargs):
    # pylint: disable=unused-argument
    from ..audio.signal import SignalProcessor, FramedSignalProcessor
    from ..audio.stft import ShortTimeFourierTransformProcessor
    from ..audio.spectrogram import (FilteredSpectrogramProcessor,
                                     LogarithmicSpectrogramProcessor,
                                     SpectrogramDifferenceProcessor)
    from ..models import NOTES_BRNN
    from ..ml.nn import NeuralNetwork

    # define pre-processing chain
    sig = SignalProcessor(num_channels=1, sample_rate=44100)
    # process the multi-resolution spec & diff in parallel
    multi = ParallelProcessor([])
    for frame_size in [1024, 2048, 4096]:
        frames = FramedSignalProcessor(frame_size=frame_size, fps=100)
        stft = ShortTimeFourierTransformProcessor()  # caching FFT window
        filt = FilteredSpectrogramProcessor(num_bands=12, fmin=30,
                                            fmax=17000, norm_filters=True)
        spec = LogarithmicSpectrogramProcessor(mul=5, add=1)
        diff = SpectrogramDifferenceProcessor(diff_ratio=0.5,
                                              positive_diffs=True,
                                              stack_diffs=np.hstack)
        # process each frame size with spec and diff sequentially
        multi.append(SequentialProcessor((frames, stft, filt, spec, diff)))
    # stack the features and process everything sequentially
    pre_processor = SequentialProcessor((sig, multi, np.hstack))
    # process the pre-processed signal with a NN
    nn = NeuralNetwork.load(NOTES_BRNN[0])
    # instantiate a SequentialProcessor
    super(RNNPianoNoteProcessor, self).__init__((pre_processor, nn))
def __init__(self, **kwargs):
    # pylint: disable=unused-argument
    from ..audio.signal import SignalProcessor, FramedSignalProcessor
    from ..audio.filters import MelFilterbank
    from ..audio.spectrogram import (FilteredSpectrogramProcessor,
                                     LogarithmicSpectrogramProcessor)
    from ..models import ONSETS_CNN
    from ..ml.nn import NeuralNetwork

    # define pre-processing chain
    sig = SignalProcessor(num_channels=1, sample_rate=44100)
    # process the multi-resolution spec in parallel
    multi = ParallelProcessor([])
    for frame_size in [2048, 1024, 4096]:
        frames = FramedSignalProcessor(frame_size=frame_size, fps=100)
        filt = FilteredSpectrogramProcessor(
            filterbank=MelFilterbank, num_bands=80, fmin=27.5, fmax=16000,
            norm_filters=True, unique_filters=False)
        spec = LogarithmicSpectrogramProcessor(log=np.log, add=EPSILON)
        # process each frame size with the filtered log spec sequentially
        multi.append(SequentialProcessor((frames, filt, spec)))
    # stack the features (in depth) and pad at beginning and end
    stack = np.dstack
    pad = _cnn_onset_processor_pad
    # pre-process everything sequentially
    pre_processor = SequentialProcessor((sig, multi, stack, pad))
    # process the pre-processed signal with a NN
    nn = NeuralNetwork.load(ONSETS_CNN[0])
    # instantiate a SequentialProcessor
    super(CNNOnsetProcessor, self).__init__((pre_processor, nn))
def __init__(self, fs, hopsize_t):
    from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
    from madmom.audio.stft import ShortTimeFourierTransformProcessor
    from madmom.audio.filters import MelFilterbank
    from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                          LogarithmicSpectrogramProcessor)
    # from madmom.features.onsets import _cnn_onset_processor_pad

    # define pre-processing chain
    sig = SignalProcessor(num_channels=1, sample_rate=fs)
    # process the multi-resolution spec in parallel
    multi = ParallelProcessor([])
    for frame_size in [2048, 1024, 4096]:
        frames = FramedSignalProcessor(frame_size=frame_size, fps=100)
        stft = ShortTimeFourierTransformProcessor()  # caching FFT window
        filt = FilteredSpectrogramProcessor(
            filterbank=MelFilterbank, num_bands=80, fmin=27.5, fmax=16000,
            norm_filters=True, unique_filters=False)
        spec = LogarithmicSpectrogramProcessor(log=np.log, add=EPSILON)
        # process each frame size with the filtered log spec sequentially
        multi.append(SequentialProcessor([frames, stft, filt, spec]))
    # stack the features (in depth)
    stack = np.dstack
    # pad = _cnn_onset_processor_pad
    # pre-process everything sequentially
    pre_processor = SequentialProcessor([sig, multi, stack])
    # instantiate a SequentialProcessor
    super(MadmomMelbank3ChannelsProcessor, self).__init__([pre_processor])
def extract(yt_id):
    beats = SequentialProcessor(
        [RNNBeatProcessor(), DBNBeatTrackingProcessor(fps=100)])
    chordrec = SequentialProcessor(
        [CNNChordFeatureProcessor(), CRFChordRecognitionProcessor()])
    # run beat tracking and chord recognition in parallel
    processMulti = ParallelProcessor([])
    processMulti.append(beats)
    processMulti.append(chordrec)
    beatSync = SequentialProcessor(
        [printTime, processMulti, printTime, arrange, printTime])
    return beatSync('tmp/' + yt_id + '.wav')
def __init__(self, online=False, **kwargs):
    # pylint: disable=unused-argument
    from ..audio.signal import SignalProcessor, FramedSignalProcessor
    from ..audio.stft import ShortTimeFourierTransformProcessor
    from ..audio.spectrogram import (FilteredSpectrogramProcessor,
                                     LogarithmicSpectrogramProcessor,
                                     SpectrogramDifferenceProcessor)
    from ..models import ONSETS_RNN, ONSETS_BRNN
    from ..ml.nn import NeuralNetworkEnsemble

    # choose the appropriate models and set frame sizes accordingly
    if online:
        origin = 'online'
        nn_files = ONSETS_RNN
        frame_sizes = [512, 1024, 2048]
    else:
        origin = 'offline'
        nn_files = ONSETS_BRNN
        frame_sizes = [1024, 2048, 4096]
    # define pre-processing chain
    sig = SignalProcessor(num_channels=1, sample_rate=44100)
    # process the multi-resolution spec & diff in parallel
    multi = ParallelProcessor([])
    for frame_size in frame_sizes:
        frames = FramedSignalProcessor(frame_size=frame_size, fps=100,
                                       origin=origin)
        stft = ShortTimeFourierTransformProcessor()  # caching FFT window
        filt = FilteredSpectrogramProcessor(num_bands=6, fmin=30, fmax=17000,
                                            norm_filters=True)
        spec = LogarithmicSpectrogramProcessor(mul=5, add=1)
        diff = SpectrogramDifferenceProcessor(diff_ratio=0.25,
                                              positive_diffs=True,
                                              stack_diffs=np.hstack)
        # process each frame size with spec and diff sequentially
        multi.append(SequentialProcessor((frames, stft, filt, spec, diff)))
    # stack the features and process everything sequentially
    pre_processor = SequentialProcessor((sig, multi, np.hstack))
    # process the pre-processed signal with a NN ensemble
    nn = NeuralNetworkEnsemble.load(nn_files, **kwargs)
    # instantiate a SequentialProcessor
    super(RNNOnsetProcessor, self).__init__((pre_processor, nn))
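# Hedged usage sketch via madmom's public API ('example.wav' is a placeholder
# path): RNNOnsetProcessor returns a frame-wise onset activation function,
# which a peak-picking processor converts into onset times in seconds.
from madmom.features.onsets import RNNOnsetProcessor, OnsetPeakPickingProcessor

act = RNNOnsetProcessor()('example.wav')          # onset activations at 100 fps
onsets = OnsetPeakPickingProcessor(fps=100)(act)  # onset times in seconds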
def _make_preprocessor(settings, pad):
    from madmom.audio.spectrogram import (
        LogarithmicFilteredSpectrogramProcessor,
        SpectrogramDifferenceProcessor)
    from madmom.audio.filters import LogarithmicFilterbank
    from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
    from madmom.audio.stft import ShortTimeFourierTransformProcessor
    from madmom.processors import SequentialProcessor

    sig = SignalProcessor(num_channels=1, sample_rate=settings['sample_rate'])
    frames = FramedSignalProcessor(frame_size=settings['frame_size'],
                                   fps=settings['fps'])
    stft = ShortTimeFourierTransformProcessor()  # caching FFT window
    spec = LogarithmicFilteredSpectrogramProcessor(
        num_channels=1, sample_rate=settings['sample_rate'],
        filterbank=LogarithmicFilterbank, frame_size=settings['frame_size'],
        fps=settings['fps'], num_bands=settings['num_bands'],
        fmin=settings['fmin'], fmax=settings['fmax'],
        norm_filters=settings['norm_filters'])

    # process input data
    if settings['diff']:
        stack = (_crnn_drum_processor_stack if settings.get('pad')
                 else np.hstack)
        diff = SpectrogramDifferenceProcessor(diff_ratio=0.5,
                                              positive_diffs=True,
                                              stack_diffs=stack)
        processors = (sig, frames, stft, spec, diff)
    else:
        processors = (sig, frames, stft, spec)
    # optionally pad the data at the beginning and end
    if pad > 0:
        processors += (PadProcessor(pad),)
    return SequentialProcessor(processors)
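# PadProcessor and _crnn_drum_processor_stack are referenced above but not
# defined in this snippet. A minimal sketch of what such a frame-padding
# processor could look like (an assumption, not the original implementation):
import numpy as np
from madmom.processors import Processor


class PadProcessor(Processor):
    """Pad the data with `pad` frames of zeros at the beginning and end."""

    def __init__(self, pad):
        self.pad = int(pad)

    def process(self, data, **kwargs):
        # zero-pad along the frame (first) axis
        zeros = np.zeros((self.pad,) + data.shape[1:], dtype=data.dtype)
        return np.concatenate((zeros, data, zeros), axis=0)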
def __init__(self, hparams, dataset: FreeSoundAudioDataset):
    super(MadmomFeatureIteratorV2, self).__init__(hparams, dataset)
    if not isinstance(dataset, FreeSoundAudioDataset):
        raise AssertionError("dataset should be FreeSoundAudioDataset")

    sig_proc = SignalProcessor(num_channels=1, sample_rate=32000, norm=True)
    fsig_proc = FramedSignalProcessor(frame_size=1024, hop_size=128,
                                      origin='future')
    spec_proc = SpectrogramProcessor(frame_size=1024)
    filt_proc = LogarithmicFilteredSpectrogramProcessor(
        filterbank=LogFilterbank, num_bands=26, fmin=20, fmax=14000)
    processor_pipeline2 = [sig_proc, fsig_proc, spec_proc, filt_proc]
    self.processor_version2 = SequentialProcessor(processor_pipeline2)
def get_chords_processor():
    print('START CHORDS PROCESSOR >> ', str(datetime.now()))
    from madmom.features.chords import (CNNChordFeatureProcessor,
                                        CRFChordRecognitionProcessor)
    from madmom.processors import SequentialProcessor
    print('CHORDS PROCESSOR >> ', str(datetime.now()))
    return SequentialProcessor(
        [CNNChordFeatureProcessor(), CRFChordRecognitionProcessor()])
def get_beat_processor():
    print('START BEAT PROCESSOR >> ', str(datetime.now()))
    from madmom.features.beats import (RNNBeatProcessor,
                                       DBNBeatTrackingProcessor)
    from madmom.processors import SequentialProcessor
    print('BEAT PROCESSOR >> ', str(datetime.now()))
    return SequentialProcessor(
        [RNNBeatProcessor(), DBNBeatTrackingProcessor(fps=100)])
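# Hedged usage sketch for the two factories above ('example.wav' is a
# placeholder path): the beat processor yields beat times in seconds, the
# chord processor yields (start, end, label) chord segments.
beats = get_beat_processor()('example.wav')
chords = get_chords_processor()('example.wav')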
class MadmomFeatureIteratorV2(FreeSoundDataIteratorBase):
    """
    Custom feature extraction using a Madmom library pipeline
    Reference: https://github.com/CPJKU/dcase_task2/blob/master/dcase_task2/prepare_spectrograms.py
    """

    def __init__(self, hparams, dataset: FreeSoundAudioDataset):
        super(MadmomFeatureIteratorV2, self).__init__(hparams, dataset)
        if not isinstance(dataset, FreeSoundAudioDataset):
            raise AssertionError("dataset should be FreeSoundAudioDataset")

        sig_proc = SignalProcessor(num_channels=1, sample_rate=32000,
                                   norm=True)
        fsig_proc = FramedSignalProcessor(frame_size=1024, hop_size=128,
                                          origin='future')
        spec_proc = SpectrogramProcessor(frame_size=1024)
        filt_proc = LogarithmicFilteredSpectrogramProcessor(
            filterbank=LogFilterbank, num_bands=26, fmin=20, fmax=14000)
        processor_pipeline2 = [sig_proc, fsig_proc, spec_proc, filt_proc]
        self.processor_version2 = SequentialProcessor(processor_pipeline2)

    @overrides
    def _user_map_func(self, file_path, label):
        """
        Maps an audio file into features and its label into a one-hot vector
        :param file_path:
        :param label:
        :return:
        """
        data = self.processor_version2.process(file_path)
        label = self._dataset.get_one_hot_encoded(label)
        return data, label

    @overrides
    def _user_resize_func(self, data, label):
        """
        Sets up the tensor shapes after the `tf.py_func` call
        :param data:
        :param label:
        :return:
        """
        data = tf.reshape(data, shape=[128, 33])
        label = tf.reshape(label, shape=[42])
        return data, label
def spectrogram_processor(spec_params):
    """Helper function for our spectrogram extraction."""
    sig_proc = SignalProcessor(num_channels=1,
                               sample_rate=spec_params['sample_rate'])
    fsig_proc = FramedSignalProcessor(frame_size=spec_params['frame_size'],
                                      fps=spec_params['fps'])
    spec_proc = FilteredSpectrogramProcessor(filterbank=LogarithmicFilterbank,
                                             num_bands=12, fmin=60, fmax=6000,
                                             norm_filters=True,
                                             unique_filters=False)
    log_proc = LogarithmicSpectrogramProcessor()
    processor = SequentialProcessor([sig_proc, fsig_proc, spec_proc,
                                     log_proc])
    return processor
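# Hedged usage sketch (the parameter values below are illustrative
# assumptions; 'example.wav' is a placeholder path):
spec_params = {'sample_rate': 44100, 'frame_size': 2048, 'fps': 100}
proc = spectrogram_processor(spec_params)
spec = proc.process('example.wav')  # (num_frames, num_bins) log spectrogram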
def build_cnn(madmom_processor_filename):
    from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
    from madmom.audio.stft import ShortTimeFourierTransformProcessor
    from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                          LogarithmicSpectrogramProcessor)
    from madmom.ml.nn import NeuralNetworkEnsemble

    # define pre-processing chain
    sig = SignalProcessor(num_channels=1, sample_rate=44100)
    frames = FramedSignalProcessor(frame_size=4096, hop_size=441 * 2)
    stft = ShortTimeFourierTransformProcessor()  # caching FFT window
    # this is the money param! it was not whitelisted in
    # 'canonicalize_audio_options'!
    filt = FilteredSpectrogramProcessor(num_bands=24, fmin=30, fmax=10000)
    spec = LogarithmicSpectrogramProcessor(add=1)
    # pre-process everything sequentially
    pre_processor = SequentialProcessor(
        [sig, frames, stft, filt, spec, _cnn_pad])
    # process the pre-processed signal with a NN
    nn = NeuralNetworkEnsemble.load([madmom_processor_filename])
    return madmom.processors.SequentialProcessor([pre_processor, nn])
def __init__(self, sr=44100, **kwargs):
    from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
    from madmom.audio.stft import ShortTimeFourierTransformProcessor
    from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                          LogarithmicSpectrogramProcessor)
    from madmom.ml.nn import NeuralNetworkEnsemble

    sr_ratio = 44100 / sr
    # define pre-processing chain
    sig = SignalProcessor(num_channels=1, sample_rate=sr)
    # cast frame_size to int: floor division by the float sr_ratio yields a
    # float otherwise
    frames = FramedSignalProcessor(frame_size=int(4096 // sr_ratio),
                                   fps=50 // sr_ratio)
    stft = ShortTimeFourierTransformProcessor()  # caching FFT window
    filt = FilteredSpectrogramProcessor(num_bands=24, fmin=30, fmax=10000)
    spec = LogarithmicSpectrogramProcessor(add=1)
    # pre-process everything sequentially
    pre_processor = SequentialProcessor(
        (sig, frames, stft, filt, spec, _cnn_pad))
    # process the pre-processed signal with a NN
    nn = NeuralNetworkEnsemble.load(VIENNA_MODEL_PATH)
    # instantiate a SequentialProcessor
    super().__init__((pre_processor, nn))
    self.adsr = ADSRMaestro()
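# _cnn_pad is referenced in the two builders above but not defined in these
# snippets. A minimal sketch of a typical CNN context-padding helper (an
# assumption, not the original implementation): repeat the first and last
# frame so a network with a fixed context window emits one output per frame.
def _cnn_pad(data, pad_frames=2):
    """Pad the data by repeating the first and last frame `pad_frames` times."""
    pad_start = np.repeat(data[:1], pad_frames, axis=0)
    pad_stop = np.repeat(data[-1:], pad_frames, axis=0)
    return np.concatenate((pad_start, data, pad_stop), axis=0)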
class DcasePredictorProvider(PredictorContract):
    """
    Implementation of a PredictorContract. This class makes predictions:
    spectrograms serve as input to a convolutional neural network that
    produces class probabilities.

    Attributes
    ----------
    sig_proc : madmom.Processor
        processor which outputs sampled audio signals
    fsig_proc : madmom.Processor
        processor which produces overlapping frames based on sampled signals
    spec_proc : madmom.Processor
        processor which computes a spectrogram with STFT based on framed
        signals
    filt_proc : madmom.Processor
        processor which filters and scales a spectrogram
    processorPipeline : SequentialProcessor
        pipeline of elements of type madmom.Processor
    classes : list of str
        class list
    device : str
        indicates the processor to be used for neural network prediction
    prediction_model : baseline_net.Net
        holds a reference to the CNN architecture
    sliding_window : 2d numpy array
        cache for previously calculated spectrograms
    lastProceededGroundTruth : int
        variable to keep track of the last processed audio chunk
    slidingWindowThread : reference pointing to the sliding window thread
    predictionThread : reference pointing to the prediction thread

    Methods
    -------
    start()
        starts all necessary sub tasks of this predictor.
    stop()
        stops all necessary sub tasks of this predictor.
    computeSpectrogram()
        computes a spectrogram based on the most current audio chunk.
    predict()
        CNN prediction based on the current spectrogram input.
    """

    # madmom pipeline for spectrogram calculation
    sig_proc = SignalProcessor(num_channels=1, sample_rate=32000, norm=True)
    fsig_proc = FramedSignalProcessor(frame_size=1024, hop_size=128,
                                      origin='future')
    spec_proc = SpectrogramProcessor(frame_size=1024)
    filt_proc = LogarithmicFilteredSpectrogramProcessor(
        filterbank=LogFilterbank, num_bands=26, fmin=20, fmax=14000)
    processorPipeline = SequentialProcessor(
        [sig_proc, fsig_proc, spec_proc, filt_proc])

    classes = ["Acoustic_guitar", "Applause", "Bark", "Bass_drum",
               "Burping_or_eructation", "Bus", "Cello", "Chime", "Clarinet",
               "Computer_keyboard", "Cough", "Cowbell", "Double_bass",
               "Drawer_open_or_close", "Electric_piano", "Fart",
               "Finger_snapping", "Fireworks", "Flute", "Glockenspiel",
               "Gong", "Gunshot_or_gunfire", "Harmonica", "Hi-hat",
               "Keys_jangling", "Knock", "Laughter", "Meow",
               "Microwave_oven", "Oboe", "Saxophone", "Scissors", "Shatter",
               "Snare_drum", "Squeak", "Tambourine", "Tearing", "Telephone",
               "Trumpet", "Violin_or_fiddle", "Writing"]

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    def __init__(self, condition):
        """
        Parameters
        ----------
        condition : object
            synchronization object shared with the worker threads
        """
        # load model with its tuned weight parameters
        self.prediction_model = Net()
        self.prediction_model.load_state_dict(
            torch.load(os.path.join(
                PROJECT_ROOT,
                'server/consumer/predictors/dcase_predictor_provider/baseline_net.pt'),
                map_location=lambda storage, location: storage))
        self.prediction_model.to(self.device)
        self.prediction_model.eval()

        # sliding window as cache
        self.sliding_window = np.zeros((128, 256), dtype=np.float32)
        self.lastProceededGroundTruth = None
        self.condition = condition

    def start(self):
        """Starts all sub tasks necessary for continuous prediction."""
        self.slidingWindowThread = SlidingWindowThread(self)
        self.predictionThread = PredictionThread(self)
        self.slidingWindowThread.start()
        self.predictionThread.start()

    def stop(self):
        """Stops all sub tasks."""
        self.slidingWindowThread.join()
        self.predictionThread.join()

    def computeSpectrogram(self):
        """This method first accesses the global time variable
        ``tGroundTruth`` and reads the audio chunk the time variable points
        to. Afterwards, the defined madmom pipeline is applied to get the
        spectrogram representation of the single chunk. Finally, the sliding
        window is updated with the new audio chunk.
        """
        t = self.manager.tGroundTruth
        # if this thread is faster than the producer, do not consume the
        # same chunk multiple times
        if t != self.lastProceededGroundTruth:
            # modulo avoids index under/overflow
            frame = self.manager.sharedMemory[(t - 1) % BUFFER_SIZE]
            # np.fromstring is deprecated; frombuffer reads the raw bytes
            frame = np.frombuffer(frame, np.int16)
            spectrogram = self.processorPipeline.process(frame)
            frame = spectrogram[0]
            if np.any(np.isnan(frame)):
                frame = np.zeros_like(frame, dtype=np.float32)
            # update sliding window
            self.sliding_window[:, 0:-1] = self.sliding_window[:, 1:]
            self.sliding_window[:, -1] = frame
            self.lastProceededGroundTruth = t

    def predict(self):
        """Executes the actual prediction task based on the currently
        available sliding window. The sliding window is fed into the CNN
        model and the corresponding softmax outputs for the respective
        classes are returned.

        Returns
        -------
        probs : array of list objects
            an array with one entry per class, where each entry consists of
            the class name, its predicted probability and a position index.
            Example: ``[["class1", 0.0006955251446925104, 0],
            ["class2", 0.0032770668622106314, 1], ...]``
        """
        model_input = self.sliding_window[np.newaxis, np.newaxis]
        cuda_torch_input = torch.from_numpy(model_input).to(self.device)
        model_output = self.prediction_model(cuda_torch_input)
        softmax = nn.Softmax(dim=1)
        softmax_output = softmax(model_output)
        predicts = softmax_output.cpu().detach().numpy().flatten()
        probs = [[elem, predicts[index].item(), index]
                 for index, elem in enumerate(self.classes)]
        return probs
        # keep spectrogram
        spectrograms.append(np.asarray(spectrogram))

    spectrograms = np.asarray(spectrograms)
    return spectrograms


processor_version1 = LibrosaProcessor()

sig_proc = SignalProcessor(num_channels=1, sample_rate=32000, norm=True)
fsig_proc = FramedSignalProcessor(frame_size=1024, hop_size=128,
                                  origin='future')
spec_proc = SpectrogramProcessor(frame_size=1024)
filt_proc = LogarithmicFilteredSpectrogramProcessor(filterbank=LogFilterbank,
                                                    num_bands=26, fmin=20,
                                                    fmax=14000)
processor_pipeline2 = [sig_proc, fsig_proc, spec_proc, filt_proc]
processor_version2 = SequentialProcessor(processor_pipeline2)


if __name__ == "__main__":
    """ main """
    # add argument parser
    parser = argparse.ArgumentParser(
        description='Pre-compute spectrograms for training and testing.')
    parser.add_argument('--audio_path', help='path to audio files.')
    parser.add_argument('--spec_path',
                        help='path where to store spectrograms.')
    parser.add_argument('--show', help='show spectrogram plots.', type=int,
                        default=None)
    parser.add_argument('--dump', help='dump spectrograms.',
                        action='store_true')
    parser.add_argument('--spec_version',
                        help='spectrogram version to compute (1 or 2).',
                        type=int, default=1)
    parser.add_argument('--no_preprocessing',
                        help='compute spectrogram for original audios.',
                        action='store_true')
    args = parser.parse_args()
class MadmomSpectrogramProvider(VisualisationContract):
    """
    Implementation of a VisualisationContract. This class computes new
    spectrograms based on the most current audio chunk, which is indicated
    via ``tGroundTruth``.

    Attributes
    ----------
    sig_proc : madmom.Processor
        processor which outputs sampled audio signals
    fsig_proc : madmom.Processor
        processor which produces overlapping frames based on sampled signals
    spec_proc : madmom.Processor
        processor which computes a spectrogram with STFT based on framed
        signals
    filt_proc : madmom.Processor
        processor which filters and scales a spectrogram
    processorPipeline : SequentialProcessor
        pipeline of elements of type madmom.Processor
    sliding_window : 2d numpy array
        cache for previously calculated spectrograms
    lastProceededGroundTruth : int
        variable to keep track of the last processed audio chunk
    visThread : reference pointing to the sliding window thread

    Methods
    -------
    start()
        starts all necessary sub tasks of this visualizer.
    stop()
        stops all necessary sub tasks of this visualizer.
    computeSpectrogram()
        computes a spectrogram based on the most current audio chunk.
    """

    # madmom pipeline for spectrogram calculation
    sig_proc = SignalProcessor(num_channels=1, sample_rate=32000, norm=True)
    fsig_proc = FramedSignalProcessor(frame_size=1024, hop_size=128,
                                      origin='future')
    spec_proc = SpectrogramProcessor(frame_size=1024)
    filt_proc = LogarithmicFilteredSpectrogramProcessor(
        filterbank=LogFilterbank, num_bands=26, fmin=20, fmax=14000)
    processorPipeline = SequentialProcessor(
        [sig_proc, fsig_proc, spec_proc, filt_proc])

    def __init__(self, condition):
        """
        Parameters
        ----------
        condition : object
            synchronization object shared with the visualisation thread
        """
        # sliding window as cache
        self.sliding_window = np.zeros((128, 256), dtype=np.float32)
        self.lastProceededGroundTruth = None
        self.condition = condition

    def start(self):
        """Starts all sub tasks necessary for continuous spectrograms."""
        self.visThread = VisualisationThread(self)
        self.visThread.start()

    def stop(self):
        """Stops all sub tasks."""
        self.visThread.join()

    def computeSpectrogram(self):
        """This method first accesses the global time variable
        ``tGroundTruth`` and reads the audio chunk the time variable points
        to. Afterwards, the defined madmom pipeline is applied to get the
        spectrogram representation of the single chunk. Finally, the sliding
        window is updated with the new audio chunk and a copy of the sliding
        window is returned to the calling thread.

        Returns
        -------
        sliding_window : 2d numpy array of float values
            a copy of the current sliding window spectrogram
        """
        # if this thread is faster than the producer, do not consume the
        # same chunk multiple times
        t = self.manager.tGroundTruth
        if t != self.lastProceededGroundTruth:
            # modulo avoids index under/overflow
            frame = self.manager.sharedMemory[(t - 1) % BUFFER_SIZE]
            # np.fromstring is deprecated; frombuffer reads the raw bytes
            frame = np.frombuffer(frame, np.int16)
            spectrogram = self.processorPipeline.process(frame)
            frame = spectrogram[0]
            if np.any(np.isnan(frame)):
                frame = np.zeros_like(frame, dtype=np.float32)
            # update sliding window
            self.sliding_window[:, 0:-1] = self.sliding_window[:, 1:]
            self.sliding_window[:, -1] = frame
            self.lastProceededGroundTruth = t
        return self.sliding_window.copy()
from madmom.processors import SequentialProcessor

# init signal processing
SAMPLE_RATE = 22050
FRAME_SIZE = 2048
FPS = 20

sig_proc = SignalProcessor(num_channels=1, sample_rate=SAMPLE_RATE)
fsig_proc = FramedSignalProcessor(frame_size=FRAME_SIZE, fps=FPS,
                                  origin='future')
spec_proc = FilteredSpectrogramProcessor(LogarithmicFilterbank, num_bands=16,
                                         fmin=30, fmax=6000)
# alternative setting: num_bands=24, fmin=30, fmax=8000
log_spec_proc = LogarithmicSpectrogramProcessor()
processor = SequentialProcessor(
    [sig_proc, fsig_proc, spec_proc, log_spec_proc])

colors = ['c', 'm', 'y']


def notes_to_onsets(notes, dt):
    """Convert a sequence of keys to onset frames."""
    onsets = []
    for n in notes:
        onset = int(np.ceil(n[0] / dt))
        onsets.append(onset)
    return np.sort(np.asarray(onsets)).astype(np.float32)
class Preprocessor():

    def __init__(self, spectrogram_path=None, version=1, test=False,
                 dump=False, preprocessing=True, sample_rate=32000,
                 silence_threshold=40):
        # ValueError is the appropriate exception for a bad argument value
        if version not in (1, 2):
            raise ValueError("version must be 1 or 2")
        self.version = version
        self.spectrogram_path = spectrogram_path
        self.sample_rate = sample_rate
        self.preprocessing = preprocessing
        self.test = test
        self.dump = dump
        self.silence_threshold = silence_threshold

        sig_proc = SignalProcessor(num_channels=1,
                                   sample_rate=self.sample_rate, norm=True)
        fsig_proc = FramedSignalProcessor(frame_size=1024, hop_size=128,
                                          origin='future')
        spec_proc = SpectrogramProcessor(frame_size=1024)
        filt_proc = LogarithmicFilteredSpectrogramProcessor(
            filterbank=LogFilterbank, num_bands=26, fmin=20, fmax=14000)
        processor_pipeline = [sig_proc, fsig_proc, spec_proc, filt_proc]
        self.processor_version2 = SequentialProcessor(processor_pipeline)

    def __spectrogram_V1(self, signal, fft_window_size, hop_length,
                         log_spectrogram, n_mels, fmax):
        # compute stft
        stft = librosa.stft(signal, n_fft=fft_window_size,
                            hop_length=hop_length, win_length=None,
                            window='hann', center=True, pad_mode='reflect')
        # keep only the magnitude
        stft = np.abs(stft)
        # spectrogram weighting
        if log_spectrogram:
            stft = np.log10(stft + 1)
        else:
            freqs = librosa.core.fft_frequencies(sr=self.sample_rate,
                                                 n_fft=fft_window_size)
            stft = librosa.perceptual_weighting(stft**2, freqs, ref=1.0,
                                                amin=1e-10, top_db=99.0)
        # apply mel filterbank
        spectrogram = librosa.feature.melspectrogram(S=stft,
                                                     sr=self.sample_rate,
                                                     n_mels=n_mels, fmax=fmax)
        return np.asarray(spectrogram)

    def __spectrogram_V2(self, signal):
        return self.processor_version2.process(signal)

    def normalize_and_trim_silence(self, signal):
        # normalize, then trim silence at beginning and end
        signal_normalized = librosa.util.normalize(signal, norm=100)
        signal_normalized, index = librosa.effects.trim(
            signal_normalized, top_db=self.silence_threshold)
        return signal_normalized

    def compute_spectrogram(self, signal, file_name=None):
        if self.dump and file_name is None:
            raise ValueError("A file_name must be specified")
        if self.preprocessing:
            signal = self.normalize_and_trim_silence(signal)
        if self.version == 1:
            spectrogram = self.__spectrogram_V1(signal, fft_window_size=1024,
                                                hop_length=192,
                                                log_spectrogram=False,
                                                n_mels=128, fmax=None)
        else:
            spectrogram = self.__spectrogram_V2(signal)
            spectrogram = np.swapaxes(spectrogram, 0, 1)

        # plot spectrogram
        if self.test:
            print("Spectrogram Shape:", spectrogram.shape)
            plt.figure("General-Purpose")
            plt.clf()
            plt.subplots_adjust(right=0.98, left=0.1, bottom=0.1, top=0.99)
            plt.imshow(spectrogram, origin="lower", interpolation="nearest",
                       cmap="viridis")
            plt.xlabel("%d frames" % spectrogram.shape[1])
            plt.ylabel("%d bins" % spectrogram.shape[0])
            plt.colorbar()
            plt.show(block=True)

        if self.dump:
            # save spectrogram
            if not os.path.exists(self.spectrogram_path):
                os.makedirs(self.spectrogram_path)
            spec_file = os.path.join(self.spectrogram_path, file_name)
            np.save(spec_file, spectrogram)

        return spectrogram

    def set_version(self, version):
        self.version = version

    def set_spectrogram_path(self, spectrogram_path):
        self.spectrogram_path = spectrogram_path
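# Hedged usage sketch for the Preprocessor above (librosa is assumed to be
# importable; 'example.wav' is a placeholder path): load a signal at the
# pipeline's sample rate and compute the version-2 (madmom) spectrogram.
if __name__ == "__main__":
    signal, _ = librosa.load('example.wav', sr=32000, mono=True)
    pre = Preprocessor(version=2, preprocessing=True, sample_rate=32000)
    spec = pre.compute_spectrogram(signal)
    print(spec.shape)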