def __init__(self, fs, hopsize_t):
    """Assemble the mono log-mel pre-processing chain.

    The chain is: signal loading -> framing (frame size 2048) -> STFT ->
    80-band mel filterbank (``fmin=27.5``, ``fmax=16000``, normalised
    filters) -> logarithmic scaling with ``np.log`` and an ``EPSILON``
    offset.

    Parameters
    ----------
    fs : int or float
        Sample rate passed to the ``SignalProcessor``.
    hopsize_t : float
        Hop size relative to ``fs``; ``int(fs * hopsize_t)`` is used as
        the hop size of the framing stage (presumably seconds -- confirm
        against callers).
    """
    from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
    from madmom.audio.stft import ShortTimeFourierTransformProcessor
    from madmom.audio.filters import MelFilterbank
    from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                          LogarithmicSpectrogramProcessor)

    # define pre-processing chain
    sig = SignalProcessor(num_channels=1, sample_rate=fs)
    # madmom's keyword is ``hop_size``; the former ``hopsize`` spelling is
    # not a FramedSignalProcessor parameter, so the requested hop never
    # reached the framing stage and the library default was used instead.
    frames = FramedSignalProcessor(frame_size=2048,
                                   hop_size=int(fs * hopsize_t))
    stft = ShortTimeFourierTransformProcessor()  # caching FFT window
    filt = FilteredSpectrogramProcessor(filterbank=MelFilterbank,
                                        num_bands=80, fmin=27.5, fmax=16000,
                                        norm_filters=True,
                                        unique_filters=False)
    spec = LogarithmicSpectrogramProcessor(log=np.log, add=EPSILON)

    # framing, STFT, filtering and log scaling run one after another
    single = SequentialProcessor([frames, stft, filt, spec])
    pre_processor = SequentialProcessor([sig, single])
    super(MadmomMelbankProcessor, self).__init__([pre_processor])
def spec_from_midi(midi_file):
    """Return the spectrogram for a MIDI file, rendering and caching on demand.

    The audio path and the cache path are derived from ``midi_file`` by
    replacing ``'.mid'`` with ``'.wav'`` and ``'.spec.npy'`` respectively.
    If the audio file is missing it is rendered with ``render_audio``; if
    the cached spectrogram is missing it is computed, transposed and stored
    with ``np.save``.

    Parameters
    ----------
    midi_file : str
        Path of the MIDI file.

    Returns
    -------
    The cached array if present, otherwise the freshly computed
    (transposed) spectrogram.
    """
    wav_path = midi_file.replace('.mid', '.wav')
    cache_path = midi_file.replace('.mid', '.spec.npy')

    # the processing chain is configured from the module-level spec_params
    pipeline = SequentialProcessor([
        SignalProcessor(num_channels=1,
                        sample_rate=spec_params["sample_rate"]),
        FramedSignalProcessor(frame_size=spec_params["frame_size"],
                              fps=spec_params["fps"]),
        FilteredSpectrogramProcessor(filterbank=LogarithmicFilterbank,
                                     num_bands=12, fmin=60, fmax=6000,
                                     norm_filters=True,
                                     unique_filters=False),
        LogarithmicSpectrogramProcessor(),
    ])

    # render the audio only when no .wav sits next to the MIDI file
    if not os.path.isfile(wav_path):
        render_audio(midi_file, sound_font=SOUND_FONT_PATH)

    # cache hit: nothing to compute
    if os.path.isfile(cache_path):
        return np.load(cache_path)

    spec = pipeline.process(wav_path).T
    # np.save appends '.npy', which yields the '.spec.npy' cache path
    np.save(midi_file.replace('.mid', '.spec'), spec)
    return spec
def __init__(self, fs, hopsize_t):
    """Assemble the mono, single-resolution log-mel pre-processing chain.

    The chain is: signal loading -> framing (frame size 2048) -> STFT ->
    80-band mel filterbank (``fmin=27.5``, ``fmax=16000``, normalised
    filters) -> logarithmic scaling with ``np.log`` and an ``EPSILON``
    offset.

    Parameters
    ----------
    fs : int or float
        Sample rate passed to the ``SignalProcessor``.
    hopsize_t : float
        Hop size relative to ``fs``; ``int(fs * hopsize_t)`` is used as
        the hop size of the framing stage (presumably seconds -- confirm
        against callers).
    """
    from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
    from madmom.audio.stft import ShortTimeFourierTransformProcessor
    from madmom.audio.filters import MelFilterbank
    from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                          LogarithmicSpectrogramProcessor)

    # define pre-processing chain
    sig = SignalProcessor(num_channels=1, sample_rate=fs)

    # a single resolution is used here (frame size 2048).
    # madmom's keyword is ``hop_size``; the former ``hopsize`` spelling is
    # not a FramedSignalProcessor parameter, so the requested hop never
    # reached the framing stage and the library default was used instead.
    frames = FramedSignalProcessor(frame_size=2048,
                                   hop_size=int(fs * hopsize_t))
    stft = ShortTimeFourierTransformProcessor()  # caching FFT window
    filt = FilteredSpectrogramProcessor(filterbank=MelFilterbank,
                                        num_bands=80, fmin=27.5, fmax=16000,
                                        norm_filters=True,
                                        unique_filters=False)
    spec = LogarithmicSpectrogramProcessor(log=np.log, add=EPSILON)

    # framing, STFT, filtering and log scaling run one after another
    single = SequentialProcessor([frames, stft, filt, spec])

    # pre-processes everything sequentially
    pre_processor = SequentialProcessor([sig, single])

    # instantiate a SequentialProcessor
    super(MadmomMelbankProcessor, self).__init__([pre_processor])
def __init__(self, fs, hopsize_t):
    """Assemble a three-resolution log-mel pre-processing chain.

    Three branches (frame sizes 2048, 1024 and 4096, each at ``fps=100``)
    compute an 80-band log-mel spectrogram; a ``ParallelProcessor`` runs
    the branches and ``np.dstack`` stacks their outputs in depth.

    Parameters
    ----------
    fs : int or float
        Sample rate passed to the ``SignalProcessor``.
    hopsize_t : float
        Accepted for signature compatibility; not used in this body (the
        frame rate is fixed via ``fps=100``).
    """
    from madmom.audio.filters import MelFilterbank
    from madmom.audio.signal import FramedSignalProcessor, SignalProcessor
    from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                          LogarithmicSpectrogramProcessor)
    from madmom.audio.stft import ShortTimeFourierTransformProcessor

    # filterbank settings shared by every resolution
    mel_options = dict(filterbank=MelFilterbank, num_bands=80, fmin=27.5,
                       fmax=16000, norm_filters=True, unique_filters=False)

    # one sequential branch per frame size, collected in a parallel stage
    branches = ParallelProcessor([])
    for size in (2048, 1024, 4096):
        branch = SequentialProcessor([
            FramedSignalProcessor(frame_size=size, fps=100),
            ShortTimeFourierTransformProcessor(),  # caching FFT window
            FilteredSpectrogramProcessor(**mel_options),
            LogarithmicSpectrogramProcessor(log=np.log, add=EPSILON),
        ])
        branches.append(branch)

    # load the signal, fan out over the branches, stack the results in depth
    loader = SignalProcessor(num_channels=1, sample_rate=fs)
    chain = SequentialProcessor([loader, branches, np.dstack])

    super(MadmomMelbank3ChannelsProcessor, self).__init__([chain])
def __init__(self, sample_rate=44100, filter_length=8192, hop_length=8820,
             win_length=None, num_bands=24, fmin=65, fmax=2100,
             unique_filters=True):
    """Set up the STFT module and borrow a logarithmic filterbank from madmom.

    A madmom pipeline (signal -> frames -> STFT -> logarithmic filtered
    spectrogram) is run once on a short uniform-noise file; afterwards the
    ``filterbank`` attribute of the spectrogram processor is copied into a
    ``torch.FloatTensor`` stored as ``self.filterbank``.

    Parameters
    ----------
    sample_rate : int
        Sample rate of the scratch file and of the madmom signal processor.
    filter_length : int
        Passed to ``STFT`` and used as the madmom frame size.
    hop_length : int
        Passed to ``STFT``; the madmom frame rate is
        ``sample_rate / hop_length``.
    win_length : int or None
        Passed to ``STFT``.
    num_bands, fmin, fmax, unique_filters
        Forwarded to ``LogarithmicFilteredSpectrogramProcessor``.
    """
    import tempfile

    super(LMLFSpectrogram, self).__init__()
    self.stft = STFT(filter_length, hop_length, win_length)

    # filterbank from madmom
    _sig = SignalProcessor(num_channels=1, sample_rate=sample_rate)
    _frames = FramedSignalProcessor(frame_size=filter_length,
                                    fps=sample_rate / hop_length)
    _stft = ShortTimeFourierTransformProcessor()  # caching FFT window
    _spec = LogarithmicFilteredSpectrogramProcessor(
        num_bands=num_bands, fmin=fmin, fmax=fmax,
        unique_filters=unique_filters)

    # The scratch file lives in a private temporary directory: a fixed
    # 'lmlf.wav' in the working directory could overwrite (and then delete)
    # a user file, collide between concurrent processes, and was left
    # behind whenever the pipeline raised.
    with tempfile.TemporaryDirectory() as tmp_dir:
        fname = os.path.join(tmp_dir, 'lmlf.wav')
        sf.write(fname, np.random.uniform(-1, 1, 100000), sample_rate)
        _spec(_stft(_frames(_sig(fname))))

    self.filterbank = torch.FloatTensor(np.asarray(_spec.filterbank))
def __init__(self, spectrogram_path=None, version=1, test=False, dump=False,
             preprocessing=True, sample_rate=32000, silence_threshold=40):
    """Store the configuration and build the "version 2" madmom pipeline.

    Parameters
    ----------
    spectrogram_path, test, dump, preprocessing, silence_threshold
        Stored unchanged on the instance under the same names.
    version : int
        Must be 1 or 2; stored as ``self.version``.
    sample_rate : int
        Stored as ``self.sample_rate`` and used by the signal processor.

    Raises
    ------
    NameError
        If ``version`` is neither 1 nor 2.
    """
    if version not in (1, 2):
        raise NameError("version must be 1 or 2")

    self.spectrogram_path = spectrogram_path
    self.version = version
    self.test = test
    self.dump = dump
    self.preprocessing = preprocessing
    self.sample_rate = sample_rate
    self.silence_threshold = silence_threshold

    # signal -> frames -> spectrogram -> logarithmic filtered spectrogram
    stages = [
        SignalProcessor(num_channels=1, sample_rate=self.sample_rate,
                        norm=True),
        FramedSignalProcessor(frame_size=1024, hop_size=128,
                              origin='future'),
        SpectrogramProcessor(frame_size=1024),
        LogarithmicFilteredSpectrogramProcessor(
            filterbank=LogFilterbank, num_bands=26, fmin=20, fmax=14000),
    ]
    self.processor_version2 = SequentialProcessor(stages)
def __init__(self, **kwargs):
    """Build the piano-note pre-processing chain.

    A 44.1 kHz mono signal is framed (frame size 4096, ``fps=100``),
    transformed with a Hamming-windowed STFT, filtered (12 bands,
    ``fmin=30``, ``fmax=16000``, normalised filters) and scaled
    logarithmically (``mul=5``, ``add=1``). The single branch sits in a
    ``ParallelProcessor`` whose output is stacked with ``np.hstack``.

    Parameters
    ----------
    **kwargs
        Accepted but not used in this body.
    """
    from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
    from madmom.audio.stft import ShortTimeFourierTransformProcessor
    from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                          LogarithmicSpectrogramProcessor,
                                          SpectrogramDifferenceProcessor)
    from madmom.processors import SequentialProcessor, ParallelProcessor

    # one branch per frame size (currently a single resolution)
    branches = ParallelProcessor([])
    for size in (4096,):
        stages = (
            FramedSignalProcessor(frame_size=size, fps=100),
            # the window is computed once here and reused by the STFT stage
            ShortTimeFourierTransformProcessor(window=np.hamming(size)),
            FilteredSpectrogramProcessor(num_bands=12, fmin=30, fmax=16000,
                                         norm_filters=True),
            LogarithmicSpectrogramProcessor(mul=5, add=1),
        )
        branches.append(SequentialProcessor(stages))

    # load the signal, run the branches, stack the features
    loader = SignalProcessor(num_channels=1, sample_rate=44100)
    chain = SequentialProcessor((loader, branches, np.hstack))
    super(PianoNoteProcessor, self).__init__(chain)
def CreateProcesser(fps=100):
    """Create a multi-resolution spectrogram-plus-difference pre-processor.

    Three branches are computed in parallel, pairing frame sizes
    1024/2048/4096 with 3/6/12 filter bands. Each branch frames the signal,
    applies an STFT, a filtered spectrogram (``fmin=30``, ``fmax=17000``),
    logarithmic scaling (``mul=1``, ``add=1``) and a positive spectrogram
    difference stacked with ``np.hstack``. The branch outputs are then
    stacked with ``np.hstack`` as well.

    Parameters
    ----------
    fps : int
        Frame rate handed to every ``FramedSignalProcessor``.

    Returns
    -------
    SequentialProcessor
        The complete pre-processing chain (44.1 kHz mono input).
    """
    # (frame size, number of bands) per resolution
    resolutions = ((1024, 3), (2048, 6), (4096, 12))

    branches = ParallelProcessor([])
    for size, bands in resolutions:
        stages = (
            FramedSignalProcessor(frame_size=size, fps=fps),
            ShortTimeFourierTransformProcessor(),  # caching FFT window
            FilteredSpectrogramProcessor(num_bands=bands, fmin=30,
                                         fmax=17000, norm_filters=True),
            LogarithmicSpectrogramProcessor(mul=1, add=1),
            SpectrogramDifferenceProcessor(diff_ratio=0.5,
                                           positive_diffs=True,
                                           stack_diffs=np.hstack),
        )
        branches.append(SequentialProcessor(stages))

    # load the signal, run all resolutions, stack the features
    loader = SignalProcessor(num_channels=1, sample_rate=44100)
    return SequentialProcessor((loader, branches, np.hstack))
def _make_preprocessor(settings, pad):
    """Build the pre-processing chain described by ``settings``.

    The base chain is signal -> frames -> STFT -> logarithmic filtered
    spectrogram. A spectrogram-difference stage is appended when
    ``settings['diff']`` is truthy, and a ``PadProcessor(pad)`` stage is
    appended last when ``pad > 0``.

    Parameters
    ----------
    settings : mapping
        Read keys: ``sample_rate``, ``frame_size``, ``fps``, ``num_bands``,
        ``fmin``, ``fmax``, ``norm_filters``, ``diff`` and, optionally,
        ``pad`` (selects ``_crnn_drum_processor_stack`` instead of
        ``np.hstack`` for stacking the differences).
    pad : int
        Argument for ``PadProcessor``; no padding stage if not positive.

    Returns
    -------
    SequentialProcessor
    """
    from madmom.audio.filters import LogarithmicFilterbank
    from madmom.audio.signal import FramedSignalProcessor, SignalProcessor
    from madmom.audio.spectrogram import (
        LogarithmicFilteredSpectrogramProcessor,
        SpectrogramDifferenceProcessor)
    from madmom.audio.stft import ShortTimeFourierTransformProcessor
    from madmom.processors import SequentialProcessor

    stages = [
        SignalProcessor(num_channels=1, sample_rate=settings['sample_rate']),
        FramedSignalProcessor(frame_size=settings['frame_size'],
                              fps=settings['fps']),
        ShortTimeFourierTransformProcessor(),  # caching FFT window
        LogarithmicFilteredSpectrogramProcessor(
            num_channels=1, sample_rate=settings['sample_rate'],
            filterbank=LogarithmicFilterbank,
            frame_size=settings['frame_size'], fps=settings['fps'],
            num_bands=settings['num_bands'], fmin=settings['fmin'],
            fmax=settings['fmax'], norm_filters=settings['norm_filters']),
    ]

    # optional difference stage
    if settings['diff']:
        padded_stack = 'pad' in settings and settings['pad']
        stack = _crnn_drum_processor_stack if padded_stack else np.hstack
        stages.append(SpectrogramDifferenceProcessor(
            diff_ratio=0.5, positive_diffs=True, stack_diffs=stack))

    # optional padding stage, always last
    if pad > 0:
        stages.append(PadProcessor(pad))

    return SequentialProcessor(tuple(stages))
def __init__(self, hparams, dataset: FreeSoundAudioDataset):
    """Initialise the iterator and its "version 2" madmom pipeline.

    Parameters
    ----------
    hparams
        Forwarded to the parent constructor.
    dataset : FreeSoundAudioDataset
        Forwarded to the parent constructor; its type is verified
        afterwards.

    Raises
    ------
    AssertionError
        If ``dataset`` is not a ``FreeSoundAudioDataset``.
    """
    super(MadmomFeatureIteratorV2, self).__init__(hparams, dataset)

    if not isinstance(dataset, FreeSoundAudioDataset):
        raise AssertionError("dataset should be FreeSoundAudioDataset")

    # signal -> frames -> spectrogram -> logarithmic filtered spectrogram
    self.processor_version2 = SequentialProcessor([
        SignalProcessor(num_channels=1, sample_rate=32000, norm=True),
        FramedSignalProcessor(frame_size=1024, hop_size=128,
                              origin='future'),
        SpectrogramProcessor(frame_size=1024),
        LogarithmicFilteredSpectrogramProcessor(
            filterbank=LogFilterbank, num_bands=26, fmin=20, fmax=14000),
    ])
def spectrogram_processor(spec_params):
    """Create the madmom chain used for spectrogram extraction.

    Parameters
    ----------
    spec_params : mapping
        Read keys: ``'sample_rate'``, ``'frame_size'`` and ``'fps'``.

    Returns
    -------
    SequentialProcessor
        signal -> frames -> 12-band logarithmic filtered spectrogram
        (``fmin=60``, ``fmax=6000``) -> logarithmic scaling.
    """
    stages = [
        SignalProcessor(num_channels=1,
                        sample_rate=spec_params['sample_rate']),
        FramedSignalProcessor(frame_size=spec_params['frame_size'],
                              fps=spec_params['fps']),
        FilteredSpectrogramProcessor(filterbank=LogarithmicFilterbank,
                                     num_bands=12, fmin=60, fmax=6000,
                                     norm_filters=True,
                                     unique_filters=False),
        LogarithmicSpectrogramProcessor(),
    ]
    return SequentialProcessor(stages)
def build_cnn(madmom_processor_filename):
    """Combine the audio pre-processing chain with a stored network.

    Parameters
    ----------
    madmom_processor_filename : str
        Model file handed (as a one-element list) to
        ``NeuralNetworkEnsemble.load``.

    Returns
    -------
    madmom.processors.SequentialProcessor
        Pre-processing (44.1 kHz mono, frame size 4096, hop size 882,
        24-band filtered log spectrogram, ``_cnn_pad``) followed by the
        loaded network ensemble.
    """
    from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
    from madmom.audio.stft import ShortTimeFourierTransformProcessor
    from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                          LogarithmicSpectrogramProcessor)
    from madmom.ml.nn import NeuralNetworkEnsemble

    stages = [
        SignalProcessor(num_channels=1, sample_rate=44100),
        FramedSignalProcessor(frame_size=4096, hop_size=441 * 2),
        ShortTimeFourierTransformProcessor(),  # caching FFT window
        FilteredSpectrogramProcessor(num_bands=24, fmin=30, fmax=10000),
        # this is the money param! it was not whitelisted in
        # 'canonicalize_audio_options'!
        LogarithmicSpectrogramProcessor(add=1),
        _cnn_pad,
    ]
    features = SequentialProcessor(stages)

    # the network consumes the pre-processed signal
    ensemble = NeuralNetworkEnsemble.load([madmom_processor_filename])
    return madmom.processors.SequentialProcessor([features, ensemble])
def __init__(self, sr=44100, **kwargs):
    """Build pre-processing plus network for a given sample rate.

    The frame size (4096) and frame rate (50) are both floor-divided by
    ``44100 / sr``. Because that ratio is a true-division result, the two
    derived values are floats.

    Parameters
    ----------
    sr : int
        Sample rate passed to the ``SignalProcessor``.
    **kwargs
        Accepted but not used in this body.
    """
    from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
    from madmom.audio.stft import ShortTimeFourierTransformProcessor
    from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                          LogarithmicSpectrogramProcessor)
    from madmom.ml.nn import NeuralNetworkEnsemble

    # scale the framing parameters relative to 44.1 kHz
    sr_ratio = 44100 / sr
    frame_size = 4096 // sr_ratio
    fps = 50 // sr_ratio

    stages = (
        SignalProcessor(num_channels=1, sample_rate=sr),
        FramedSignalProcessor(frame_size=frame_size, fps=fps),
        ShortTimeFourierTransformProcessor(),  # caching FFT window
        FilteredSpectrogramProcessor(num_bands=24, fmin=30, fmax=10000),
        LogarithmicSpectrogramProcessor(add=1),
        _cnn_pad,
    )
    features = SequentialProcessor(stages)

    # the network consumes the pre-processed signal
    ensemble = NeuralNetworkEnsemble.load(VIENNA_MODEL_PATH)

    super().__init__((features, ensemble))
    self.adsr = ADSRMaestro()
class MadmomSpectrogramProvider(VisualisationContract):
    """
    Implementation of a VisualisationContract.
    This class computes new spectrograms based on the most current
    audio chunks which is indicated via ``tGroundTruth``.

    Attributes
    ----------
    sig_proc : madmom.Processor
        processor which outputs sampled audio signals
    fsig_proc : madmom.Processor
        processor which produces overlapping frames based on sampled signals
    spec_proc : madmom.Processor
        processor which computes a spectrogram with stft based on framed
        signals
    filt_proc : madmom.Processor
        processor which filters and scales a spectrogram
    processorPipeline : SequentialProcessor
        creates pipeline of elements of type madmom.Processor
    sliding_window : 2d numpy array
        cache for previously calculated spectrograms
    lastProceededGroundTruth : int
        variable to keep track of the last processed audio chunk
    visThread:
        reference pointing to the sliding window thread

    Methods
    -------
    start()
        starts all necessary sub tasks of this visualizer.
    stop()
        stops all necessary sub tasks of this visualizer.
    computeSpectrogram()
        compute a spectrogram based on the most current audio chunk.
    """

    # madmom pipeline for spectrogram calculation
    sig_proc = SignalProcessor(num_channels=1, sample_rate=32000, norm=True)
    fsig_proc = FramedSignalProcessor(frame_size=1024, hop_size=128,
                                      origin='future')
    spec_proc = SpectrogramProcessor(frame_size=1024)
    filt_proc = LogarithmicFilteredSpectrogramProcessor(
        filterbank=LogFilterbank, num_bands=26, fmin=20, fmax=14000)
    processorPipeline = SequentialProcessor(
        [sig_proc, fsig_proc, spec_proc, filt_proc])

    def __init__(self, condition):
        """Initialise the spectrogram cache.

        Parameters
        ----------
        condition
            stored as ``self.condition``; not used further inside this
            class (presumably consumed by the visualisation thread --
            confirm).
        """
        # sliding window as cache
        self.sliding_window = np.zeros((128, 256), dtype=np.float32)
        self.lastProceededGroundTruth = None
        self.condition = condition

    def start(self):
        """Start all sub tasks necessary for continuous spectrograms.
        """
        self.visThread = VisualisationThread(self)
        self.visThread.start()

    def stop(self):
        """Stops all sub tasks by joining the visualisation thread.
        """
        self.visThread.join()

    def computeSpectrogram(self):
        """This method first accesses the time variable ``tGroundTruth``
        of the manager and reads the audio chunk the time variable points
        to. Afterwards, the defined madmom pipeline is processed to get the
        spectrogram representation of the single chunk. Finally, the
        sliding window is updated with the new audio chunk and a copy of
        the sliding window is returned to the calling thread.

        Returns
        -------
        sliding_window : 2d numpy array of float values
            returns a copy of the current sliding window spectrogram
        """
        # NOTE(review): ``self.manager`` is not assigned anywhere in this
        # class; presumably it is injected by the contract or the caller.
        t = self.manager.tGroundTruth
        # if thread faster than producer, do not consume same chunk
        # multiple times
        if t != self.lastProceededGroundTruth:
            # modulo avoids index under/overflow
            chunk = self.manager.sharedMemory[(t - 1) % BUFFER_SIZE]
            # np.fromstring in binary mode is deprecated in favour of
            # np.frombuffer; the copy keeps the samples writable and
            # independent of the shared buffer, as fromstring's result was.
            samples = np.frombuffer(chunk, dtype=np.int16).copy()
            spectrogram = self.processorPipeline.process(samples)
            frame = spectrogram[0]
            # replace frames containing NaNs with silence
            if np.any(np.isnan(frame)):
                frame = np.zeros_like(frame, dtype=np.float32)
            # update sliding window: shift left by one column, append frame
            self.sliding_window[:, 0:-1] = self.sliding_window[:, 1:]
            self.sliding_window[:, -1] = frame
            self.lastProceededGroundTruth = t
        return self.sliding_window.copy()
# ! Log-magnitude log-frequency spectrogram num_bands = 24 fmin = 65 fmax = 2100 # torch torch_lmlf = LMLFSpectrogram(sample_rate=sr, filter_length=filter_length, hop_length=hop_length, num_bands=num_bands, fmin=65, fmax=2100) lmlf = torch_lmlf(real_wave.unsqueeze(0)) # madmom _sig = SignalProcessor(num_channels=1, sample_rate=sr) _frames = FramedSignalProcessor(frame_size=filter_length, fps=sr / hop_length) _stft = ShortTimeFourierTransformProcessor() # caching FFT window _spec = LogarithmicFilteredSpectrogramProcessor(num_bands=num_bands, fmin=fmin, fmax=fmax) sig = _sig(librosa.util.example_audio_file()) frames = _frames(sig) stft = _stft(frames) spec = _spec(stft) diff = np.mean(np.abs(lmlf.squeeze(0).numpy() - spec)) print('===== log-magnitude log-frequency spectrogram =====') print('mean difference between outputs from torch and madmom : ', diff) print('shape : ', lmlf.shape)
def main():
    """Command-line entry point of PianoTranscriptor.

    Builds the argument parser, fixes the frame rate and the peak-picking
    window, selects the input processor (stored activations or the RNN
    note processor) and the output processor (activation writer, or peak
    picking followed by a notes/MIDI/MIREX writer), and finally hands the
    combined ``IOProcessor`` to the function selected by the parser
    (``args.func``).

    Raises
    ------
    ValueError
        If ``args.output_format`` is not ``None``, ``'midi'`` or
        ``'mirex'``.
    """
    # define parser
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=''' The PianoTranscriptor program detects all notes (onsets) in an audio file according to the algorithm described in: "Polyphonic Piano Note Transcription with Recurrent Neural Networks" Sebastian Böck and Markus Schedl. Proceedings of the 37th International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2012. Instead of 'LSTM' units, the current version uses 'tanh' units. This program can be run in 'single' file mode to process a single audio file and write the detected notes to STDOUT or the given output file. $ PianoTranscriptor single INFILE [-o OUTFILE] If multiple audio files should be processed, the program can also be run in 'batch' mode to save the detected notes to files with the given suffix. $ PianoTranscriptor batch [-o OUTPUT_DIR] [-s OUTPUT_SUFFIX] LIST OF FILES If no output directory is given, the program writes the files with the detected notes to same location as the audio files. The 'pickle' mode can be used to store the used parameters to be able to exactly reproduce experiments. 
''')
    # version
    parser.add_argument('--version', action='version',
                        version='PianoTranscriptor.2013')
    # input/output arguments
    io_arguments(parser, output_suffix='.notes.txt')
    ActivationsProcessor.add_arguments(parser)
    # signal processing arguments
    SignalProcessor.add_arguments(parser, norm=False, gain=0, start=True,
                                  stop=True)
    # peak picking arguments
    PeakPickingProcessor.add_arguments(parser, threshold=0.35, smooth=0.09,
                                       combine=0.05)
    # output format switches (both write to the same destination)
    parser.add_argument('--midi', dest='output_format', action='store_const',
                        const='midi', help='save as MIDI')
    parser.add_argument('--mirex', dest='output_format',
                        action='store_const', const='mirex',
                        help='use the MIREX output format')

    args = parser.parse_args()

    # set immutable defaults: the peak-picking window is one frame wide
    args.fps = 100
    args.pre_max = args.post_max = 1. / args.fps

    # set the suffix for midi files
    if args.output_format == 'midi':
        args.output_suffix = '.mid'

    if args.verbose:
        print(args)

    # all arguments as keyword dictionary for the processors
    options = vars(args)

    # input: stored activations, or note activations predicted by the RNN
    if args.load:
        in_processor = ActivationsProcessor(mode='r', **options)
    else:
        in_processor = RNNPianoNoteProcessor()

    # output: store the activations, or pick peaks and write the notes
    if args.save:
        out_processor = ActivationsProcessor(mode='w', **options)
    else:
        peak_picking = PeakPickingProcessor(**options)
        chosen = args.output_format
        if chosen is None:
            output = write_notes
        elif chosen == 'midi':
            output = write_midi
        elif chosen == 'mirex':
            output = write_mirex_format
        else:
            raise ValueError('unknown output format: %s' % chosen)
        out_processor = [peak_picking, output]

    # create an IOProcessor and call the processing function
    processor = IOProcessor(in_processor, out_processor)
    args.func(processor, **options)
def main():
    """Command-line entry point of DBNBeatTracker.

    Builds the argument parser, fixes the frame rate, selects the input
    processor (stored activations or the RNN beat processor) and the
    output processor (activation writer, or DBN beat tracking followed by
    ``write_events``), and finally hands the combined ``IOProcessor`` to
    the function selected by the parser (``args.func``).
    """
    # define parser
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=''' The DBNBeatTracker.py program detects all beats in an audio file according to the method described in: "A Multi-Model Approach to Beat Tracking Considering Heterogeneous Music Styles" Sebastian Böck, Florian Krebs and Gerhard Widmer. Proceedings of the 15th International Society for Music Information Retrieval Conference (ISMIR), 2014. It does not use the multi-model (Section 2.2.) and selection stage (Section 2.3), i.e. this version corresponds to the pure DBN version of the algorithm for which results are given in Table 2. Instead of the originally proposed state space and transition model for the DBN, the following is used: "An Efficient State Space Model for Joint Tempo and Meter Tracking" Florian Krebs, Sebastian Böck and Gerhard Widmer. Proceedings of the 16th International Society for Music Information Retrieval Conference (ISMIR), 2015. This program can be run in 'single' file mode to process a single audio file and write the detected beats to STDOUT or the given output file. $ DBNBeatTracker.py single INFILE [-o OUTFILE] If multiple audio files should be processed, the program can also be run in 'batch' mode to save the detected beats to files with the given suffix. $ DBNBeatTracker.py batch [-o OUTPUT_DIR] [-s OUTPUT_SUFFIX] FILES If no output directory is given, the program writes the files with the detected beats to the same location as the audio files. The 'pickle' mode can be used to store the used parameters to be able to exactly reproduce experiments. 
''')
    # version
    parser.add_argument('--version', action='version',
                        version='DBNBeatTracker.py.2016')
    # input/output options
    io_arguments(parser, output_suffix='.beats.txt', online=True)
    ActivationsProcessor.add_arguments(parser)
    # signal processing arguments
    SignalProcessor.add_arguments(parser, norm=False, gain=0)
    # beat tracking and network arguments
    DBNBeatTrackingProcessor.add_arguments(parser)
    NeuralNetworkEnsemble.add_arguments(parser, nn_files=None)

    args = parser.parse_args()

    # set immutable arguments
    args.fps = 100

    if args.verbose:
        print(args)

    # all arguments as keyword dictionary for the processors
    options = vars(args)

    # input: stored activations, or beat activations predicted by the RNN
    if args.load:
        in_processor = ActivationsProcessor(mode='r', **options)
    else:
        in_processor = RNNBeatProcessor(**options)

    # output: store the activations, or track the beats and write them
    if args.save:
        out_processor = ActivationsProcessor(mode='w', **options)
    else:
        # track the beats with a DBN
        beat_processor = DBNBeatTrackingProcessor(**options)
        # output handler
        from madmom.utils import write_events as writer
        # sequentially process everything
        out_processor = [beat_processor, writer]

    # create an IOProcessor and call the processing function
    processor = IOProcessor(in_processor, out_processor)
    args.func(processor, **options)
freqs = librosa.core.fft_frequencies(sr=sr, n_fft=n_fft) stft = librosa.perceptual_weighting(stft**2, freqs, ref=1.0, amin=1e-10, top_db=99.0) # apply mel filterbank spectrogram = librosa.feature.melspectrogram(S=stft, sr=sr, n_mels=n_mels, fmax=fmax) # keep spectrogram spectrograms.append(np.asarray(spectrogram)) spectrograms = np.asarray(spectrograms) return spectrograms processor_version1 = LibrosaProcessor() sig_proc = SignalProcessor(num_channels=1, sample_rate=32000, norm=True) fsig_proc = FramedSignalProcessor(frame_size=1024, hop_size=128, origin='future') spec_proc = SpectrogramProcessor(frame_size=1024) filt_proc = LogarithmicFilteredSpectrogramProcessor(filterbank=LogFilterbank, num_bands=26, fmin=20, fmax=14000) processor_pipeline2 = [sig_proc, fsig_proc, spec_proc, filt_proc] processor_version2 = SequentialProcessor(processor_pipeline2) if __name__ == "__main__": """ main """ # add argument parser parser = argparse.ArgumentParser(description='Pre-compute spectrograms for training and testing.') parser.add_argument('--audio_path', help='path to audio files.') parser.add_argument('--spec_path', help='path where to store spectrograms.') parser.add_argument('--show', help='show spectrogram plots.', type=int, default=None)
import music21
import madmom.utils.midi as mm_midi
# import madmom.utils.midi_old as mm_midi
from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
from madmom.audio.filters import LogarithmicFilterbank
from madmom.audio.spectrogram import FilteredSpectrogramProcessor, LogarithmicSpectrogramProcessor
from madmom.processors import SequentialProcessor

# init signal processing
# Module-level configuration of the spectrogram pipeline below.
SAMPLE_RATE = 22050
FRAME_SIZE = 2048
FPS = 20

# Pipeline stages: mono signal -> frames -> filtered spectrogram with a
# logarithmic filterbank -> logarithmic scaling.
sig_proc = SignalProcessor(num_channels=1, sample_rate=SAMPLE_RATE)
fsig_proc = FramedSignalProcessor(frame_size=FRAME_SIZE, fps=FPS, origin='future')
spec_proc = FilteredSpectrogramProcessor(
    LogarithmicFilterbank, num_bands=16, fmin=30, fmax=6000)  # num_bands=24, fmin=30, fmax=8000
log_spec_proc = LogarithmicSpectrogramProcessor()
# The stages are chained in the order listed.
processor = SequentialProcessor(
    [sig_proc, fsig_proc, spec_proc, log_spec_proc])

# presumably matplotlib colour codes used for plotting -- TODO confirm
colors = ['c', 'm', 'y']


def notes_to_onsets(notes, dt):
    """ Convert sequence of keys to onset frames """
class DcasePredictorProvider(PredictorContract):
    """
    Implementation of a PredictorContract.
    This class makes predictions where spectrograms are considered as
    inputs and a convolutional neural network produces class probabilities.

    Attributes
    ----------
    sig_proc : madmom.Processor
        processor which outputs sampled audio signals
    fsig_proc : madmom.Processor
        processor which produces overlapping frames based on sampled signals
    spec_proc : madmom.Processor
        processor which computes a spectrogram with stft based on framed
        signals
    filt_proc : madmom.Processor
        processor which filters and scales a spectrogram
    processorPipeline : SequentialProcessor
        creates pipeline of elements of type madmom.Processor
    classes : list of str
        class list
    device : str
        indicates the processor to be used for neural network prediction
    prediction_model : baseline_net.Net
        holds a reference to the CNN architecture
    sliding_window : 2d numpy array
        cache for previously calculated spectrograms
    lastProceededGroundTruth : int
        variable to keep track of the last processed audio chunk
    slidingWindowThread:
        reference pointing to the sliding window thread
    predictionThread:
        reference pointing to the prediction thread

    Methods
    -------
    start()
        starts all necessary sub tasks of this predictor.
    stop()
        stops all necessary sub tasks of this predictor.
    computeSpectrogram()
        compute a spectrogram based on the most current audio chunk.
    predict()
        CNN prediction based on current spectrogram input.
    """

    # madmom pipeline for spectrogram calculation
    sig_proc = SignalProcessor(num_channels=1, sample_rate=32000, norm=True)
    fsig_proc = FramedSignalProcessor(frame_size=1024, hop_size=128,
                                      origin='future')
    spec_proc = SpectrogramProcessor(frame_size=1024)
    filt_proc = LogarithmicFilteredSpectrogramProcessor(
        filterbank=LogFilterbank, num_bands=26, fmin=20, fmax=14000)
    processorPipeline = SequentialProcessor(
        [sig_proc, fsig_proc, spec_proc, filt_proc])

    classes = ["Acoustic_guitar", "Applause", "Bark", "Bass_drum",
               "Burping_or_eructation", "Bus", "Cello", "Chime", "Clarinet",
               "Computer_keyboard", "Cough", "Cowbell", "Double_bass",
               "Drawer_open_or_close", "Electric_piano", "Fart",
               "Finger_snapping", "Fireworks", "Flute", "Glockenspiel",
               "Gong", "Gunshot_or_gunfire", "Harmonica", "Hi-hat",
               "Keys_jangling", "Knock", "Laughter", "Meow",
               "Microwave_oven", "Oboe", "Saxophone", "Scissors", "Shatter",
               "Snare_drum", "Squeak", "Tambourine", "Tearing", "Telephone",
               "Trumpet", "Violin_or_fiddle", "Writing"]

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    def __init__(self, condition):
        """Load the tuned CNN and initialise the spectrogram cache.

        Parameters
        ----------
        condition
            stored as ``self.condition``; not used further inside this
            class (presumably consumed by the worker threads -- confirm).
        """
        # load model with its tuned weight parameters
        self.prediction_model = Net()
        self.prediction_model.load_state_dict(
            torch.load(os.path.join(PROJECT_ROOT, 'server/consumer/predictors/dcase_predictor_provider/baseline_net.pt'),
                       map_location=lambda storage, location: storage))
        self.prediction_model.to(self.device)
        self.prediction_model.eval()

        # sliding window as cache
        self.sliding_window = np.zeros((128, 256), dtype=np.float32)
        self.lastProceededGroundTruth = None
        self.condition = condition

    def start(self):
        """Start all sub tasks necessary for continuous prediction.
        """
        self.slidingWindowThread = SlidingWindowThread(self)
        self.predictionThread = PredictionThread(self)
        self.slidingWindowThread.start()
        self.predictionThread.start()

    def stop(self):
        """Stops all sub tasks by joining both worker threads.
        """
        self.slidingWindowThread.join()
        self.predictionThread.join()

    def computeSpectrogram(self):
        """This method first accesses the time variable ``tGroundTruth``
        of the manager and reads the audio chunk the time variable points
        to. Afterwards, the defined madmom pipeline is processed to get the
        spectrogram representation of the single chunk. Finally, the
        sliding window is updated with the new audio chunk.
        """
        # NOTE(review): ``self.manager`` is not assigned anywhere in this
        # class; presumably it is injected by the contract or the caller.
        t = self.manager.tGroundTruth
        # if thread faster than producer, do not consume same chunk
        # multiple times
        if t != self.lastProceededGroundTruth:
            # modulo avoids index under/overflow
            chunk = self.manager.sharedMemory[(t - 1) % BUFFER_SIZE]
            # np.fromstring in binary mode is deprecated in favour of
            # np.frombuffer; the copy keeps the samples writable and
            # independent of the shared buffer, as fromstring's result was.
            samples = np.frombuffer(chunk, dtype=np.int16).copy()
            spectrogram = self.processorPipeline.process(samples)
            frame = spectrogram[0]
            # replace frames containing NaNs with silence
            if np.any(np.isnan(frame)):
                frame = np.zeros_like(frame, dtype=np.float32)
            # update sliding window: shift left by one column, append frame
            self.sliding_window[:, 0:-1] = self.sliding_window[:, 1:]
            self.sliding_window[:, -1] = frame
            self.lastProceededGroundTruth = t

    def predict(self):
        """ This method executes the actual prediction task based on the
        currently available sliding window. The sliding window is sent
        into the CNN model and the corresponding softmax output for the
        respective classes is returned.

        Returns
        -------
        probs : array of list objects
            an array of number of classes entries where each entry consists
            of the class name, its predicted probability and a position
            index.
            Example:
            ``[["class1", 0.0006955251446925104, 0], ["class2", 0.0032770668622106314, 1], ...]``
        """
        # add two leading axes in front of the 2d window
        batch = self.sliding_window[np.newaxis, np.newaxis]
        model_input = torch.from_numpy(batch).to(self.device)
        # inference only: the result is detached anyway, so there is no
        # need to record an autograd graph
        with torch.no_grad():
            model_output = self.prediction_model(model_input)
            softmax_output = nn.Softmax(dim=1)(model_output)
        predicts = softmax_output.cpu().detach().numpy().flatten()
        probs = [[elem, predicts[index].item(), index]
                 for index, elem in enumerate(self.classes)]
        return probs
def __init__(self, audio_path, target_sr=16000):
    """Prepare a mono madmom signal loader, then run the parent constructor.

    Parameters
    ----------
    audio_path
        Forwarded to the parent constructor.
    target_sr : int
        Stored as ``self.target_sr`` and used as the sample rate of the
        ``SignalProcessor`` stored as ``self.processor``.
    """
    from madmom.audio.signal import SignalProcessor

    self.target_sr = target_sr
    mono_loader = SignalProcessor(num_channels=1,
                                  sample_rate=self.target_sr)
    self.processor = mono_loader
    # the parent is initialised last, after the loader has been set
    super().__init__(audio_path)