    def __init__(self, fs, hopsize_t):
        from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
        from madmom.audio.stft import ShortTimeFourierTransformProcessor
        from madmom.audio.filters import MelFilterbank
        from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                              LogarithmicSpectrogramProcessor)
        from madmom.processors import SequentialProcessor

        # define pre-processing chain
        sig = SignalProcessor(num_channels=1, sample_rate=fs)
        frames = FramedSignalProcessor(frame_size=2048,
                                       hop_size=int(fs * hopsize_t))
        stft = ShortTimeFourierTransformProcessor()  # caching FFT window
        filt = FilteredSpectrogramProcessor(filterbank=MelFilterbank,
                                            num_bands=80,
                                            fmin=27.5,
                                            fmax=16000,
                                            norm_filters=True,
                                            unique_filters=False)
        spec = LogarithmicSpectrogramProcessor(log=np.log, add=EPSILON)

        single = SequentialProcessor([frames, stft, filt, spec])

        pre_processor = SequentialProcessor([sig, single])

        super(MadmomMelbankProcessor, self).__init__([pre_processor])
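
# Usage sketch (not part of the original example): since __init__ above ends
# by calling SequentialProcessor.__init__, an instance is itself a callable
# madmom processor; 'audio.wav' is a placeholder path.
proc = MadmomMelbankProcessor(fs=44100, hopsize_t=0.01)
melbank = proc('audio.wav')  # 2D array: num_frames x 80 mel bands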
Example 2
    def spec_from_midi(midi_file):

        sig_proc = SignalProcessor(num_channels=1, sample_rate=spec_params["sample_rate"])
        fsig_proc = FramedSignalProcessor(frame_size=spec_params["frame_size"], fps=spec_params["fps"])
        spec_proc = FilteredSpectrogramProcessor(filterbank=LogarithmicFilterbank, num_bands=12, fmin=60, fmax=6000,
                                                 norm_filters=True, unique_filters=False)
        log_proc = LogarithmicSpectrogramProcessor()
        processor = SequentialProcessor([sig_proc, fsig_proc, spec_proc, log_proc])

        # render the audio file from MIDI if it does not exist yet
        audio_path = midi_file.replace('.mid', '.wav')
        if not os.path.isfile(audio_path):
            render_audio(midi_file, sound_font=SOUND_FONT_PATH)

        # compute and store the spectrogram, or load a cached one
        spec_path = midi_file.replace('.mid', '.spec.npy')
        if not os.path.isfile(spec_path):
            spec = processor.process(audio_path).T
            np.save(midi_file.replace('.mid', '.spec'), spec)
        else:
            spec = np.load(spec_path)

        return spec
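
    # Usage sketch (hypothetical path; assumes the surrounding module defines
    # spec_params, SOUND_FONT_PATH and render_audio as used above):
    #
    #   spec = spec_from_midi('piece.mid')  # bands x frames, cached as .spec.npy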
    def __init__(self, fs, hopsize_t):
        from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
        from madmom.audio.stft import ShortTimeFourierTransformProcessor
        from madmom.audio.filters import MelFilterbank
        from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                              LogarithmicSpectrogramProcessor)
        from madmom.processors import SequentialProcessor
        # from madmom.features.onsets import _cnn_onset_processor_pad

        # define pre-processing chain
        sig = SignalProcessor(num_channels=1, sample_rate=fs)
        # process the multi-resolution spec in parallel
        frames = FramedSignalProcessor(frame_size=2048,
                                       hop_size=int(fs * hopsize_t))
        stft = ShortTimeFourierTransformProcessor()  # caching FFT window
        filt = FilteredSpectrogramProcessor(filterbank=MelFilterbank,
                                            num_bands=80,
                                            fmin=27.5,
                                            fmax=16000,
                                            norm_filters=True,
                                            unique_filters=False)
        spec = LogarithmicSpectrogramProcessor(log=np.log, add=EPSILON)

        # process each frame size with spec and diff sequentially
        single = SequentialProcessor([frames, stft, filt, spec])

        # pre-processes everything sequentially
        pre_processor = SequentialProcessor([sig, single])

        # instantiate a SequentialProcessor
        super(MadmomMelbankProcessor, self).__init__([pre_processor])
    def __init__(self, fs, hopsize_t):
        from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
        from madmom.audio.stft import ShortTimeFourierTransformProcessor
        from madmom.audio.filters import MelFilterbank
        from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                              LogarithmicSpectrogramProcessor)
        from madmom.processors import SequentialProcessor, ParallelProcessor
        # from madmom.features.onsets import _cnn_onset_processor_pad

        # define pre-processing chain
        sig = SignalProcessor(num_channels=1, sample_rate=fs)
        # process the multi-resolution spec in parallel
        multi = ParallelProcessor([])
        for frame_size in [2048, 1024, 4096]:
            frames = FramedSignalProcessor(frame_size=frame_size, fps=100)
            stft = ShortTimeFourierTransformProcessor()  # caching FFT window
            filt = FilteredSpectrogramProcessor(
                filterbank=MelFilterbank, num_bands=80, fmin=27.5, fmax=16000,
                norm_filters=True, unique_filters=False)
            spec = LogarithmicSpectrogramProcessor(log=np.log, add=EPSILON)
            # process each frame size with spec and diff sequentially
            multi.append(SequentialProcessor([frames, stft, filt, spec]))
        # stack the features (in depth) and pad at beginning and end
        stack = np.dstack
        # pad = _cnn_onset_processor_pad
        # pre-processes everything sequentially
        pre_processor = SequentialProcessor([sig, multi, stack])
        # instantiate a SequentialProcessor
        super(MadmomMelbank3ChannelsProcessor, self).__init__([pre_processor])
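
# Usage sketch (not part of the original example): the three parallel chains
# are stacked in depth by np.dstack, so a processed file comes back as a
# 3-channel spectrogram; 'audio.wav' is a placeholder path.
proc = MadmomMelbank3ChannelsProcessor(fs=44100, hopsize_t=0.01)
spec3 = proc('audio.wav')  # 3D array: num_frames x 80 mel bands x 3 frame sizes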
Example 5
    def __init__(self,
                 sample_rate=44100,
                 filter_length=8192,
                 hop_length=8820,
                 win_length=None,
                 num_bands=24,
                 fmin=65,
                 fmax=2100,
                 unique_filters=True):
        super(LMLFSpectrogram, self).__init__()
        self.stft = STFT(filter_length, hop_length, win_length)
        # extract the filterbank from madmom: run a short dummy signal
        # through the madmom pipeline once and keep the filterbank it builds
        fname = 'lmlf.wav'
        sf.write(fname, np.random.uniform(-1, 1, 100000), sample_rate)
        _sig = SignalProcessor(num_channels=1, sample_rate=sample_rate)
        _frames = FramedSignalProcessor(frame_size=filter_length,
                                        fps=sample_rate / hop_length)
        _stft = ShortTimeFourierTransformProcessor()  # caching FFT window
        _spec = LogarithmicFilteredSpectrogramProcessor(
            num_bands=num_bands,
            fmin=fmin,
            fmax=fmax,
            unique_filters=unique_filters)
        _spec(_stft(_frames(_sig(fname))))
        os.remove(fname)
        self.filterbank = torch.FloatTensor(np.asarray(_spec.filterbank))
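
    # Sketch (assumption; the forward pass is not part of this example): the
    # extracted filterbank is typically applied by multiplying the linear STFT
    # magnitudes with the filterbank matrix and compressing logarithmically,
    # mirroring madmom's log10(spec + 1):
    #
    #   magnitudes = self.stft(wave)                       # bins x frames
    #   filtered = torch.matmul(self.filterbank.t(), magnitudes)
    #   log_spec = torch.log10(filtered + 1.0)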
    def __init__(self,
                 spectrogram_path=None,
                 version=1,
                 test=False,
                 dump=False,
                 preprocessing=True,
                 sample_rate=32000,
                 silence_threshold=40):
        if version not in (1, 2):
            raise ValueError("version must be 1 or 2")
        self.version = version
        self.spectrogram_path = spectrogram_path
        self.sample_rate = sample_rate
        self.preprocessing = preprocessing
        self.test = test
        self.dump = dump
        self.silence_threshold = silence_threshold

        sig_proc = SignalProcessor(num_channels=1,
                                   sample_rate=self.sample_rate,
                                   norm=True)
        fsig_proc = FramedSignalProcessor(frame_size=1024,
                                          hop_size=128,
                                          origin='future')
        spec_proc = SpectrogramProcessor(frame_size=1024)
        filt_proc = LogarithmicFilteredSpectrogramProcessor(
            filterbank=LogFilterbank, num_bands=26, fmin=20, fmax=14000)
        processor_pipeline = [sig_proc, fsig_proc, spec_proc, filt_proc]
        self.processor_version2 = SequentialProcessor(processor_pipeline)
    def __init__(self, **kwargs):
        from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
        from madmom.audio.stft import ShortTimeFourierTransformProcessor
        from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                              LogarithmicSpectrogramProcessor,
                                              SpectrogramDifferenceProcessor)
        from madmom.processors import SequentialProcessor, ParallelProcessor

        # define pre-processing chain
        sig = SignalProcessor(num_channels=1, sample_rate=44100)
        # process the multi-resolution spec & diff in parallel
        multi = ParallelProcessor([])
        for frame_size in [4096]:
            frames = FramedSignalProcessor(frame_size=frame_size, fps=100)
            stft = ShortTimeFourierTransformProcessor(
                window=np.hamming(frame_size))  # caching FFT window
            filt = FilteredSpectrogramProcessor(num_bands=12,
                                                fmin=30,
                                                fmax=16000,
                                                norm_filters=True)
            spec = LogarithmicSpectrogramProcessor(mul=5, add=1)
            #diff = SpectrogramDifferenceProcessor(diff_ratio=0.5, positive_diffs=True, stack_diffs=np.hstack)
            # process each frame size with spec and diff sequentially
            multi.append(SequentialProcessor((frames, stft, filt, spec)))
            #multi.append(SequentialProcessor((frames, stft, filt)))

        # stack the features and processes everything sequentially
        pre_processor = SequentialProcessor((sig, multi, np.hstack))
        super(PianoNoteProcessor, self).__init__(pre_processor)
Example 8
def CreateProcesser(fps=100):
    # define pre-processing chain
    sig = SignalProcessor(num_channels=1, sample_rate=44100)
    # process the multi-resolution spec & diff in parallel
    # process the multi-resolution spec & diff in parallel
    multi = ParallelProcessor([])
    frame_sizes = [1024, 2048, 4096]
    band_counts = [3, 6, 12]
    for frame_size, num_bands in zip(frame_sizes, band_counts):
        frames = FramedSignalProcessor(frame_size=frame_size, fps=fps)
        stft = ShortTimeFourierTransformProcessor()  # caching FFT window
        filt = FilteredSpectrogramProcessor(num_bands=num_bands,
                                            fmin=30,
                                            fmax=17000,
                                            norm_filters=True)
        spec = LogarithmicSpectrogramProcessor(mul=1, add=1)
        diff = SpectrogramDifferenceProcessor(diff_ratio=0.5,
                                              positive_diffs=True,
                                              stack_diffs=np.hstack)
        # process each frame size with spec and diff sequentially
        multi.append(SequentialProcessor((frames, stft, filt, spec, diff)))

    # stack the features and processes everything sequentially
    pre_processor = SequentialProcessor((sig, multi, np.hstack))
    return pre_processor
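
# Usage sketch (not part of the original example): the returned processor maps
# an audio file to stacked log-spectrogram + positive-difference features;
# 'audio.wav' is a placeholder path.
pre_processor = CreateProcesser(fps=100)
features = pre_processor('audio.wav')  # 2D array: num_frames x stacked bands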
Example 9
def _make_preprocessor(settings, pad):
    from madmom.audio.spectrogram import (
        LogarithmicFilteredSpectrogramProcessor,
        SpectrogramDifferenceProcessor)
    from madmom.audio.filters import LogarithmicFilterbank
    from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
    from madmom.audio.stft import ShortTimeFourierTransformProcessor
    from madmom.processors import SequentialProcessor

    sig = SignalProcessor(num_channels=1, sample_rate=settings['sample_rate'])
    frames = FramedSignalProcessor(frame_size=settings['frame_size'],
                                   fps=settings['fps'])
    stft = ShortTimeFourierTransformProcessor()  # caching FFT window
    spec = LogarithmicFilteredSpectrogramProcessor(
        num_channels=1,
        sample_rate=settings['sample_rate'],
        filterbank=LogarithmicFilterbank,
        frame_size=settings['frame_size'],
        fps=settings['fps'],
        num_bands=settings['num_bands'],
        fmin=settings['fmin'],
        fmax=settings['fmax'],
        norm_filters=settings['norm_filters'])
    if settings['diff']:
        if 'pad' in settings and settings['pad']:
            stack = _crnn_drum_processor_stack
        else:
            stack = np.hstack
        diff = SpectrogramDifferenceProcessor(diff_ratio=0.5,
                                              positive_diffs=True,
                                              stack_diffs=stack)
        # process input data
        if pad > 0:
            pre_processor = SequentialProcessor(
                (sig, frames, stft, spec, diff, PadProcessor(pad)))
        else:
            pre_processor = SequentialProcessor(
                (sig, frames, stft, spec, diff))

    else:
        if pad > 0:
            pre_processor = SequentialProcessor(
                (sig, frames, stft, spec, PadProcessor(pad)))
        else:
            pre_processor = SequentialProcessor((sig, frames, stft, spec))

    return pre_processor
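
# Usage sketch (hypothetical values, not part of the original example): the
# settings dict only needs the keys read above.
settings = dict(sample_rate=44100, frame_size=2048, fps=100, num_bands=12,
                fmin=30, fmax=15000, norm_filters=True, diff=True, pad=False)
pre_processor = _make_preprocessor(settings, pad=0)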
Example 10
    def __init__(self, hparams, dataset: FreeSoundAudioDataset):
        super(MadmomFeatureIteratorV2, self).__init__(hparams, dataset)

        if not isinstance(dataset, FreeSoundAudioDataset):
            raise TypeError("dataset must be a FreeSoundAudioDataset")

        sig_proc = SignalProcessor(num_channels=1,
                                   sample_rate=32000,
                                   norm=True)
        fsig_proc = FramedSignalProcessor(frame_size=1024,
                                          hop_size=128,
                                          origin='future')
        spec_proc = SpectrogramProcessor(frame_size=1024)
        filt_proc = LogarithmicFilteredSpectrogramProcessor(
            filterbank=LogFilterbank, num_bands=26, fmin=20, fmax=14000)
        processor_pipeline2 = [sig_proc, fsig_proc, spec_proc, filt_proc]
        self.processor_version2 = SequentialProcessor(processor_pipeline2)
Example 11
def spectrogram_processor(spec_params):
    """Helper function for our spectrogram extraction."""
    sig_proc = SignalProcessor(num_channels=1,
                               sample_rate=spec_params['sample_rate'])
    fsig_proc = FramedSignalProcessor(frame_size=spec_params['frame_size'],
                                      fps=spec_params['fps'])

    spec_proc = FilteredSpectrogramProcessor(filterbank=LogarithmicFilterbank,
                                             num_bands=12,
                                             fmin=60,
                                             fmax=6000,
                                             norm_filters=True,
                                             unique_filters=False)
    log_proc = LogarithmicSpectrogramProcessor()

    processor = SequentialProcessor([sig_proc, fsig_proc, spec_proc, log_proc])

    return processor
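
# Usage sketch (hypothetical values, not part of the original example):
spec_params = {'sample_rate': 22050, 'frame_size': 2048, 'fps': 20}
processor = spectrogram_processor(spec_params)
spec = processor('audio.wav').T  # transpose to bands x frames, as in Example 2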
Example 12
def build_cnn(madmom_processor_filename):
    from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
    from madmom.audio.stft import ShortTimeFourierTransformProcessor
    from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                          LogarithmicSpectrogramProcessor)
    from madmom.processors import SequentialProcessor

    from madmom.ml.nn import NeuralNetworkEnsemble
    # define pre-processing chain
    sig = SignalProcessor(num_channels=1, sample_rate=44100)
    frames = FramedSignalProcessor(frame_size=4096, hop_size=441 * 2)
    stft = ShortTimeFourierTransformProcessor()  # caching FFT window
    filt = FilteredSpectrogramProcessor(num_bands=24, fmin=30, fmax=10000)

    # this is the money param! it was not whitelisted in 'canonicalize_audio_options'!
    spec = LogarithmicSpectrogramProcessor(add=1)
    # pre-processes everything sequentially
    pre_processor = SequentialProcessor([
        sig, frames, stft, filt, spec, _cnn_pad
    ])
    # process the pre-processed signal with a NN
    nn = NeuralNetworkEnsemble.load([madmom_processor_filename])
    return madmom.processors.SequentialProcessor([pre_processor, nn])
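
# Usage sketch (not part of the original example; the model filename is a
# placeholder): the returned processor maps an audio file to the ensemble's
# activations.
cnn = build_cnn('onsets_cnn.pkl')
activations = cnn('audio.wav')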
    def __init__(self, sr=44100, **kwargs):
        from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
        from madmom.audio.stft import ShortTimeFourierTransformProcessor
        from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                              LogarithmicSpectrogramProcessor)
        from madmom.processors import SequentialProcessor
        from madmom.ml.nn import NeuralNetworkEnsemble
        sr_ratio = 44100 / sr
        # define pre-processing chain
        sig = SignalProcessor(num_channels=1, sample_rate=sr)
        frames = FramedSignalProcessor(frame_size=int(4096 // sr_ratio),
                                       fps=50 // sr_ratio)
        stft = ShortTimeFourierTransformProcessor()  # caching FFT window
        filt = FilteredSpectrogramProcessor(num_bands=24, fmin=30, fmax=10000)
        spec = LogarithmicSpectrogramProcessor(add=1)
        # pre-processes everything sequentially
        pre_processor = SequentialProcessor(
            (sig, frames, stft, filt, spec, _cnn_pad))
        # process the pre-processed signal with a NN
        nn = NeuralNetworkEnsemble.load(VIENNA_MODEL_PATH)
        # instantiate a SequentialProcessor
        super().__init__((pre_processor, nn))

        self.adsr = ADSRMaestro()
class MadmomSpectrogramProvider(VisualisationContract):
    """
    Implementation of a VisualisationContract. This class
    computes new spectrograms based on the most recent
    audio chunk, which is indicated via ``tGroundTruth``.

    Attributes
    ----------
    sig_proc : madmom.Processor
        processor which outputs sampled audio signals
    fsig_proc : madmom.Processor
        processor which produces overlapping frames based on sampled signals
    spec_proc : madmom.Processor
        processor which computes a spectrogram with stft based on framed signals
    filt_proc : madmom.Processor
        processor which filters and scales a spectrogram
    processorPipeline : SequentialProcessor
        creates pipeline of elements of type madmom.Processor
    sliding_window : 2d numpy array
        cache for previously calculated spectrograms
    lastProceededGroundTruth : int
        variable to keep track of the last processed audio chunk
    visThread:
        reference pointing to the sliding window thread

    Methods
    -------
    start()
       starts all necessary sub tasks of this visualizer.
    stop()
       stops all necessary sub tasks of this visualizer.
    computeSpectrogram()
       compute a spectrogram based on the most current audio chunk.
    """

    # madmom pipeline for spectrogram calculation
    sig_proc = SignalProcessor(num_channels=1, sample_rate=32000, norm=True)
    fsig_proc = FramedSignalProcessor(frame_size=1024, hop_size=128, origin='future')
    spec_proc = SpectrogramProcessor(frame_size=1024)
    filt_proc = LogarithmicFilteredSpectrogramProcessor(filterbank=LogFilterbank, num_bands=26, fmin=20, fmax=14000)
    processorPipeline = SequentialProcessor([sig_proc, fsig_proc, spec_proc, filt_proc])

    def __init__(self, condition):
        """
        Parameters
        ----------
        sliding_window : 2d numpy array
           cache for previously calculated spectrograms
        lastProceededGroundTruth : int
           variable to keep track of the last processed audio chunk
        """

        # sliding window as cache
        self.sliding_window = np.zeros((128, 256), dtype=np.float32)
        self.lastProceededGroundTruth = None
        self.condition = condition

    def start(self):
        """Start all sub tasks necessary for continuous spectrograms.
        """
        self.visThread = VisualisationThread(self)
        self.visThread.start()

    def stop(self):
        """Stops all sub tasks
        """
        self.visThread.join()

    def computeSpectrogram(self):
        """This methods first access the global time variable ``tGroundTruth``
        and reads audio chunk the time variable points to. Afterwards, the defined
        madmom pipeline is processed to get the spectrogram representation of the
        single chunk. Finally, the sliding window is updated with the new audio chunk
        and a copy of the sliding window is returned to the calling thread.

        Returns
        -------
        sliding_window : 2d numpy array of float values
            returns a copy of the current sliding window spectrogram
        """
        # if thread faster than producer, do not consume same chunk multiple times
        t = self.manager.tGroundTruth
        if t != self.lastProceededGroundTruth:
            frame = self.manager.sharedMemory[(t - 1) % BUFFER_SIZE]   # modulo avoids index under/overflow
            frame = np.frombuffer(frame, np.int16)
            spectrogram = self.processorPipeline.process(frame)

            frame = spectrogram[0]
            if np.any(np.isnan(frame)):
                frame = np.zeros_like(frame, dtype=np.float32)

            # update sliding window
            self.sliding_window[:, 0:-1] = self.sliding_window[:, 1::]
            self.sliding_window[:, -1] = frame

            self.lastProceededGroundTruth = t

        return self.sliding_window.copy()
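
# Standalone sketch of the sliding-window update used above (assumed sizes:
# 128 filterbank bands, 256 cached frames): shift the cache one column to the
# left and append the newest spectrogram frame on the right.
import numpy as np

window = np.zeros((128, 256), dtype=np.float32)
new_frame = np.random.rand(128).astype(np.float32)  # stand-in for spectrogram[0]
window[:, 0:-1] = window[:, 1:]
window[:, -1] = new_frame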
Example 15
    # ! Log-magnitude log-frequency spectrogram
    num_bands = 24
    fmin = 65
    fmax = 2100

    # torch
    torch_lmlf = LMLFSpectrogram(sample_rate=sr,
                                 filter_length=filter_length,
                                 hop_length=hop_length,
                                 num_bands=num_bands,
                                 fmin=fmin,
                                 fmax=fmax)
    lmlf = torch_lmlf(real_wave.unsqueeze(0))

    # madmom
    _sig = SignalProcessor(num_channels=1, sample_rate=sr)
    _frames = FramedSignalProcessor(frame_size=filter_length,
                                    fps=sr / hop_length)
    _stft = ShortTimeFourierTransformProcessor()  # caching FFT window
    _spec = LogarithmicFilteredSpectrogramProcessor(num_bands=num_bands,
                                                    fmin=fmin,
                                                    fmax=fmax)
    sig = _sig(librosa.util.example_audio_file())
    frames = _frames(sig)
    stft = _stft(frames)
    spec = _spec(stft)

    diff = np.mean(np.abs(lmlf.squeeze(0).numpy() - spec))
    print('===== log-magnitude log-frequency spectrogram =====')
    print('mean difference between outputs from torch and madmom : ', diff)
    print('shape : ', lmlf.shape)
def main():
    """PianoTranscriptor"""
    
    # define parser
    p = argparse.ArgumentParser(
                                formatter_class=argparse.RawDescriptionHelpFormatter, description='''
        The PianoTranscriptor program detects all notes (onsets) in an audio file
        according to the algorithm described in:
        
        "Polyphonic Piano Note Transcription with Recurrent Neural Networks"
        Sebastian Böck and Markus Schedl.
        Proceedings of the 37th International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2012.
        
        Instead of 'LSTM' units, the current version uses 'tanh' units.
        
        This program can be run in 'single' file mode to process a single audio
        file and write the detected notes to STDOUT or the given output file.
        
        $ PianoTranscriptor single INFILE [-o OUTFILE]
        
        If multiple audio files should be processed, the program can also be run
        in 'batch' mode to save the detected notes to files with the given suffix.
        
        $ PianoTranscriptor batch [-o OUTPUT_DIR] [-s OUTPUT_SUFFIX] LIST OF FILES
        
        If no output directory is given, the program writes the files with the detected notes to the same location as the audio files.
        
        The 'pickle' mode can be used to store the used parameters to be able to exactly reproduce experiments.
                                    ''')

    # version
    p.add_argument('--version', action='version',
                   version='PianoTranscriptor.2013')
    # input/output arguments
    io_arguments(p, output_suffix='.notes.txt')
    ActivationsProcessor.add_arguments(p)
    # signal processing arguments
    SignalProcessor.add_arguments(p, norm=False, gain=0, start=True, stop=True)
    # peak picking arguments
    PeakPickingProcessor.add_arguments(p, threshold=0.35, smooth=0.09,
                                       combine=0.05)
    # midi arguments
    # import madmom.utils.midi as midi
    # midi.MIDIFile.add_arguments(p, length=0.6, velocity=100)
    p.add_argument('--midi', dest='output_format', action='store_const',
                   const='midi', help='save as MIDI')
    # mirex stuff
    p.add_argument('--mirex', dest='output_format', action='store_const',
                   const='mirex', help='use the MIREX output format')

    # parse arguments
    args = p.parse_args()
    
    # set immutable defaults
    args.fps = 100
    args.pre_max = 1. / args.fps
    args.post_max = 1. / args.fps
                   
    # set the suffix for midi files
    if args.output_format == 'midi':
        args.output_suffix = '.mid'

    # print arguments
    if args.verbose:
        print(args)

    # input processor
    if args.load:
        # load the activations from file
        in_processor = ActivationsProcessor(mode='r', **vars(args))
    else:
        # use a RNN to predict the notes
        in_processor = RNNPianoNoteProcessor()
    
    # output processor
    if args.save:
        # save the RNN note activations to file
        out_processor = ActivationsProcessor(mode='w', **vars(args))
    else:
        # perform peak picking on the activation function
        peak_picking = PeakPickingProcessor(**vars(args))
        # output everything in the right format
        if args.output_format is None:
            output = write_notes
        elif args.output_format == 'midi':
            output = write_midi
        elif args.output_format == 'mirex':
            output = write_mirex_format
        else:
            raise ValueError('unknown output format: %s' % args.output_format)
        out_processor = [peak_picking, output]
    
    # create an IOProcessor
    processor = IOProcessor(in_processor, out_processor)
    
    # and call the processing function
    args.func(processor, **vars(args))
Example 17
def main():
    """DBNBeatTracker"""

    # define parser
    p = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description='''
    The DBNBeatTracker.py program detects all beats in an audio file according to
    the method described in:

    "A Multi-Model Approach to Beat Tracking Considering Heterogeneous Music
     Styles"
    Sebastian Böck, Florian Krebs and Gerhard Widmer.
    Proceedings of the 15th International Society for Music Information
    Retrieval Conference (ISMIR), 2014.

    It does not use the multi-model (Section 2.2.) and selection stage (Section
    2.3), i.e. this version corresponds to the pure DBN version of the
    algorithm for which results are given in Table 2.

    Instead of the originally proposed state space and transition model for the
    DBN, the following is used:

    "An Efficient State Space Model for Joint Tempo and Meter Tracking"
    Florian Krebs, Sebastian Böck and Gerhard Widmer.
    Proceedings of the 16th International Society for Music Information
    Retrieval Conference (ISMIR), 2015.

    This program can be run in 'single' file mode to process a single audio
    file and write the detected beats to STDOUT or the given output file.

      $ DBNBeatTracker.py single INFILE [-o OUTFILE]

    If multiple audio files should be processed, the program can also be run
    in 'batch' mode to save the detected beats to files with the given suffix.

      $ DBNBeatTracker.py batch [-o OUTPUT_DIR] [-s OUTPUT_SUFFIX] FILES

    If no output directory is given, the program writes the files with the
    detected beats to the same location as the audio files.

    The 'pickle' mode can be used to store the used parameters to be able to
    exactly reproduce experiments.

    ''')

    # version
    p.add_argument('--version',
                   action='version',
                   version='DBNBeatTracker.py.2016')
    # input/output options
    io_arguments(p, output_suffix='.beats.txt', online=True)
    ActivationsProcessor.add_arguments(p)
    # signal processing arguments
    SignalProcessor.add_arguments(p, norm=False, gain=0)
    # peak picking arguments
    DBNBeatTrackingProcessor.add_arguments(p)
    NeuralNetworkEnsemble.add_arguments(p, nn_files=None)

    # parse arguments
    args = p.parse_args()

    # set immutable arguments
    args.fps = 100

    # print arguments
    if args.verbose:
        print(args)

    # input processor
    if args.load:
        # load the activations from file
        in_processor = ActivationsProcessor(mode='r', **vars(args))
    else:
        # use a RNN to predict the beats
        in_processor = RNNBeatProcessor(**vars(args))

    # output processor
    if args.save:
        # save the RNN beat activations to file
        out_processor = ActivationsProcessor(mode='w', **vars(args))
    else:
        # track the beats with a DBN
        beat_processor = DBNBeatTrackingProcessor(**vars(args))
        # output handler
        from madmom.utils import write_events as writer
        # sequentially process everything
        out_processor = [beat_processor, writer]

    # create an IOProcessor
    processor = IOProcessor(in_processor, out_processor)
    # and call the processing function
    args.func(processor, **vars(args))
Example 18
                freqs = librosa.core.fft_frequencies(sr=sr, n_fft=n_fft)
                stft = librosa.perceptual_weighting(stft**2, freqs, ref=1.0, amin=1e-10, top_db=99.0)

            # apply mel filterbank
            spectrogram = librosa.feature.melspectrogram(S=stft, sr=sr, n_mels=n_mels, fmax=fmax)

            # keep spectrogram
            spectrograms.append(np.asarray(spectrogram))

        spectrograms = np.asarray(spectrograms)

        return spectrograms

processor_version1 = LibrosaProcessor()

sig_proc = SignalProcessor(num_channels=1, sample_rate=32000, norm=True)
fsig_proc = FramedSignalProcessor(frame_size=1024, hop_size=128, origin='future')
spec_proc = SpectrogramProcessor(frame_size=1024)
filt_proc = LogarithmicFilteredSpectrogramProcessor(filterbank=LogFilterbank, num_bands=26, fmin=20, fmax=14000)
processor_pipeline2 = [sig_proc, fsig_proc, spec_proc, filt_proc]
processor_version2 = SequentialProcessor(processor_pipeline2)
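
# Usage sketch (not part of the original example): both pipelines map a wav
# file to a spectrogram. Following the throwaway-wav pattern of the
# LMLFSpectrogram example above, a quick smoke test could look like:
#
#   import soundfile as sf
#   sf.write('_smoke.wav', np.random.uniform(-1, 1, 32000), 32000)
#   spec2 = processor_version2('_smoke.wav')  # frames x log-filtered bands
#   os.remove('_smoke.wav')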


if __name__ == "__main__":
    """ main """

    # add argument parser
    parser = argparse.ArgumentParser(description='Pre-compute spectrograms for training and testing.')
    parser.add_argument('--audio_path', help='path to audio files.')
    parser.add_argument('--spec_path', help='path where to store spectrograms.')
    parser.add_argument('--show', help='show spectrogram plots.', type=int, default=None)
Example 19
import music21

import madmom.utils.midi as mm_midi
# import madmom.utils.midi_old as mm_midi
from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
from madmom.audio.filters import LogarithmicFilterbank
from madmom.audio.spectrogram import FilteredSpectrogramProcessor, LogarithmicSpectrogramProcessor
from madmom.processors import SequentialProcessor

# init signal processing
SAMPLE_RATE = 22050
FRAME_SIZE = 2048
FPS = 20

sig_proc = SignalProcessor(num_channels=1, sample_rate=SAMPLE_RATE)
fsig_proc = FramedSignalProcessor(frame_size=FRAME_SIZE,
                                  fps=FPS,
                                  origin='future')
spec_proc = FilteredSpectrogramProcessor(
    LogarithmicFilterbank, num_bands=16, fmin=30,
    fmax=6000)  # num_bands=24, fmin=30, fmax=8000
log_spec_proc = LogarithmicSpectrogramProcessor()
processor = SequentialProcessor(
    [sig_proc, fsig_proc, spec_proc, log_spec_proc])
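
# Usage sketch (not part of the original example): at FPS = 20 the pipeline
# yields 20 filtered log-spectrogram frames per second of audio;
# 'performance.wav' is a placeholder path.
#
#   spec = processor('performance.wav')  # 2D array: num_frames x filter bands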

colors = ['c', 'm', 'y']


def notes_to_onsets(notes, dt):
    """ Convert sequence of keys to onset frames """
class DcasePredictorProvider(PredictorContract):
    """
    Implementation of a PredictorContract. This class
    takes spectrograms as input and uses a convolutional
    neural network to produce class probabilities.

    Attributes
    ----------
    sig_proc : madmom.Processor
        processor which outputs sampled audio signals
    fsig_proc : madmom.Processor
        processor which produces overlapping frames based on sampled signals
    spec_proc : madmom.Processor
        processor which computes a spectrogram with stft based on framed signals
    filt_proc : madmom.Processor
        processor which filters and scales a spectrogram
    processorPipeline : SequentialProcessor
        creates pipeline of elements of type madmom.Processor
    classes : list of str
        class list
    device : str
        indicates the processor to be used for neural network prediction
    prediction_model : baseline_net.Net
        holds a reference to the CNN architecture
    sliding_window : 2d numpy array
        cache for previously calculated spectrograms
    lastProceededGroundTruth : int
        variable to keep track of the last processed audio chunk
    slidingWindowThread:
        reference pointing to the sliding window thread
    predictionThread:
        reference pointing to the prediction thread

    Methods
    -------
    start()
       starts all necessary sub tasks of this predictor.
    stop()
       stops all necessary sub tasks of this predictor.
    computeSpectrogram()
       compute a spectrogram based on the most current audio chunk.
    predict()
       CNN prediction based on current spectrogram input.
    """
    # madmom pipeline for spectrogram calculation
    sig_proc = SignalProcessor(num_channels=1, sample_rate=32000, norm=True)
    fsig_proc = FramedSignalProcessor(frame_size=1024, hop_size=128, origin='future')
    spec_proc = SpectrogramProcessor(frame_size=1024)
    filt_proc = LogarithmicFilteredSpectrogramProcessor(filterbank=LogFilterbank, num_bands=26, fmin=20, fmax=14000)
    processorPipeline = SequentialProcessor([sig_proc, fsig_proc, spec_proc, filt_proc])

    classes = ["Acoustic_guitar", "Applause", "Bark", "Bass_drum", "Burping_or_eructation", "Bus", "Cello", "Chime",
               "Clarinet", "Computer_keyboard", "Cough", "Cowbell", "Double_bass", "Drawer_open_or_close",
               "Electric_piano",
               "Fart", "Finger_snapping", "Fireworks", "Flute", "Glockenspiel", "Gong", "Gunshot_or_gunfire",
               "Harmonica",
               "Hi-hat", "Keys_jangling", "Knock", "Laughter", "Meow", "Microwave_oven", "Oboe", "Saxophone",
               "Scissors",
               "Shatter", "Snare_drum", "Squeak", "Tambourine", "Tearing", "Telephone", "Trumpet",
               "Violin_or_fiddle",
               "Writing"]

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    def __init__(self, condition):
        """
        Parameters
        ----------
        prediction_model : baseline_net.Net
           holds a reference to the CNN architecture
        sliding_window : 2d numpy array
           cache for previously calculated spectrograms
        lastProceededGroundTruth : int
           variable to keep track of the last processed audio chunk
        """
        # load model with its tuned weight parameters
        self.prediction_model = Net()
        self.prediction_model.load_state_dict(
            torch.load(os.path.join(PROJECT_ROOT,
                                    'server/consumer/predictors/dcase_predictor_provider/baseline_net.pt'),
                       map_location=lambda storage, location: storage))
        self.prediction_model.to(self.device)
        self.prediction_model.eval()

        # sliding window as cache
        self.sliding_window = np.zeros((128, 256), dtype=np.float32)
        self.lastProceededGroundTruth = None
        self.condition = condition

    def start(self):
        """Start all sub tasks necessary for continuous prediction.
        """
        self.slidingWindowThread = SlidingWindowThread(self)
        self.predictionThread = PredictionThread(self)
        self.slidingWindowThread.start()
        self.predictionThread.start()

    def stop(self):
        """Stops all sub tasks
        """
        self.slidingWindowThread.join()
        self.predictionThread.join()

    def computeSpectrogram(self):
        """This methods first access the global time variable ``tGroundTruth``
        and reads audio chunk the time variable points to. Afterwards, the defined
        madmom pipeline is processed to get the spectrogram representation of the
        single chunk. Finally, the sliding window is updated with the new audio chunk.
        """

        t = self.manager.tGroundTruth
        # if thread faster than producer, do not consume same chunk multiple times
        if t != self.lastProceededGroundTruth:
            frame = self.manager.sharedMemory[(t - 1) % BUFFER_SIZE]   # modulo avoids index under/overflow
            frame = np.frombuffer(frame, np.int16)
            spectrogram = self.processorPipeline.process(frame)

            frame = spectrogram[0]
            if np.any(np.isnan(frame)):
                frame = np.zeros_like(frame, dtype=np.float32)

            # update sliding window
            self.sliding_window[:, 0:-1] = self.sliding_window[:, 1::]
            self.sliding_window[:, -1] = frame

            self.lastProceededGroundTruth = t

    def predict(self):
        """ This method executes the actual prediction task based on the
        currently available slinding window. The sliding window is sent
        into the CNN model and the correpsonding softmax output for the
        respecive classes are returned

        Returns
        -------
        probs : array of list objects
            an array of number of classes entries where each entry consists of
            the class name, its predicted probability and a position index.
            Example:
            ``[["class1", 0.0006955251446925104, 0], ["class2", 0.0032770668622106314, 1], ...]``
        """

        model_input = self.sliding_window[np.newaxis, np.newaxis]
        cuda_torch_input = torch.from_numpy(model_input).to(self.device)
        model_output = self.prediction_model(cuda_torch_input)  # prediction by model
        softmax = nn.Softmax(dim=1)
        softmax_output = softmax(model_output)
        predicts = softmax_output.cpu().detach().numpy().flatten()
        probs = [[elem, predicts[index].item(), index] for index, elem in enumerate(self.classes)]
        return probs
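
    # Usage sketch (not part of the original example): the returned
    # [name, probability, index] triples can be ranked by probability, e.g.
    #
    #   top5 = sorted(provider.predict(), key=lambda p: p[1], reverse=True)[:5]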
    def __init__(self, audio_path, target_sr=16000):
        from madmom.audio.signal import SignalProcessor
        self.target_sr = target_sr
        self.processor = SignalProcessor(num_channels=1,
                                         sample_rate=self.target_sr)
        super().__init__(audio_path)