Code example #1
import copy

import essentia
import essentia.standard as std
from essentia.streaming import (EqloudLoader, FrameCutter, MFCC,
                                Spectrum, Windowing)


def computeSegmentation(filename, pool):
    sampleRate = 44100
    frameSize = 2048
    hopSize = frameSize // 2

    audio = EqloudLoader(filename=filename,
                         downmix=pool['downmix'],
                         sampleRate=sampleRate)

    fc = FrameCutter(frameSize=frameSize, hopSize=hopSize, silentFrames='keep')
    w = Windowing(type='blackmanharris62')
    spec = Spectrum()
    mfcc = MFCC(highFrequencyBound=8000)
    tmpPool = essentia.Pool()

    audio.audio >> fc.signal
    fc.frame >> w.frame >> spec.frame
    spec.spectrum >> mfcc.spectrum
    mfcc.bands >> (tmpPool, 'mfcc_bands')
    mfcc.mfcc >> (tmpPool, 'mfcc_coeff')

    essentia.run(audio)

    # transpose the feature array via an explicit deep copy; handing a
    # transposed numpy view straight to Essentia gives wrong results
    features = copy.deepcopy(tmpPool['mfcc_coeff'].transpose())
    segments = std.SBic(cpw=1.5, size1=1000, inc1=300, size2=600,
                        inc2=50)(features)
    for segment in segments:
        pool.add('segments', segment * hopSize / sampleRate)
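
A minimal usage sketch for computeSegmentation follows. The filename, the 'downmix' value and the surrounding pool handling are assumptions; only the 'segments' descriptor (boundary times in seconds) comes from the function above.

# Usage sketch (assumed driver, not part of the original example).
# computeSegmentation reads pool['downmix'] when configuring EqloudLoader,
# so the pool must carry that entry before the call.
pool = essentia.Pool()
pool.set('downmix', 'mix')                 # 'mix', 'left' or 'right'
computeSegmentation('recording.wav', pool)
print(pool['segments'])                    # segment boundary times in seconds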
Code example #2
import os
import logging

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

import essentia as e
import essentia.standard as estd

# loadaudio() is defined elsewhere in the surrounding module;
# a module-level logger is assumed here so the excerpt runs standalone.
logger = logging.getLogger(__name__)


def main_segment(args):
    """perform sgementation of a piece of audio

    The idea is to identify different parts in a recording and
    annotate them for editing further down the pipeline.

    TODO
     - framesize hierarchy/ pyramid
     - pimp criteria with existing alternative criteria
     - pimp criteria with our own predictability criteria
     - clustering on spec frame pyramid
     - recurrence plot on feature frames
    """

    # FFT framesize and hopsize parameters
    frameSize = args.frame_size_low_level
    hopSize = frameSize // 2
    if args.hop_size_low_level is not None:
        hopSize = args.hop_size_low_level

    # load the audio
    audio = loadaudio(args)
    logger.debug('audio loaded, type = %s' % (audio.dtype))
    audio_labels = None

    # check if labels exist
    labelfile = args.file[:-4] + '_labels.txt'
    if os.path.exists(labelfile):
        audio_labels_t = np.genfromtxt(labelfile, delimiter='\t')
        audio_labels = (audio_labels_t[:,0] * args.samplerate) / hopSize
        logger.debug('labels = %s', audio_labels)
    # frame = audio

    # init window func
    w = estd.Windowing(type = 'hamming')
    # init spectrum
    spectrum = estd.Spectrum()  # FFT() would return the complex FFT, here we just want the magnitude spectrum

    # feature operators
    features = {
        'BarkBands': {
            'op': estd.BarkBands,
            'opargs': {
                'numberBands': 12, # 28,
            },
            'opout': [0],
        },
        'ERBBands': {
            'op': estd.ERBBands,
            'opargs': {
                'numberBands': 12, # 40,
            },
            'opout': [0],
        },
        'MFCC':      {
            'op': estd.MFCC,
            'opargs': {
                'numberBands': 20, 'numberCoefficients': 10, 'highFrequencyBound': 10000, 'logType': 'dbamp', 'normalize': 'unit_sum', # 40 numbands
            },
            'opout': [1],
        },
        'GFCC':      {
            'op': estd.GFCC,
            'opargs': {
                'numberBands': 20, 'numberCoefficients': 10, 'highFrequencyBound': 10000, 'logType': 'dbamp', # 40 numberBands
            },
            'opout': [1],
        },
        'LPC':      {
            'op': estd.LPC,
            'opargs': {
                # 'order': 20, 'type': 'regular',
                'order': 8, 'type': 'regular',
            },
            'opout': [1],
        },
        'MelBands':  {
            'op': estd.MelBands,
            'opargs': {'numberBands': 10},
            'opout': [0],
        },
    }
    for fk, fv in list(features.items()):
        features[fk]['inst'] = fv['op'](**fv['opargs'])
        features[fk]['gram'] = []
        
        # # init mfcc features
        # mfcc = estd.MFCC()
    
    # segmentation operator
    sbic = estd.SBic(
        cpw = args.sbic_complexity_penalty_weight,
        inc1=args.sbic_inc1, inc2=args.sbic_inc2,
        minLength=args.sbic_minlength,
        size1=args.sbic_size1, size2=args.sbic_size2
    )
    
    # sbic = estd.SBic(cpw = 1.5, inc1 = 60, inc2 = 20, sbic_minlength = 10, size1 = 300, size2 = 200)
    # sbic = estd.SBic(cpw = 1.5, inc1 = 60, inc2 = 20, sbic_minlength = 80, size1 = 300, size2 = 200)
    # sbic = estd.SBic(cpw = 0.05, inc1 = 60, inc2 = 20, sbic_minlength = 120, size1 = 300, size2 = 200)
    # sbic = estd.SBic(cpw = 0.3, inc1 = 20, inc2 = 10, sbic_minlength = 10, size1 = 100, size2 = 70)

    # print "w", repr(w)
    # print "spectrum", repr(spectrum)
    # print "mfcc", repr(mfcc)

    # frame = audio[int(0.2*args.samplerate) : int(0.2*args.samplerate) + 1024]
    # print "frame.shape", frame.shape
    # spec = spectrum(w(frame))
    # mfcc_bands, mfcc_coeffs = mfcc(spec)
    
    pool = e.Pool()

    numframes = 0
    specgram = []
    # mfcc_bandsgram = []
    # mfcc_coefsgram = []
    logger.debug('main_segment: computing spec and features for audio of size %s', audio.shape)
    for frame in estd.FrameGenerator(audio, frameSize = frameSize, hopSize = hopSize, startFromZero=True):
        # mfcc_bands, mfcc_coeffs = mfcc(spectrum(w(frame)))
        # pool.add('lowlevel.mfcc', mfcc_coeffs)
        # pool.add('lowlevel.mfcc_bands', mfcc_bands)
        print(("frame", frame.shape))
        # frame = np.atleast_2d(frame)
        spec = spectrum(w(frame))
        specgram.append(spec)

        for fk, fv in list(features.items()):
            # logger.debug('computing feature %s', fk)
            fspec_ = fv['inst'](spec)
            # logger.debug('   type(fspec_) = %s', type(fspec_))
            fspec = fspec_
            
            if isinstance(fspec_, tuple):
                fspec = fspec_[fv['opout'][0]]

            fv['gram'].append(fspec)
            # mfcc_bands, mfcc_coefs = mfcc(spec)
            # mfcc_bandsgram.append(mfcc_bands)
            # mfcc_coefsgram.append(mfcc_coefs)

        numframes += 1
        if numframes % 10000 == 0:
            logger.debug('main_segment: crunched %d frames of shape %s', numframes, frame.shape)
    logger.debug('main_segment: crunched %d frames of shape %s', numframes, frame.shape)
    # sys.exit(0)

    specgram = np.array(specgram).T
    logger.debug("main_segment: %s-gram = %s", 'spec', specgram.shape)

    for fk, fv in list(features.items()):
        fv['gram'] = np.array(fv['gram']).T
        logger.debug("main_segment computing sbic for %s-gram = %s", fk, fv['gram'].shape)
        # mfcc_bandsgram = np.array(mfcc_bandsgram).T
        # mfcc_coefsgram = np.array(mfcc_coefsgram).T
        # print "segmenting mfcc_bandsgram", mfcc_bandsgram.shape
        # print "segmenting mfcc_coefsgram", mfcc_coefsgram.shape
        
        # segidx = sbic(specgram)
        # segidx = sbic(mfcc_bandsgram)
        # segidx = sbic(mfcc_coefsgram)
        fv['segidx'] = sbic(fv['gram'])
        # pool.add('segment.sbic', segidx)
        # logger.debug("    pool['segment.sbic'] = %s", pool['segment.sbic'])

        
        # logger.debug("%s seg indices[frame] = %s" % (fk, fv['segidx'], ))
        # logger.debug("       indices[time]  = %s" % ((fv['segidx'] * hopSize) / args.samplerate, ))
        logger.debug("%s seg |indices[frame]| = %s" % (fk, len(fv['segidx']), ))
        # logger.debug("       indices[time]  = %s" % ((fv['segidx'] * hopSize) / args.samplerate, ))
        # logger.debug("       framesize = %d, hopsize = %d" % (frameSize, hopSize))

    # alias the feature dict for plotting; the raw spectrogram could be added here too
    features_ = features
    # features_['Spectrum'] = {'gram': specgram[1:40,...]}

    logger.debug('main_segment: starting plot of %d grams', len(features_))
    fig = plt.figure()
    fig.suptitle("part segmentation for %s with fs=%d, hs=%d" % (args.file.split('/')[-1], frameSize, hopSize))
    fig.show()
    gs = GridSpec(len(features_), 1)

    axi = 0
    for fk, fv in list(features_.items()):
        logger.debug('main_segment: plotting feature %s with shape %s', fk, fv['gram'].shape)
        ax = fig.add_subplot(gs[axi])
        ax.title.set_text(fk)
        ax.title.set_position((0.1, 0.9))
        ax.pcolormesh(fv['gram'])
        if 'segidx' in fv:
            ax.plot(fv['segidx'], fv['gram'].shape[0]/2 * np.ones_like(fv['segidx']), 'ro')
        if audio_labels is not None:
            ax.plot(audio_labels, (fv['gram'].shape[0]/2 + 1) * np.ones_like(audio_labels), 'go', alpha=0.7)
        if axi < (len(features) - 1): # all but last axis
            ax.set_xticklabels([])
        else:
            ax.set_xlabel('feature-gram, framesize = %d' % (args.frame_size_low_level, ))
        axi += 1
    # ax1 = fig.add_subplot(3, 1,1)
    # ax1.pcolormesh(specgram)
    # # for segidx in fv['segidx']:
    # ax1.plot(fv['segidx'], specgram.shape[0]/2 * np.ones_like(fv['segidx']), 'ro')
    # ax2 = fig.add_subplot(3, 1,2)
    # ax2.pcolormesh(mfcc_bandsgram)
    # ax2.plot(fv['segidx'], mfcc_bandsgram.shape[0]/2 * np.ones_like(fv['segidx']), 'ro')
    # ax3 = fig.add_subplot(3, 1,3)
    
    # ax3 = fig.add_subplot(1, 1, 1)
    # ax3.pcolormesh(mfcc_coefsgram)
    # ax3.plot(fv['segidx'], mfcc_coefsgram.shape[0]/2 * np.ones_like(fv['segidx']), 'ro')

    logger.debug('main_segment: done plotting of %d grams', len(features_))
    plt.draw()
    plt.pause(1e-9)

    logger.debug('saving figure for %s' % (args.file, ))
    fig.set_size_inches((12, 3 * len(features)))
    fig.savefig('data/music_features/segment_%s.png' % (args.file.split('/')[-1]), dpi = 100, bbox_inches = 'tight')
    logger.debug('done saving, next')
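
For reference, a sketch of the command-line arguments main_segment expects. The attribute names come from the code above; the flag spellings, the make_segment_parser helper and all default values are assumptions (the SBic defaults follow one of the commented-out parameter sets).

# Hypothetical argument parser for main_segment; only the attribute names are
# taken from the code above, flag spellings and defaults are assumptions.
import argparse

def make_segment_parser():
    p = argparse.ArgumentParser(description='SBic-based part segmentation')
    p.add_argument('--file', required=True, help='input audio file')
    p.add_argument('--samplerate', type=int, default=44100)
    p.add_argument('--frame-size-low-level', dest='frame_size_low_level',
                   type=int, default=2048)
    p.add_argument('--hop-size-low-level', dest='hop_size_low_level',
                   type=int, default=None)
    p.add_argument('--sbic-complexity-penalty-weight',
                   dest='sbic_complexity_penalty_weight', type=float, default=1.5)
    p.add_argument('--sbic-size1', dest='sbic_size1', type=int, default=300)
    p.add_argument('--sbic-inc1', dest='sbic_inc1', type=int, default=60)
    p.add_argument('--sbic-size2', dest='sbic_size2', type=int, default=200)
    p.add_argument('--sbic-inc2', dest='sbic_inc2', type=int, default=20)
    p.add_argument('--sbic-minlength', dest='sbic_minlength', type=int, default=10)
    return p

if __name__ == '__main__':
    main_segment(make_segment_parser().parse_args())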