def computeSegmentation(filename, pool):
    """Compute SBic segment boundaries for an audio file and store them in pool.

    A streaming network (loader -> frame cutter -> window -> spectrum -> MFCC)
    collects the MFCC outputs of every frame into a temporary pool. The MFCC
    coefficient matrix is then handed to the standard-mode ``SBic`` algorithm
    and each returned boundary is converted from a frame index to
    ``index * hopSize / sampleRate`` before being appended to
    ``pool['segments']``.

    Args:
        filename: path of the audio file passed to ``EqloudLoader``.
        pool: pool-like object; ``pool['downmix']`` is read to configure the
            loader and the results are added under the ``'segments'`` key.

    Returns:
        None. Results are written into ``pool``.
    """
    sampleRate = 44100
    frameSize = 2048
    # Integer division: on Python 3 ``frameSize / 2`` is a float, but the hop
    # size handed to FrameCutter must be an integer number of samples.
    hopSize = frameSize // 2

    audio = EqloudLoader(filename=filename,
                         downmix=pool['downmix'],
                         sampleRate=sampleRate)
    fc = FrameCutter(frameSize=frameSize, hopSize=hopSize, silentFrames='keep')
    w = Windowing(type='blackmanharris62')
    spec = Spectrum()
    mfcc = MFCC(highFrequencyBound=8000)
    tmpPool = essentia.Pool()

    # Wire up the streaming network.
    audio.audio >> fc.signal
    fc.frame >> w.frame >> spec.frame
    spec.spectrum >> mfcc.spectrum
    # The bands are stored as well, although only 'mfcc_coeff' is read below.
    mfcc.bands >> (tmpPool, 'mfcc_bands')
    mfcc.mfcc >> (tmpPool, 'mfcc_coeff')
    essentia.run(audio)

    # Transpose the feature array and take a deep copy of the result; per the
    # original author's note, passing a numpy.matrix.transpose result straight
    # to essentia does not work.
    features = copy.deepcopy(tmpPool['mfcc_coeff'].transpose())
    segments = std.SBic(cpw=1.5, size1=1000, inc1=300,
                        size2=600, inc2=50)(features)
    for segment in segments:
        # frame index -> index * hopSize / sampleRate
        pool.add('segments', segment * hopSize / sampleRate)
def main_segment(args):
    """Perform segmentation of a piece of audio.

    The idea is to identify different parts in a recording and annotate
    them for editing further down the pipeline.

    For every frame of the audio the magnitude spectrum is computed and fed
    to a set of feature extractors (see ``_segment_feature_specs``). Each
    resulting feature-gram is segmented with essentia's ``SBic``; the grams,
    the segment indices and (if a ``<file>_labels.txt`` file exists next to
    the audio file) the reference labels are plotted into one figure which is
    saved to ``data/music_features/segment_<basename>.png``.

    TODO
    - framesize hierarchy / pyramid
    - extend criteria with existing alternative criteria
    - extend criteria with our own predictability criteria
    - clustering on spec frame pyramid
    - recurrence plot on feature frames

    Args:
        args: namespace providing ``file``, ``samplerate``,
            ``frame_size_low_level``, ``hop_size_low_level`` (may be None),
            the ``sbic_*`` parameters, plus whatever ``loadaudio`` reads.

    Returns:
        None.
    """
    # FFT framesize and hopsize parameters. Integer division keeps the
    # default hop size an int on Python 3 (``/`` would yield a float).
    frame_size = args.frame_size_low_level
    hop_size = args.hop_size_low_level
    if hop_size is None:
        hop_size = frame_size // 2

    # load the audio
    audio = loadaudio(args)
    logger.debug('audio loaded, type = %s', audio.dtype)

    # Reference labels, if a label file exists. splitext copes with
    # extensions of any length (the former ``[:-4]`` assumed three chars).
    labelfile = os.path.splitext(args.file)[0] + '_labels.txt'
    audio_labels = _segment_load_label_frames(
        labelfile, args.samplerate, hop_size)
    logger.debug('labels = %s', audio_labels)

    window = estd.Windowing(type='hamming')
    # Spectrum returns the magnitude spectrum (FFT would return the complex one)
    spectrum = estd.Spectrum()

    # Instantiate every feature operator and give it an empty frame list.
    features = _segment_feature_specs()
    for fv in features.values():
        fv['inst'] = fv['op'](**fv['opargs'])
        fv['gram'] = []

    # segmentation operator
    sbic = estd.SBic(
        cpw=args.sbic_complexity_penalty_weight,
        inc1=args.sbic_inc1,
        inc2=args.sbic_inc2,
        minLength=args.sbic_minlength,
        size1=args.sbic_size1,
        size2=args.sbic_size2,
    )

    numframes = 0
    frame_shape = None
    spec_bins = 0
    logger.debug(
        'main_segment: computing spec and features for audio of size %s',
        audio.shape)
    frames = estd.FrameGenerator(
        audio, frameSize=frame_size, hopSize=hop_size, startFromZero=True)
    for frame in frames:
        spec = spectrum(window(frame))
        spec_bins = len(spec)
        for fv in features.values():
            out = fv['inst'](spec)
            # Multi-output operators return a tuple; 'opout' picks the
            # element that is used as the feature vector.
            if isinstance(out, tuple):
                out = out[fv['opout'][0]]
            fv['gram'].append(out)
        numframes += 1
        frame_shape = frame.shape
        if numframes % 10000 == 0:
            logger.debug('main_segment: crunched %d frames of shape %s',
                         numframes, frame_shape)
    logger.debug('main_segment: crunched %d frames of shape %s',
                 numframes, frame_shape)

    if numframes == 0:
        # Nothing to segment or plot (this used to fail with a NameError).
        logger.warning('main_segment: no frames generated for %s', args.file)
        return

    # The spectrogram itself is not used further, so only its would-be shape
    # (bins x frames) is reported instead of keeping every spectrum in memory.
    logger.debug("main_segment: %s-gram = %s", 'spec', (spec_bins, numframes))

    for fk, fv in features.items():
        # list of per-frame vectors -> (feature dims x frames) array
        fv['gram'] = np.array(fv['gram']).T
        logger.debug("main_segment computing sbic for %s-gram = %s",
                     fk, fv['gram'].shape)
        fv['segidx'] = sbic(fv['gram'])
        logger.debug("%s seg |indices[frame]| = %s", fk, len(fv['segidx']))

    basename = os.path.basename(args.file)
    logger.debug('main_segment: starting plot of %d grams', len(features))
    fig = _segment_plot_grams(
        features,
        audio_labels,
        "part segmentation for %s with fs=%d, hs=%d" % (
            basename, frame_size, hop_size),
        'feature-gram, framesize = %d' % (args.frame_size_low_level, ),
    )
    logger.debug('main_segment: done plotting of %d grams', len(features))

    plt.draw()
    plt.pause(1e-9)

    logger.debug('saving figure for %s', args.file)
    outdir = 'data/music_features'
    # Make sure the output directory exists so savefig cannot fail on it.
    if not os.path.isdir(outdir):
        os.makedirs(outdir)
    fig.set_size_inches((12, 3 * len(features)))
    fig.savefig('%s/segment_%s.png' % (outdir, basename),
                dpi=100, bbox_inches='tight')
    logger.debug('done saving, next')


def _segment_feature_specs():
    """Return a fresh table of the feature operators used by main_segment.

    Each entry maps a feature name to a dict with the essentia algorithm
    class ('op'), its constructor arguments ('opargs') and, for algorithms
    that return several outputs, the index of the output to use ('opout').
    """
    return {
        'BarkBands': {
            'op': estd.BarkBands,
            'opargs': {'numberBands': 12},  # alternative noted: 28
            'opout': [0],
        },
        'ERBBands': {
            'op': estd.ERBBands,
            'opargs': {'numberBands': 12},  # alternative noted: 40
            'opout': [0],
        },
        'MFCC': {
            'op': estd.MFCC,
            'opargs': {
                'numberBands': 20,  # alternative noted: 40
                'numberCoefficients': 10,
                'highFrequencyBound': 10000,
                'logType': 'dbamp',
                'normalize': 'unit_sum',
            },
            'opout': [1],
        },
        'GFCC': {
            'op': estd.GFCC,
            'opargs': {
                'numberBands': 20,  # alternative noted: 40
                'numberCoefficients': 10,
                'highFrequencyBound': 10000,
                'logType': 'dbamp',
            },
            'opout': [1],
        },
        'LPC': {
            'op': estd.LPC,
            'opargs': {'order': 8, 'type': 'regular'},  # alternative order: 20
            'opout': [1],
        },
        'MelBands': {
            'op': estd.MelBands,
            'opargs': {'numberBands': 10},
            'opout': [0],
        },
    }


def _segment_load_label_frames(labelfile, samplerate, hop_size):
    """Load label positions from a tab-separated file as frame indices.

    Only the first column is read (presumably times in seconds -- confirm
    against the label files in use) and converted with
    ``value * samplerate / hop_size``.

    Returns:
        1-D numpy array of frame positions, or None if the file is missing.
    """
    if not os.path.exists(labelfile):
        return None
    first_col = np.genfromtxt(labelfile, delimiter='\t', usecols=(0,))
    # A file with a single row yields a 0-d array; normalise to 1-D so the
    # result can always be plotted as a sequence of positions.
    first_col = np.atleast_1d(first_col)
    return (first_col * samplerate) / hop_size


def _segment_plot_grams(features, audio_labels, title, xlabel):
    """Plot every feature-gram with its segment indices and optional labels.

    One subplot per feature is stacked vertically; segment indices are drawn
    as red dots at mid height, reference labels as green dots one unit above.
    Only the bottom subplot keeps its x tick labels and gets ``xlabel``.

    Returns:
        The matplotlib figure that was created.
    """
    fig = plt.figure()
    fig.suptitle(title)
    fig.show()

    grid = GridSpec(len(features), 1)
    last_axis = len(features) - 1
    for axi, (fk, fv) in enumerate(features.items()):
        gram = fv['gram']
        logger.debug('main_segment: plotting feature %s with shape %s',
                     fk, gram.shape)
        ax = fig.add_subplot(grid[axi])
        ax.title.set_text(fk)
        ax.title.set_position((0.1, 0.9))
        ax.pcolormesh(gram)

        mid_height = gram.shape[0] / 2
        if 'segidx' in fv:
            ax.plot(fv['segidx'],
                    mid_height * np.ones_like(fv['segidx']), 'ro')
        if audio_labels is not None:
            ax.plot(audio_labels,
                    (mid_height + 1) * np.ones_like(audio_labels),
                    'go', alpha=0.7)

        if axi < last_axis:
            # all but last axis
            ax.set_xticklabels([])
        else:
            ax.set_xlabel(xlabel)
    return fig