Example #1
0
def energyThresholdAudio(soundfilesList):


    for sound in soundfilesList:
        RMS = esst.RMS()
        audioLoader = esst.MonoLoader(filename=sound)
        audio = audioLoader()


        start=0
        end=0
        thresh=0.05
        rms_vals=[]
        for frame in esst.FrameGenerator(audio, frameSize=2048, hopSize=1024, startFromZero=True):
            rms = RMS(frame)
            rms_vals.append(float(rms))
        rms_vals  = np.array(rms_vals)

        higher=np.where(rms_vals >= thresh)[0]
        if len(higher) > 1:
            start=higher[0]
            end=higher[-1]

        else:
            continue

        newAudio = audio[start*1024:end*1024]
        writer = esst.MonoWriter(filename=sound, format="mp3")
        writer(newAudio)
        print (sound)
Example #2
0
def rms(audio,params):
    """ hop size, frame size, window type """
    hopSize, frameSize, wtype = params
    w = Windowing(type=wtype)
    spec = Spectrum()
    result = []
    RMS = ess.RMS()
    for frame in ess.FrameGenerator(audio, frameSize = frameSize, hopSize = hopSize):
        sf = spec(w(frame))
        result.append(RMS(sf))
    return np.asarray(result),hopSize
Example #3
0
def analyze_misc(filename, segment_duration=20):

    # Compute replay gain and duration on the entire file, then load the
    # segment that is centered in time with replaygain applied
    audio = es.MonoLoader(filename=filename)()
    replaygain = es.ReplayGain()(audio)

    segment_start = (len(audio) / 44100 - segment_duration) / 2
    segment_end = segment_start + segment_duration

    if segment_start < 0 or segment_end > len(audio) / 44100:
        raise ValueError(
            'Segment duration is larger than the input audio duration')

    loader = es.EasyLoader(filename=filename,
                           replayGain=replaygain,
                           startTime=segment_start,
                           endTime=segment_end)

    windowing = es.Windowing(type='blackmanharris62')
    spectrum = es.Spectrum()
    powerspectrum = es.PowerSpectrum()
    centroid = es.Centroid()
    zcr = es.ZeroCrossingRate()
    rms = es.RMS()
    hfc = es.HFC()
    pool = essentia.Pool()

    audio = loader()
    for frame in es.FrameGenerator(audio, frameSize=2048, hopSize=1024):
        frame_spectrum = spectrum(windowing(frame))
        pool.add('rms', rms(frame))
        pool.add('rms_spectrum', rms(frame_spectrum))
        pool.add('hfc', hfc(frame_spectrum))
        pool.add('spectral_centroid', centroid(frame_spectrum))
        pool.add('zcr', zcr(frame))

    audio_st, sr, _, _, _, _ = es.AudioLoader(filename=filename)()
    # Ugly hack because we don't have a StereoResample
    left, right = es.StereoDemuxer()(audio_st)
    resampler = es.Resample(inputSampleRate=sr, outputSampleRate=44100)
    left = resampler(left)
    right = resampler(right)
    audio_st = es.StereoMuxer()(left, right)
    audio_st = es.StereoTrimmer(startTime=segment_start,
                                endTime=segment_end)(audio_st)
    ebu_momentary, _, _, _ = es.LoudnessEBUR128(hopSize=1024 / 44100,
                                                startAtZero=True)(audio_st)
    pool.set('ebu_momentary', ebu_momentary)

    return pool
Example #4
0
def postProcessTicks(audioFilename, ticks, ticksAmp, pool):
    '''Computes delta energy in order to find the correct position of the ticks'''
    # get rid of beats of beats > audio.length

    #    if t < 0 or t > pool['length']: continue
    #    ticks.append(float(t))
    #    ticksAmp.append(float(amp))

    #ticks = essentia.postProcessTicks(ticks, ticksAmp, 60./pool['harmonicBpm'][0]);
    beatWindowDuration = 0.01  # seconds
    beatDuration = 0.005  # seconds
    rmsFrameSize = 64
    rmsHopSize = rmsFrameSize / 2
    audio = std.MonoLoader(filename=audioFilename,
                           sampleRate=pool['samplerate'],
                           downmix=pool['downmix'])()
    for i, tick in enumerate(ticks):
        startTime = tick - beatWindowDuration / 2.0
        if startTime < 0: startTime = 0
        endTime = startTime + beatWindowDuration + beatDuration + 0.0001
        slice = std.Trimmer(sampleRate=pool['samplerate'],
                            startTime=startTime,
                            endTime=endTime)(audio)
        frames = std.FrameGenerator(slice,
                                    frameSize=rmsFrameSize,
                                    hopSize=rmsHopSize)
        maxDeltaRms = 0
        RMS = std.RMS()
        prevRms = 0
        pos = 0
        tickPos = pos
        for frame in frames:
            rms = RMS(frame)
            diff = rms - prevRms
            if diff > maxDeltaRms:
                tickPos = pos
                maxDeltaRms = diff
            pos += 1
            prevRms = rms
        ticks[i] = tick + tickPos * float(rmsHopSize) / pool['samplerate']
    return ticks
def highEnergyFrames(audioIn, threshold=0.05, strip=False):

    # strip: if True, only low-energy frames at the beginning and at the end are discarded

    RMS = ess.RMS()
    highEnergyAudio = []

    frames = ess.FrameGenerator(audioIn,
                                frameSize=2048,
                                hopSize=1024,
                                startFromZero=True)
    rmsValues = np.array([float(RMS(frame)) for frame in frames])
    highRMSFrames = np.where(rmsValues > threshold)[0]
    outSamples = [frame * 1024 for frame in highRMSFrames]

    if strip:
        # return the middle section of the audio
        highEnergyAudio = audioIn[outSamples[0]:outSamples[-1]] if len(
            highRMSFrames > 1) else audioIn
    else:
        # return all high-energy samples
        highEnergyAudio = audioIn[outSamples]

    return highEnergyAudio
## Tabla analysis and synthesis module for the HAMR 2015 ISMIR hack

import os
import essentia as es
import essentia.standard as ess
import numpy as np
import pickle
import glob
import utilFunctions as UF
import scipy.spatial.distance as DS

import parameters as params
import csv

rms=ess.RMS()
window = ess.Windowing(type = "hamming")
spec = ess.Spectrum(size=params.Nfft)
zz = np.zeros((params.zeropadLen,), dtype = 'float32')
genmfcc = ess.MFCC(highFrequencyBound = 22000.0, inputSize = params.Nfft/2+1, sampleRate = params.Fs)
hps = ess.HighPass(cutoffFrequency = 240.0)
onsets = ess.Onsets()

strokeLabels = ['dha', 'dhen', 'dhi', 'dun', 'ge', 'kat', 'ke', 'na', 'ne', 're', 'tak', 'te', 'tit', 'tun']

taals = {"teen": {"nmatra": 16, "accents": np.array([4, 1, 1, 1, 3, 1, 1, 1, 2, 1, 1, 1, 3, 1, 1, 1])}, 
         "ek": {"nmatra": 12, "accents": np.array([4, 1, 1, 2, 1, 1, 3, 1, 1, 2, 1, 1])},
         "jhap": {"nmatra": 10, "accents": np.array([4, 1, 2, 1, 1, 3, 1, 2, 1, 1])},
         "rupak": {"nmatra": 7, "accents": np.array([2, 1, 1, 3, 1, 3, 1])}
         }

rolls = [{"bol": ['dha/dha_02', 'te/te_05', 're/re_04', 'dha/dha_02'], "dur": np.array([1.0, 1.0, 1, 1]), "amp": np.array([1.0, 1.0, 1.0, 1.0])},
Example #7
0
def compute(audio, pool, options):
    # analysis parameters
    sampleRate = options['sampleRate']
    frameSize = options['frameSize']
    hopSize = options['hopSize']
    windowType = options['windowType']

    # temporal descriptors
    lpc = ess.LPC(order=10, type='warped', sampleRate=sampleRate)
    zerocrossingrate = ess.ZeroCrossingRate()

    # frame algorithms
    frames = ess.FrameGenerator(audio=audio, frameSize=frameSize, hopSize=hopSize)
    window = ess.Windowing(size=frameSize, zeroPadding=0, type=windowType)
    spectrum = ess.Spectrum(size=frameSize)

    # spectral algorithms
    barkbands = ess.BarkBands(sampleRate=sampleRate)
    centralmoments = ess.CentralMoments()
    crest = ess.Crest()
    centroid = ess.Centroid()
    decrease = ess.Decrease()
    spectral_contrast = ess.SpectralContrast(frameSize=frameSize,
                                             sampleRate=sampleRate,
                                             numberBands=6,
                                             lowFrequencyBound=20,
                                             highFrequencyBound=11000,
                                             neighbourRatio=0.4,
                                             staticDistribution=0.15)
    distributionshape = ess.DistributionShape()
    energy = ess.Energy()
    # energyband_bass, energyband_middle and energyband_high parameters come from "standard" hi-fi equalizers
    energyband_bass = ess.EnergyBand(startCutoffFrequency=20.0, stopCutoffFrequency=150.0, sampleRate=sampleRate)
    energyband_middle_low = ess.EnergyBand(startCutoffFrequency=150.0, stopCutoffFrequency=800.0, sampleRate=sampleRate)
    energyband_middle_high = ess.EnergyBand(startCutoffFrequency=800.0, stopCutoffFrequency=4000.0,
                                            sampleRate=sampleRate)
    energyband_high = ess.EnergyBand(startCutoffFrequency=4000.0, stopCutoffFrequency=20000.0, sampleRate=sampleRate)
    flatnessdb = ess.FlatnessDB()
    flux = ess.Flux()
    harmonic_peaks = ess.HarmonicPeaks()
    hfc = ess.HFC()
    mfcc = ess.MFCC()
    rolloff = ess.RollOff()
    rms = ess.RMS()
    strongpeak = ess.StrongPeak()

    # pitch algorithms
    pitch_detection = ess.PitchYinFFT(frameSize=frameSize, sampleRate=sampleRate)
    pitch_salience = ess.PitchSalience()

    # dissonance
    spectral_peaks = ess.SpectralPeaks(sampleRate=sampleRate, orderBy='frequency')
    dissonance = ess.Dissonance()

    # spectral complexity
    # magnitudeThreshold = 0.005 is hardcoded for a "blackmanharris62" frame
    spectral_complexity = ess.SpectralComplexity(magnitudeThreshold=0.005)

    INFO('Computing Low-Level descriptors...')

    # used for a nice progress display
    total_frames = frames.num_frames()
    n_frames = 0
    start_of_frame = -frameSize * 0.5

    pitches, pitch_confidences = [], []

    progress = Progress(total=total_frames)

    #scPool = es.Pool()  # pool for spectral contrast

    for frame in frames:

        frameScope = [start_of_frame / sampleRate, (start_of_frame + frameSize) / sampleRate]
        # pool.setCurrentScope(frameScope)

        # silence rate
        # pool.add(namespace + '.' + 'silence_rate_60dB', es.isSilent(frame))
        pool.add(namespace + '.' + 'silence_rate_60dB', is_silent_threshold(frame, -60))
        pool.add(namespace + '.' + 'silence_rate_30dB', is_silent_threshold(frame, -30))
        pool.add(namespace + '.' + 'silence_rate_20dB', is_silent_threshold(frame, -20))

        if options['skipSilence'] and es.isSilent(frame):
            total_frames -= 1
            start_of_frame += hopSize
            continue

        # temporal descriptors
        pool.add(namespace + '.' + 'zerocrossingrate', zerocrossingrate(frame))
        (frame_lpc, frame_lpc_reflection) = lpc(frame)
        pool.add(namespace + '.' + 'temporal_lpc', frame_lpc)

        frame_windowed = window(frame)
        frame_spectrum = spectrum(frame_windowed)

        # spectrum-based descriptors
        power_spectrum = frame_spectrum ** 2
        pool.add(namespace + '.' + 'spectral_centroid', centroid(power_spectrum))
        pool.add(namespace + '.' + 'spectral_decrease', decrease(power_spectrum))
        pool.add(namespace + '.' + 'spectral_energy', energy(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_low', energyband_bass(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_middle_low', energyband_middle_low(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_middle_high', energyband_middle_high(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_high', energyband_high(frame_spectrum))
        pool.add(namespace + '.' + 'hfc', hfc(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_rms', rms(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_flux', flux(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_rolloff', rolloff(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_strongpeak', strongpeak(frame_spectrum))

        # central moments descriptors
        frame_centralmoments = centralmoments(power_spectrum)
        (frame_spread, frame_skewness, frame_kurtosis) = distributionshape(frame_centralmoments)
        pool.add(namespace + '.' + 'spectral_kurtosis', frame_kurtosis)
        pool.add(namespace + '.' + 'spectral_spread', frame_spread)
        pool.add(namespace + '.' + 'spectral_skewness', frame_skewness)

        # dissonance
        (frame_frequencies, frame_magnitudes) = spectral_peaks(frame_spectrum)
        frame_dissonance = dissonance(frame_frequencies, frame_magnitudes)
        pool.add(namespace + '.' + 'dissonance', frame_dissonance)

        # mfcc
        (frame_melbands, frame_mfcc) = mfcc(frame_spectrum)
        pool.add(namespace + '.' + 'mfcc', frame_mfcc)

        # spectral contrast
        (sc_coeffs, sc_valleys) = spectral_contrast(frame_spectrum)
        #scPool.add(namespace + '.' + 'sccoeffs', sc_coeffs)
        #scPool.add(namespace + '.' + 'scvalleys', sc_valleys)
        pool.add(namespace + '.' + 'spectral_contrast', sc_coeffs)


        # barkbands-based descriptors
        frame_barkbands = barkbands(frame_spectrum)
        pool.add(namespace + '.' + 'barkbands', frame_barkbands)
        pool.add(namespace + '.' + 'spectral_crest', crest(frame_barkbands))
        pool.add(namespace + '.' + 'spectral_flatness_db', flatnessdb(frame_barkbands))
        barkbands_centralmoments = ess.CentralMoments(range=len(frame_barkbands) - 1)
        (barkbands_spread, barkbands_skewness, barkbands_kurtosis) = distributionshape(
            barkbands_centralmoments(frame_barkbands))
        pool.add(namespace + '.' + 'barkbands_spread', barkbands_spread)
        pool.add(namespace + '.' + 'barkbands_skewness', barkbands_skewness)
        pool.add(namespace + '.' + 'barkbands_kurtosis', barkbands_kurtosis)

        # pitch descriptors
        frame_pitch, frame_pitch_confidence = pitch_detection(frame_spectrum)
        if frame_pitch > 0 and frame_pitch <= 20000.:
            pool.add(namespace + '.' + 'pitch', frame_pitch)
        pitches.append(frame_pitch)
        pitch_confidences.append(frame_pitch_confidence)
        pool.add(namespace + '.' + 'pitch_instantaneous_confidence', frame_pitch_confidence)

        frame_pitch_salience = pitch_salience(frame_spectrum[:-1])
        pool.add(namespace + '.' + 'pitch_salience', frame_pitch_salience)

        # spectral complexity
        pool.add(namespace + '.' + 'spectral_complexity', spectral_complexity(frame_spectrum))

        # display of progress report
        progress.update(n_frames)

        n_frames += 1
        start_of_frame += hopSize

    # if no 'temporal_zerocrossingrate' it means that this is a silent file
    if 'zerocrossingrate' not in descriptorNames(pool.descriptorNames(), namespace):
        raise ess.EssentiaError('This is a silent file!')

    #spectralContrastPCA(scPool, pool)

    # build pitch value histogram
    from math import log
    from numpy import bincount
    # convert from Hz to midi notes
    midipitches = []
    unknown = 0
    for freq in pitches:
        if freq > 0. and freq <= 12600:
            midipitches.append(12 * (log(freq / 6.875) / 0.69314718055995) - 3.)
        else:
            unknown += 1

    if len(midipitches) > 0:
        # compute histogram
        midipitchhist = bincount(midipitches)
        # set 0 midi pitch to be the number of pruned value
        midipitchhist[0] = unknown
        # normalise
        midipitchhist = [val / float(sum(midipitchhist)) for val in midipitchhist]
        # zero pad
        for i in range(128 - len(midipitchhist)): midipitchhist.append(0.0)
    else:
        midipitchhist = [0.] * 128
        midipitchhist[0] = 1.

    # pitchhist = ess.array(zip(range(len(midipitchhist)), midipitchhist))
    pool.add(namespace + '.' + 'spectral_pitch_histogram', midipitchhist)  # , pool.GlobalScope)

    # the code below is the same as the one above:
    # for note in midipitchhist:
    #    pool.add(namespace + '.' + 'spectral_pitch_histogram_values', note)
    #    print "midi note:", note

    pitch_centralmoments = ess.CentralMoments(range=len(midipitchhist) - 1)
    (pitch_histogram_spread, pitch_histogram_skewness, pitch_histogram_kurtosis) = distributionshape(
        pitch_centralmoments(midipitchhist))
    pool.add(namespace + '.' + 'spectral_pitch_histogram_spread', pitch_histogram_spread)  # , pool.GlobalScope)

    progress.finish()
    def rms(self, audio):

        rmsFunc = es.RMS()
        return 20 * math.log10(rmsFunc(audio))
Example #9
0
import matplotlib.pyplot as plt

mbid = 'ead85d20-ce7d-4ed0-a00d-0ae199b94d12'

hopSize = 128

loader = es.MonoLoader(filename=os.path.join(mbid, mbid + '-voice.mp3'))
track = loader()

# track[track<0.000001] = 0

print(len(track) / 44100.)

# pY = es.PitchYin(minFrequency=55, maxFrequency=900, tolerance=0.06)
pY = es.PitchYin(minFrequency=55, maxFrequency=600, tolerance=0.03)
rms = es.RMS()

pitch = []
loudness = []

print('Computing pitch and loudness')
for frame in es.FrameGenerator(track,
                               frameSize=2048,
                               hopSize=hopSize,
                               startFromZero=True):
    f = pY(frame)
    if f[1] >= 0.8:
        pitch.append(f[0])
        loudness.append(rms(frame))
    else:
        pitch.append(0)
def compute_features(complete_path):
    result = []
    meta_result = []
    file_count = 0
    # for loop over files
    for file in os.listdir(complete_path):
        if file.endswith(".wav"):
            file_count+=1
            # print(file +' : ' + str(file_count))

            # load our audio into an array
            audio = es.MonoLoader(filename=complete_path + file, sampleRate=44100)()

            # create the pool and the necessary algorithms
            pool = essentia.Pool()
            window = es.Windowing()
            energy = es.Energy()
            spectrum = es.Spectrum()
            centroid = es.Centroid(range=22050)
            rolloff = es.RollOff()
            crest = es.Crest()
            speak = es.StrongPeak()
            rmse = es.RMS()
            mfcc = es.MFCC()
            flux = es.Flux()
            barkbands = es.BarkBands( sampleRate = 44100)
            zerocrossingrate = es.ZeroCrossingRate()

            meta = es.MetadataReader(filename=complete_path + file, failOnError=True)()
            pool_meta, duration, bitrate, samplerate, channels = meta[7:]
            
            # centralmoments = es.SpectralCentralMoments()
            # distributionshape = es.DistributionShape()

            # compute the centroid for all frames in our audio and add it to the pool
            for frame in es.FrameGenerator(audio, frameSize = 1024, hopSize = 512):
                frame_windowed = window(frame)
                frame_spectrum = spectrum(frame_windowed)
                
                c = centroid(frame_spectrum)
                pool.add('spectral.centroid', c)

                cr = crest(frame_spectrum)
                pool.add('spectral crest', cr)

                r = rolloff(frame_spectrum)
                pool.add('spectral rolloff', r)

                sp = speak(frame_spectrum)
                pool.add('strong peak', sp)

                rms = rmse(frame_spectrum)
                pool.add('RMS', rms)

                pool.add('spectral_energy', energy(frame_spectrum))
                # (frame_melbands, frame_mfcc) = mfcc(frame_spectrum)
                # pool.add('frame_MFCC', frame_mfcc)

                fl = flux(frame_spectrum)
                pool.add('spectral flux', fl)

                # bbands = barkbands(frame_spectrum)
                # pool.add('bark bands', bbands)

                zcr = zerocrossingrate(frame_spectrum)
                pool.add('zero crossing rate', zcr)

                # frame_centralmoments = centralmoments(power_spectrum)
                # (frame_spread, frame_skewness, frame_kurtosis) = distributionshape(frame_centralmoments)
                # pool.add('spectral_kurtosis', frame_kurtosis)
                # pool.add('spectral_spread', frame_spread)
                # pool.add('spectral_skewness', frame_skewness)

            # aggregate the results (find mean if needed)
            aggrpool = es.PoolAggregator(defaultStats = ['mean'])(pool) #,'stdev' ])(pool)
            
            pool_meta.set("duration", duration)
            pool_meta.set("filename", os.path.relpath(file))

            # write pools to lists
            pool_arr = pool_to_array(aggrpool)
            result.append(pool_arr)

            meta_arr = pool_to_array(pool_meta)
            meta_result.append(meta_arr)
         
    features_df = pd.DataFrame.from_records(result)
    features_df.columns = ['centroid', 'crest','roll off','strong peak','rms','energy','flux','zcr']
    
    meta_df = pd.DataFrame.from_records(meta_result)
    meta_df.columns = ['duration','filename','metadata.tags.comment']
    del meta_df['metadata.tags.comment']

    return features_df,meta_df