def get_beat_chunks(filename, bpm_restrict=None):
    """Compute beat-wise, whole-track and frame-wise TIVs of an audio file.

    The signal is cut on a fixed beat grid that starts at 0 s and advances by
    ``60 / bpm`` seconds, where ``bpm`` comes from ``get_tempo``; the end of
    the signal closes the last segment.

    :param filename: path of the audio file to analyse
    :param bpm_restrict: when not ``None``, the detected tempo must equal
        this value
    :returns: a 3-tuple of
        * ``mt.TIVCollection`` built from one HPCP per beat segment,
        * ``mt.TIV`` built from the HPCP of the whole signal,
        * ``mt.TIVCollection`` built from the HPCP of every analysis frame
          inside the beat segments
    :raises ValueError: if ``bpm_restrict`` is given and differs from the
        detected tempo
    """
    audio = std.MonoLoader(filename=filename)()
    hpcp = std.HPCP()
    spectrum = std.Spectrum()
    speaks = std.SpectralPeaks()
    # The whole-track spectrum is analysed with a larger peak budget.
    large_speaks = std.SpectralPeaks(maxPeaks=2000)
    # Rate used to convert beat times into sample indices.
    # NOTE(review): presumably the rate MonoLoader resamples to - confirm.
    sr = 44100
    bpm = get_tempo(filename)
    if bpm_restrict is not None and bpm_restrict != bpm:
        raise ValueError(
            "detected tempo {} BPM does not match the required {} BPM".format(
                bpm, bpm_restrict))

    # Beat boundaries in seconds; np.arange excludes the end point, so the
    # signal duration is appended to close the final segment.
    duration = len(audio) / sr
    beats = np.append(np.arange(0, duration, 60 / bpm), duration)

    beat_chroma = []
    frame_chroma = []
    for start, end in zip(beats[:-1], beats[1:]):
        segmented_audio = audio[int(start * sr):int(end * sr)]

        # Frame-wise chroma inside the current beat segment.
        for sec in std.FrameGenerator(segmented_audio):
            freq, mag = speaks(spectrum(sec))
            frame_chroma.append(hpcp(freq, mag))

        # One chroma vector for the whole beat segment.
        freq, mag = speaks(spectrum(zeropad_next_power_2(segmented_audio)))
        beat_chroma.append(hpcp(freq, mag))

    # Chroma of the whole signal.
    freq, mag = large_speaks(spectrum(zeropad_next_power_2(audio)))
    chroma_whole = hpcp(freq, mag)

    return (mt.TIVCollection.from_pcp(np.array(beat_chroma).T),
            mt.TIV.from_pcp(chroma_whole),
            mt.TIVCollection.from_pcp(np.array(frame_chroma).T))
Example #2
0
    def calc_chromagram(self):
        """Compute the HPCP chromagram of ``self.audio``.

        Results are stored on the instance:

        * ``self.chromagram`` - array with one 12-bin HPCP vector per frame
        * ``self.timeAxSec`` - frame index times hop size divided by the
          sample rate, i.e. one time stamp (in seconds) per frame
        """
        self.chromagram = []

        to_chroma = es.HPCP(
            size=12,  # 12 output bins
            referenceFrequency=440,  # reference (tuning) frequency, in Hz
            bandPreset=False,
            weightType='cosine',
            nonLinear=False,
            windowSize=1.,
            sampleRate=self.sample_rate)
        to_spectrum = es.Spectrum(size=self.fft_size)
        find_peaks = es.SpectralPeaks(sampleRate=self.sample_rate)

        frames = es.FrameGenerator(self.audio,
                                   frameSize=self.frame_size,
                                   hopSize=self.hop_size,
                                   startFromZero=True)

        # window -> magnitude spectrum -> spectral peaks -> HPCP, per frame
        self.chromagram.extend(
            to_chroma(*find_peaks(to_spectrum(array(frame * self.window))))
            for frame in frames)

        self.chromagram = array(self.chromagram)

        frame_offsets = np.arange(len(self.chromagram)) * self.hop_size
        self.timeAxSec = frame_offsets / float(self.sample_rate)
Example #3
0
def getHPCPEssentia(XAudio, Fs, winSize, hopSize, squareRoot=False, NChromaBins=36, NHarmonics = 0):
    """
    Wrap around the essentia library to compute HPCP features
    :param XAudio: A flat array of raw audio samples
    :param Fs: Sample rate; forwarded to the peak picker and to HPCP so
        that spectral bins are mapped to the correct frequencies
    :param winSize: Window size of each STFT window
    :param hopSize: Hop size between STFT windows
    :param squareRoot: Do square root compression?
    :param NChromaBins: How many chroma bins (default 36)
    :param NHarmonics: Value passed as the ``harmonics`` parameter of
        essentia's HPCP (default 0)
    :returns H: An (NChromaBins x NWindows) matrix of all \
        chroma windows
    """
    from essentia import array
    import essentia.standard as ess

    sample_rate = float(Fs)
    spectrum = ess.Spectrum()
    window = ess.Windowing(size=winSize, type='hann')
    # Previously the sample rate was left at essentia's default, so peak
    # frequencies were wrong for audio not sampled at that rate.  The upper
    # search limit keeps essentia's 5 kHz default but is clamped to the
    # Nyquist frequency so low sample rates remain a valid configuration.
    spectralPeaks = ess.SpectralPeaks(
        sampleRate=sample_rate,
        maxFrequency=min(5000.0, sample_rate / 2.0))
    hpcp = ess.HPCP(size=NChromaBins, harmonics=NHarmonics,
                    sampleRate=sample_rate)

    H = []
    for frame in ess.FrameGenerator(array(XAudio), frameSize=winSize,
                                    hopSize=hopSize, startFromZero=True):
        freqs, mags = spectralPeaks(spectrum(window(frame)))
        H.append(hpcp(freqs, mags))

    # One column per analysis window.
    H = np.array(H).T
    if squareRoot:
        H = sqrtCompress(H)
    return H
Example #4
0
def file_to_hpcp(loop):
    """Return the peak-normalised mean HPCP of an audio signal.

    The signal is analysed in frames of 1024 samples with a hop of 512;
    each frame is windowed, transformed to a magnitude spectrum, reduced to
    at most 20 spectral peaks between 20 Hz and 8000 Hz and mapped to an
    HPCP vector.  The HPCP vectors are averaged over all frames.

    :param loop: sequence of audio samples (converted with ``e.array``)
    :returns: the mean HPCP vector divided by its maximum; an all-zero mean
        (e.g. silence) is returned unscaled instead of producing NaNs
    :raises ValueError: if the signal yields no analysis frames
    """
    loop = e.array(loop)

    windowing = es.Windowing(type='blackmanharris62')
    spectrum = es.Spectrum()
    spectral_peaks = es.SpectralPeaks(orderBy='magnitude',
                                      magnitudeThreshold=0.001,
                                      maxPeaks=20,
                                      minFrequency=20,
                                      maxFrequency=8000)
    hpcp = es.HPCP(maxFrequency=8000)

    # Only the HPCP vectors are kept; the per-frame spectra are not needed
    # after peak picking.
    hpcp_group = []
    for frame in es.FrameGenerator(loop, frameSize=1024, hopSize=512):
        frequencies, magnitudes = spectral_peaks(spectrum(windowing(frame)))
        hpcp_group.append(hpcp(frequencies, magnitudes))

    if not hpcp_group:
        raise ValueError("cannot compute HPCP: input yields no analysis frames")

    mean_hpcp = np.mean(np.array(hpcp_group).T, axis=1)

    # Normalise so the strongest bin equals 1; skip when every bin is zero
    # to avoid a division by zero.
    peak = mean_hpcp.max()
    if peak > 0:
        mean_hpcp = mean_hpcp / peak

    return mean_hpcp
Example #5
0
    def _extract_pitch_contours(self, audio):
        """Extract pitch contours from ``audio``.

        Every frame goes through windowing, spectrum, spectral peaks, the
        pitch salience function and salience peak picking; the per-frame
        salience peaks are then linked into contours by ``PitchContours``.

        :param audio: audio samples handed to the frame generator
        :returns: ``(contours_bins, contours_start_times, contour_saliences,
            duration)`` - the outputs of ``PitchContours`` with start times
            moved to the second position
        """
        # Frames are zero-padded to four times the frame size before the FFT.
        windower = estd.Windowing(  # pylint: disable-msg=E1101
            zeroPadding=3 * self.frame_size)
        to_spectrum = estd.Spectrum(  # pylint: disable-msg=E1101
            size=self.frame_size * 4)
        find_spectral_peaks = estd.SpectralPeaks(  # pylint: disable-msg=E1101
            minFrequency=self.min_frequency,
            maxFrequency=self.max_frequency,
            magnitudeThreshold=self.magnitude_threshold,
            sampleRate=self.sample_rate,
            orderBy='magnitude')

        # Salience is computed on a cent scale; per the original note,
        # PitchSalienceFunction uses 55 Hz as its default reference.
        salience_function = estd.PitchSalienceFunction(  # pylint: disable-msg=E1101
            binResolution=self.bin_resolution)
        find_salience_peaks = estd.PitchSalienceFunctionPeaks(  # pylint: disable-msg=E1101
            binResolution=self.bin_resolution,
            minFrequency=self.min_frequency,
            maxFrequency=self.max_frequency)
        track_contours = estd.PitchContours(  # pylint: disable-msg=E1101
            hopSize=self.hop_size,
            binResolution=self.bin_resolution,
            peakDistributionThreshold=self.peak_distribution_threshold)

        bins_key = 'allframes_salience_peaks_bins'
        saliences_key = 'allframes_salience_peaks_contourSaliences'

        # Frame-by-frame analysis.
        pool = Pool()
        frames = estd.FrameGenerator(  # pylint: disable-msg=E1101
            audio, frameSize=self.frame_size, hopSize=self.hop_size)
        for frame in frames:
            magnitude_spectrum = to_spectrum(windower(frame))
            peak_freqs, peak_mags = find_spectral_peaks(magnitude_spectrum)
            peak_bins, peak_saliences = find_salience_peaks(
                salience_function(peak_freqs, peak_mags))

            # Frames without salience peaks are stored as a single zero.
            if np.size(peak_bins) == 0:
                peak_bins = np.array([0])
            if np.size(peak_saliences) == 0:
                peak_saliences = np.array([0])

            pool.add(bins_key, peak_bins)
            pool.add(saliences_key, peak_saliences)

        # Post-processing: contour tracking on plain Python lists.
        bins_per_frame = [entry.tolist() for entry in pool[bins_key]]
        saliences_per_frame = [entry.tolist() for entry in pool[saliences_key]]
        contours_bins, contour_saliences, contours_start_times, duration = \
            track_contours(bins_per_frame, saliences_per_frame)

        return contours_bins, contours_start_times, contour_saliences, duration
def FeatureExtraction_Recording(recording, params):
    """Compute frame-based HPCP features of a recording and their statistics.

    The audio at ``recording.path`` is loaded at ``params.fs``, passed through
    DC removal and an equal-loudness filter, and analysed frame by frame.

    :param recording: object providing ``path`` and ``tonic``; it receives
        the attributes ``chroma_framebased`` (array, one HPCP vector per
        frame), ``chroma_mean`` and ``chroma_std`` (lists with one value per
        HPCP bin)
    :param params: object providing ``numbins``, ``fs``, ``windowSize`` and
        ``hopSize`` (both in milliseconds) and ``windowFunction``
    :returns: None; results are written onto ``recording``
    """
    numBins = params.numbins
    fs = params.fs

    # Load the audio and pre-process it (DC removal, equal-loudness filter).
    Audio = ess.MonoLoader(filename=recording.path, sampleRate=fs)()
    Audio = ess.DCRemoval()(Audio)
    Audio = ess.EqualLoudness()(Audio)

    # Convert window and hop from milliseconds to an even number of samples.
    windowSize = round(fs * params.windowSize / 1000)
    windowSize = int(windowSize / 2) * 2
    hopSize = round(fs * params.hopSize / 1000)
    hopSize = int(hopSize / 2) * 2

    tonic = float(recording.tonic)

    # The algorithms do not depend on the frame, so build them once instead
    # of re-creating all four on every iteration.
    window = ess.Windowing(size=windowSize, type=params.windowFunction)
    spectrum = ess.Spectrum(size=windowSize)
    spectral_peaks = ess.SpectralPeaks()
    hpcp_algo = ess.HPCP(normalized='unitSum',
                         referenceFrequency=tonic,
                         size=numBins,
                         windowSize=12.0 / numBins)

    # Magnitudes below machine epsilon are raised to epsilon before peak
    # picking.
    eps = np.finfo(float).eps

    hpcp = []
    for frame in ess.FrameGenerator(Audio,
                                    frameSize=windowSize,
                                    hopSize=hopSize,
                                    startFromZero=True):
        mX = spectrum(window(frame))
        mX[mX < eps] = eps
        freq, mag = spectral_peaks(mX)
        hpcp.append(hpcp_algo(freq, mag))
    recording.chroma_framebased = np.array(hpcp)

    # Feature summarisation: mean and standard deviation of every HPCP bin
    # over all frames.  The reshape keeps the column view valid even when no
    # frame was produced.
    per_frame = recording.chroma_framebased.reshape(-1, numBins)
    recording.chroma_mean = [np.mean(per_frame[:, j]) for j in range(numBins)]
    recording.chroma_std = [np.std(per_frame[:, j]) for j in range(numBins)]
Example #7
0
    def compute_features(self, audio):
        """Computes the specified Essentia features from the audio array.

        ``self.feature`` must be an Essentia algorithm whose ``name()`` is
        either ``"MFCC"`` or ``"HPCP"``.  Each frame is windowed with
        ``self.w`` and transformed with ``self.spectrum`` before the feature
        is computed.  When ``self.beats`` is non-empty the frame-wise
        features are resampled onto the beat times with
        ``utils.resample_mx``.

        :param audio: audio samples handed to the frame generator
        :returns: Essentia array with one feature vector per frame (or per
            beat when beats are available)
        :raises ValueError: if ``self.feature`` is neither MFCC nor HPCP
        """
        feature_name = self.feature.name()
        if feature_name == "MFCC":
            def extract(spectrum):
                # MFCC returns (bands, coefficients); keep the coefficients.
                _, coeffs = self.feature(spectrum)
                return coeffs
        elif feature_name == "HPCP":
            # Built once rather than once per frame.
            spectral_peaks = ES.SpectralPeaks()

            def extract(spectrum):
                freqs, mags = spectral_peaks(spectrum)
                return self.feature(freqs, mags)
        else:
            # Previously this surfaced as an unbound local variable inside
            # the frame loop.
            raise ValueError(
                "unsupported feature '{}': expected 'MFCC' or 'HPCP'".format(
                    feature_name))

        features = [
            extract(self.spectrum(self.w(frame)))
            for frame in ES.FrameGenerator(audio,
                                           frameSize=self.frame_size,
                                           hopSize=self.hop_size)
        ]

        # Convert to Essentia Numpy array
        features = essentia.array(features)

        # A length check works for lists and NumPy arrays alike, whereas
        # comparing an array against [] is ambiguous.
        if self.beats is not None and len(self.beats) > 0:
            framerate = self.sample_rate / float(self.hop_size)
            tframes = np.arange(features.shape[0]) / float(framerate)
            features = utils.resample_mx(features.T, tframes, self.beats).T

        return features
Example #8
0
    def hpcp(self,
            frameSize=4096,
            windowType='blackmanharris62',
            harmonicsPerPeak=8,
            magnitudeThreshold=0,
            maxPeaks=100,
            whitening=True,
            referenceFrequency=440,
            minFrequency=100,
            maxFrequency=3500,
            nonLinear=False,
            numBins=12,
            display=False):
        """
        Compute Harmonic Pitch Class Profiles (HPCP) of ``self.audio_vector``
        with essentia's standard mode.

        Processing chain per frame: windowing -> spectrum -> spectral peaks ->
        (optional) spectral whitening -> HPCP.  Frames are ``frameSize``
        samples long and advance by ``self.hop_length``; ``self.fs`` is used
        as the sample rate.

        The algorithm is described in
        [1]. Gómez, E. (2006). Tonal Description of Polyphonic Audio for Music Content Processing.
        The HPCP parameters are listed at
        http://essentia.upf.edu/documentation/reference/std_HPCP.html

        Parameters
        frameSize, windowType: framing and windowing settings
        magnitudeThreshold, maxPeaks, minFrequency, maxFrequency: spectral
            peak picking settings (the frequency limits are also passed to
            HPCP, and maxFrequency to the whitening step)
        whitening: if True, peak magnitudes are spectrally whitened first
        harmonicsPerPeak, referenceFrequency, nonLinear, numBins: passed to
            HPCP as ``harmonics``, ``referenceFrequency``, ``nonLinear`` and
            ``size``
        display: if True, the result is shown with ``display_chroma``

        Returns
        hpcp: ndarray(n_frames, numBins)
            The HPCP coefficients at each time frame
        """
        pool_key = 'tonal.hpcp'

        frames = estd.FrameGenerator(array(self.audio_vector),
                                     frameSize=frameSize,
                                     hopSize=self.hop_length)
        apply_window = estd.Windowing(type=windowType)
        to_spectrum = estd.Spectrum()
        # http://essentia.upf.edu/documentation/reference/streaming_SpectralPeaks.html
        pick_peaks = estd.SpectralPeaks(magnitudeThreshold=magnitudeThreshold,
                                        maxFrequency=maxFrequency,
                                        minFrequency=minFrequency,
                                        maxPeaks=maxPeaks,
                                        orderBy="frequency",
                                        sampleRate=self.fs)
        # http://essentia.upf.edu/documentation/reference/streaming_SpectralWhitening.html
        whiten = estd.SpectralWhitening(maxFrequency=maxFrequency,
                                        sampleRate=self.fs)
        # http://essentia.upf.edu/documentation/reference/streaming_HPCP.html
        to_hpcp = estd.HPCP(sampleRate=self.fs,
                            maxFrequency=maxFrequency,
                            minFrequency=minFrequency,
                            referenceFrequency=referenceFrequency,
                            nonLinear=nonLinear,
                            harmonics=harmonicsPerPeak,
                            size=numBins)
        pool = Pool()

        # One HPCP vector per frame, collected in the pool.
        for frame in frames:
            magnitude_spectrum = to_spectrum(apply_window(frame))
            peak_freqs, peak_mags = pick_peaks(magnitude_spectrum)
            if whitening:
                peak_mags = whiten(magnitude_spectrum, peak_freqs, peak_mags)
            pool.add(pool_key, to_hpcp(peak_freqs, peak_mags))

        if display:
            display_chroma(pool[pool_key].T, self.hop_length)

        return pool[pool_key]
Example #9
0
# Script fragment: configures an essentia standard-mode HPCP analysis chain
# and loads the audio.  The names filename, samplerate, framesize, minfreq,
# maxfreq, maxpeaks, magthres, orderby, partials and weight are not defined
# in this fragment; they are expected to exist before it runs.

# Values forwarded to HPCP's bandPreset and normalized parameters below.
bandpreset = False
normalize = False      

""" DSP: FrameGenerator -> Windowing -> Spectrum -> Spectral Peaks -> HPCP """
# Window types listed by the original author:
# {hamming,hann,triangular,square,blackmanharris62,blackmanharris70,blackmanharris74,blackmanharris92}
# TODO: consider converting linear magnitudes to dB here.

# Loader: only configured here; it is executed further down.
loader = esst.MonoLoader(filename=filename,
                         sampleRate=samplerate)
# Window and spectrum both use the same frame size.
window = esst.Windowing(type='blackmanharris92',
                        size=framesize)
rfft   = esst.Spectrum(size=framesize)

# Spectral peak picker.
peaks  = esst.SpectralPeaks(minFrequency=minfreq,
                            maxFrequency=maxfreq,
                            maxPeaks=maxpeaks,
                            magnitudeThreshold=magthres,
                            sampleRate=samplerate,
                            orderBy=orderby)

# HPCP shares its frequency range and sample rate with the peak picker.
hpcp   = esst.HPCP(bandPreset=bandpreset,
                   harmonics=partials,
                   normalized=normalize,
                   minFrequency=minfreq,
                   maxFrequency=maxfreq,
                   sampleRate=samplerate,
                   weightType=weight)

# Run the loader, then create empty result lists.
# NOTE(review): presumably filled by a frame loop that follows - confirm.
audio = loader()
peakF = []
peakA = []
chroma = []
Example #10
0
def hpcpgram(audio,
             sampleRate=44100,
             frameSize=4096,
             hopSize=2048,
             numBins=12,
             windowType='blackmanharris62',
             minFrequency=100,
             maxFrequency=4000,
             whitening=False,
             maxPeaks=100,
             magnitudeThreshold=1e-05,
             **kwargs):
    """
    Compute the Harmonic Pitch Class Profile (HPCP) gram of an audio signal
    from overlapping frames.

    Processing chain per frame: windowing -> spectrum -> spectral peaks ->
    (optional) spectral whitening -> HPCP.

    For the full list of HPCP parameters see
    http://essentia.upf.edu/documentation/reference/std_HPCP.html

    References:
    [1]. Gómez, E. (2006). Tonal Description of Polyphonic Audio for Music Content Processing.

    Inputs
        audio : audio signal (converted with ``array`` before framing)

    Parameters:
        sampleRate (default = 44100) :
        the sampling rate of the audio signal [Hz]

        frameSize (default = 4096) :
        the size of each analysis frame, in samples

        hopSize (default = 2048) :
        the hop size between frames, in samples

        numBins (default = 12) :
        the size of the output HPCP (essentia requires a positive multiple
        of 12)

        windowType (default = 'blackmanharris62') :
        the window type, e.g. 'hamming', 'hann', 'triangular', 'square' or
        'blackmanharrisXX'

        minFrequency (default = 100) :
        the minimum frequency that contributes to the SpectralPeaks and HPCP
        computation [Hz]

        maxFrequency (default = 4000) :
        the maximum frequency that contributes to the SpectralPeaks,
        SpectralWhitening and HPCP computation [Hz]

        whitening (default = False) :
        if True, spectral whitening is applied to the peak magnitudes before
        they are passed to HPCP

        maxPeaks (default = 100) :
        the maximum number of peaks returned by SpectralPeaks

        magnitudeThreshold (default = 1e-05) :
        peaks below this threshold are discarded by SpectralPeaks

        kwargs : additional keyword arguments
        forwarded unchanged to the essentia HPCP algorithm

    Returns: the HPCP vectors of all frames, as stored in the pool under
    'tonal.hpcp' (one row per frame)

    """
    pool_key = 'tonal.hpcp'

    frames = es.FrameGenerator(array(audio),
                               frameSize=frameSize,
                               hopSize=hopSize)
    apply_window = es.Windowing(type=windowType)
    to_spectrum = es.Spectrum()
    # http://essentia.upf.edu/documentation/reference/std_SpectralPeaks.html
    pick_peaks = es.SpectralPeaks(magnitudeThreshold=magnitudeThreshold,
                                  maxFrequency=maxFrequency,
                                  minFrequency=minFrequency,
                                  maxPeaks=maxPeaks,
                                  sampleRate=sampleRate)
    # http://essentia.upf.edu/documentation/reference/std_SpectralWhitening.html
    whiten = es.SpectralWhitening(maxFrequency=maxFrequency,
                                  sampleRate=sampleRate)
    # http://essentia.upf.edu/documentation/reference/std_HPCP.html
    to_hpcp = es.HPCP(sampleRate=sampleRate,
                      maxFrequency=maxFrequency,
                      minFrequency=minFrequency,
                      size=numBins,
                      **kwargs)
    pool = Pool()

    # One HPCP vector per frame, collected in the pool.
    for frame in frames:
        magnitude_spectrum = to_spectrum(apply_window(frame))
        peak_freqs, peak_mags = pick_peaks(magnitude_spectrum)
        if whitening:
            peak_mags = whiten(magnitude_spectrum, peak_freqs, peak_mags)
        pool.add(pool_key, to_hpcp(peak_freqs, peak_mags))

    return pool[pool_key]
Example #11
0
def extractFeatures(audio_data):
    """
    Compute spectral features of an audio signal and return them in a Pool.

    Takes a NumPy array of floats representing an audio signal, computes its
    features on the spectrum of the whole array, stores each result in an
    essentia ``Pool`` under the module's ``pk_*`` keys and returns that Pool.

    :param audio_data: NumPy ndarray with a floating-point dtype
    :returns: the Pool holding the spectrum, energy band ratio, max-magnitude
        frequency, spectral centroid (time domain), spectral complexity,
        strong peak, spectral peaks, pitch and pitch probability, harmonic
        peaks, inharmonicity, spectral contrast/valley and spectral whitening
    :raises AssertionError: if ``audio_data`` is not a float ndarray
    """
    from numpy import ndarray
    assert isinstance(audio_data, ndarray), "audio_data must be a numpy ndarray"
    assert "float" in str(audio_data.dtype), "audio_data must have a float dtype"

    # Start the Pool
    output_pool = es.Pool()

    # Spectrum of the signal; read back once and reused by every
    # spectrum-based descriptor below.
    output_pool.set(pk_spectrum, es_mode.Spectrum()(audio_data))
    spectrum = output_pool[pk_spectrum]

    # EnergyBandRatio
    output_pool.set(pk_energy_band_ratio, es_mode.EnergyBandRatio()(spectrum))

    # MaxMagFreq
    output_pool.set(pk_max_mag_freq, es_mode.MaxMagFreq()(spectrum))

    # SpectralCentroidTime (computed on the time-domain signal)
    output_pool.set(pk_spectral_centroid_time,
                    es_mode.SpectralCentroidTime()(audio_data))

    # SpectralComplexity
    output_pool.set(pk_spectral_complexity,
                    es_mode.SpectralComplexity()(spectrum))

    # StrongPeak
    output_pool.set(pk_strong_peak, es_mode.StrongPeak()(spectrum))

    # SpectralPeaks
    sp_freq, sp_mag = es_mode.SpectralPeaks()(spectrum)
    # Drop the DC peak, if present, as required by HarmonicPeaks.  The size
    # check avoids an IndexError when no peak was found at all.
    if sp_freq.size and sp_freq[0] == 0:
        sp_freq = sp_freq[1:]
        sp_mag = sp_mag[1:]
    output_pool.set(pk_spectral_peaks_freq, sp_freq)
    output_pool.set(pk_spectral_peaks_mag, sp_mag)

    ######################################
    #       For Inharmonicity            #
    ######################################
    # PitchYinFFT
    pitch_yin_fft, pitch_prob_yin_fft = es_mode.PitchYinFFT()(spectrum)
    output_pool.set(pk_pitch, pitch_yin_fft)
    output_pool.set(pk_pitch_prob, pitch_prob_yin_fft)

    # HarmonicPeaks
    hp_freq, hp_mag = es_mode.HarmonicPeaks()(
        output_pool[pk_spectral_peaks_freq],
        output_pool[pk_spectral_peaks_mag],
        output_pool[pk_pitch])
    output_pool.set(pk_harmonic_peaks_freq, hp_freq)
    output_pool.set(pk_harmonic_peaks_mag, hp_mag)

    # Inharmonicity
    inharmonicity = es_mode.Inharmonicity()(
        output_pool[pk_harmonic_peaks_freq],
        output_pool[pk_harmonic_peaks_mag])
    output_pool.set(pk_inharmonicity, inharmonicity)

    # End of Inharmonicity ##################################

    # SpectralContrast; the frame size is recovered from the spectrum length.
    frame_size = 2 * (spectrum.size - 1)
    spectral_contrast, spectral_valley = \
        es_mode.SpectralContrast(frameSize=frame_size)(spectrum)
    output_pool.set(pk_spectral_contrast, spectral_contrast)
    output_pool.set(pk_spectral_valley, spectral_valley)

    # SpectralWhitening
    spectral_whitening = es_mode.SpectralWhitening()(
        spectrum,
        output_pool[pk_spectral_peaks_freq],
        output_pool[pk_spectral_peaks_mag])
    output_pool.set(pk_spectral_whitening, spectral_whitening)

    return output_pool
Example #12
0
def estimate_key(input_audio_file, output_text_file=None, key_profile=None):
    """
    This function estimates the overall key of an audio track
    optionally with extra modal information.

    The track is analysed frame by frame (window -> spectrum -> spectral
    peaks -> optional whitening -> HPCP); the frame-wise chroma vectors are
    summed and matched against key templates.  All analysis settings are read
    from module-level constants.

    :param input_audio_file: path of the audio file to analyse
    :type input_audio_file: str
    :param output_text_file: if given, the result is also written to this
        file as ``key<TAB>correlation`` followed by a newline
    :type output_text_file: str
    :param key_profile: if given, overrides the module-level ``KEY_PROFILE``
        and switches ``USE_THREE_PROFILES`` and ``WITH_MODAL_DETAILS`` off.
        This modifies module globals and therefore persists after the call.
    :returns: tuple ``(key, correlation_value)``
    :raises NameError: if detuning correction is enabled with a scope other
        than ``'frame'`` or ``'average'``
    """
    global USE_THREE_PROFILES
    global WITH_MODAL_DETAILS
    global KEY_PROFILE

    if key_profile is not None:
        KEY_PROFILE = key_profile
        USE_THREE_PROFILES = False
        WITH_MODAL_DETAILS = False

    loader = estd.MonoLoader(filename=input_audio_file, sampleRate=SAMPLE_RATE)
    cut = estd.FrameCutter(frameSize=WINDOW_SIZE, hopSize=HOP_SIZE)
    window = estd.Windowing(size=WINDOW_SIZE, type=WINDOW_SHAPE)
    rfft = estd.Spectrum(size=WINDOW_SIZE)
    sw = estd.SpectralWhitening(maxFrequency=MAX_HZ, sampleRate=SAMPLE_RATE)
    speaks = estd.SpectralPeaks(magnitudeThreshold=SPECTRAL_PEAKS_THRESHOLD,
                                maxFrequency=MAX_HZ,
                                minFrequency=MIN_HZ,
                                maxPeaks=SPECTRAL_PEAKS_MAX,
                                sampleRate=SAMPLE_RATE)
    hpcp = estd.HPCP(
        bandPreset=HPCP_BAND_PRESET,
        #bandSplitFrequency=HPCP_SPLIT_HZ,
        harmonics=HPCP_HARMONICS,
        maxFrequency=MAX_HZ,
        minFrequency=MIN_HZ,
        nonLinear=HPCP_NON_LINEAR,
        normalized=HPCP_NORMALIZE,
        referenceFrequency=HPCP_REFERENCE_HZ,
        sampleRate=SAMPLE_RATE,
        size=HPCP_SIZE,
        weightType=HPCP_WEIGHT_TYPE,
        windowSize=HPCP_WEIGHT_WINDOW_SEMITONES,
        maxShifted=HPCP_SHIFT)

    if HIGHPASS_CUTOFF is not None:
        hpf = estd.HighPass(cutoffFrequency=HIGHPASS_CUTOFF,
                            sampleRate=SAMPLE_RATE)
        # The same high-pass filter is applied three times in a row.
        audio = hpf(hpf(hpf(loader())))
    else:
        audio = loader()

    duration = len(audio)
    n_slices = 1 + (duration // HOP_SIZE)
    chroma = np.empty([n_slices, HPCP_SIZE], dtype='float64')
    for slice_n in range(n_slices):
        # Each call to the frame cutter is used to obtain the next frame.
        spek = rfft(window(cut(audio)))
        p1, p2 = speaks(spek)
        if SPECTRAL_WHITENING:
            p2 = sw(spek, p1, p2)
        pcp = hpcp(p1, p2)
        if not DETUNING_CORRECTION or DETUNING_CORRECTION_SCOPE == 'average':
            chroma[slice_n] = pcp
        elif DETUNING_CORRECTION and DETUNING_CORRECTION_SCOPE == 'frame':
            pcp = shift_pcp(pcp, HPCP_SIZE)
            chroma[slice_n] = pcp
        else:
            raise NameError("SHIFT_SCOPE must be set to 'frame' or 'average'.")

    # Collapse the frame-wise chroma into one global profile.
    chroma = np.sum(chroma, axis=0)
    if PCP_THRESHOLD is not None:
        chroma = normalize_pcp_peak(chroma)
        chroma = pcp_gate(chroma, PCP_THRESHOLD)
    if DETUNING_CORRECTION and DETUNING_CORRECTION_SCOPE == 'average':
        chroma = shift_pcp(chroma, HPCP_SIZE)
    chroma = np.roll(
        chroma, -3)  # Adjust to essentia's HPCP calculation starting on A...

    if USE_THREE_PROFILES:
        estimation_1 = template_matching_3(chroma, KEY_PROFILE)
    else:
        estimation_1 = template_matching_2(chroma, KEY_PROFILE)
    key_1 = estimation_1[0] + '\t' + estimation_1[1]
    correlation_value = estimation_1[2]

    if WITH_MODAL_DETAILS:
        estimation_2 = template_matching_modal(chroma)
        key_2 = estimation_2[0] + '\t' + estimation_2[1]
        key_verbose = key_1 + '\t' + key_2
        key = key_verbose.split('\t')
        # Assign monotonic tracks to minor:
        if key[3] == 'monotonic' and key[0] == key[2]:
            key = '{0}\tminor'.format(key[0])
        else:
            key = key_1
    else:
        key = key_1

    if output_text_file is not None:
        # The context manager closes the file even if the write fails.
        with open(output_text_file, 'w') as textfile:
            textfile.write(key + '\t' + str(correlation_value) + '\n')
    return key, correlation_value
Example #13
0
def reComputeDescriptors(inputAudioFile, outputJsonFile):
    """
    Recompute frame-wise descriptors of an audio file and save their
    statistics as JSON.

    For every frame the function computes MFCC coefficients and bands,
    dissonance, inharmonicity (only when more than one harmonic peak is
    found), spectral contrast, spectral centroid, log attack time and HFC.
    The per-frame values are aggregated to mean and variance, converted with
    ``makeFeatures`` and written to ``outputJsonFile``.

    :param inputAudioFile: path of the audio file to analyse
    :param outputJsonFile: path of the JSON file to write
    :return: None
    """
    # Original analysis settings, kept for reference:
    #   M = 1024, N = 1024, H = 512, fs = 44100, W = 'hann'
    #
    # Settings noted from the freesound extractor, used below:
    #   sampleRate = 44100, frameSize = 2048, hopSize = 1024,
    #   zeroPadding = 0, silentFrames = "noise",
    #   windowType = "blackmanharris62"
    #   silence-rate thresholds of -20, -30 and -60 dB, each converted with
    #   db2lin(threshold_dB / 2.0)
    M = 2048   # frame size
    H = 1024   # hop size
    fs = 44100
    W = 'blackmanharris62'

    # Spectrum, Windowing (apart from its type), MFCC, PitchYinFFT and
    # SpectralContrast are used with their default parameters.
    spectrum = ess.Spectrum()
    window = ess.Windowing(type=W)
    mfcc = ess.MFCC()

    spectral_peaks = ess.SpectralPeaks(minFrequency=1,
                                       maxFrequency=20000,
                                       maxPeaks=100,
                                       sampleRate=fs,
                                       magnitudeThreshold=0,
                                       orderBy="magnitude")
    dissonance = ess.Dissonance()
    pitch_detection = ess.PitchYinFFT()
    harmonic_peaks = ess.HarmonicPeaks()
    inharmonicity = ess.Inharmonicity()
    spectral_contrast = ess.SpectralContrast()
    centroid = ess.Centroid()
    log_attack_time = ess.LogAttackTime()
    hfc = ess.HFC()

    x = ess.MonoLoader(filename=inputAudioFile, sampleRate=fs)()
    frames = ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True)
    pool = es.Pool()
    for frame in frames:
        mX = spectrum(window(frame))

        mfcc_bands, mfcc_coeffs = mfcc(mX)
        pool.add('lowlevel.mfcc', mfcc_coeffs)
        pool.add('lowlevel.mfcc_bands', mfcc_bands)

        # Peaks are picked by magnitude, then re-sorted by ascending
        # frequency before being passed to Dissonance and HarmonicPeaks.
        pfreq, pmag = spectral_peaks(mX)
        inds = pfreq.argsort()
        pfreq_sorted = pfreq[inds]
        pmag_sorted = pmag[inds]

        pool.add('lowlevel.dissonance', dissonance(pfreq_sorted, pmag_sorted))

        pitch, _ = pitch_detection(mX)

        phfreq, phmag = harmonic_peaks(pfreq_sorted, pmag_sorted, pitch)
        # Inharmonicity is only stored when more than one harmonic peak
        # is available.
        if len(phfreq) > 1:
            pool.add('sfx.inharmonicity', inharmonicity(phfreq, phmag))

        sc_coeffs, _ = spectral_contrast(mX)
        pool.add('lowlevel.spectral_contrast', sc_coeffs)

        pool.add('lowlevel.spectral_centroid', centroid(mX))
        pool.add('sfx.logattacktime', log_attack_time(frame))
        pool.add('lowlevel.hfc', hfc(mX))

    calc_Mean_Var = ess.PoolAggregator(defaultStats=['mean', 'var'])
    aggrPool = calc_Mean_Var(pool)

    features = makeFeatures(aggrPool)
    # The context manager makes sure the JSON file is flushed and closed.
    with open(outputJsonFile, 'w') as json_file:
        json.dump(features, json_file)
Example #14
0
    plt.show()

    audio = es.MonoLoader(filename=filename)()
    rhythm_extractor = es.RhythmExtractor2013(method="multifeature")
    bpm, beats, beats_confidence, _, beats_intervals = rhythm_extractor(audio)
    print("bpm", bpm)
    bps = 2  #bpm / 60
    print("bps", bps)
    hpcps = []
    for b in range(int(track_length_sec * bps)):
        parts = 5
        spectrum = None
        for i in range(parts):
            frame = audio[int(b * fs_rate + i * fs_rate / bps / parts):int(
                b * fs_rate + (i + 1) * fs_rate / bps / parts)]  # for one beat
            # frame = audio[s *fs_rate: (s+1)* fs_rate]
            if spectrum is None:
                spectrum = es.Spectrum()(frame)
            else:
                spectrum += es.Spectrum()(frame)
        es_frequencies, es_magnitudes = es.SpectralPeaks()(spectrum)
        hpcp = es.HPCP()(es_frequencies, es_magnitudes)
        hpcps.append(hpcp)
    for h in hpcps:
        names = [
            "a", "b", "h", "c", "cis", "d", "dis", "e", "f", "fis", "g", "gis"
        ]
        print([f"{name}-{v:0.2}" for name, v in zip(names, h) if v > 0.1])
    chords = es.ChordsDetection()(essentia.array(hpcps))
    print(chords)
Example #15
0
def get_spectral_info(frame):
    """Return the spectrum of one frame together with its whitened peaks.

    :param frame: a single frame of audio samples
    :returns: tuple ``(spectrum, freqs, mags)`` - the magnitude spectrum, the
        spectral peak frequencies and the peak magnitudes after spectral
        whitening
    """
    # Spectrum sized by the module-level samples_per_frame.
    to_spectrum = es.Spectrum(size=samples_per_frame)
    spectrum = to_spectrum(frame)

    # Peak picking configured by the module-level peak_params mapping.
    pick_peaks = es.SpectralPeaks(**peak_params)
    freqs, raw_mags = pick_peaks(spectrum)

    whiten = es.SpectralWhitening()
    whitened_mags = whiten(spectrum, freqs, raw_mags)
    return spectrum, freqs, whitened_mags
Example #16
0
def compute(audio, pool, options):
    """Compute SFX descriptors for ``audio`` and add them to ``pool``.

    Two groups of descriptors are produced, all stored under keys prefixed
    with the module-level ``namespace``:

    * frame-wise harmonic descriptors (inharmonicity, tristimulus and
      odd-to-even harmonic energy ratio), added for every processed frame
      that yields more than one harmonic peak;
    * descriptors computed once from the envelope of the whole signal
      (temporal statistics, effective duration, strong decay, flatness and
      morphological descriptors), plus the log attack time computed on the
      raw audio.

    :param audio: audio samples fed to the frame generator and the envelope
    :param pool: pool object that receives the values through ``pool.add``
    :param options: mapping providing 'sampleRate', 'frameSize', 'hopSize',
        'windowType' and 'skipSilence'
    """
    INFO('Computing SFX descriptors...')

    # analysis parameters
    sampleRate = options['sampleRate']
    frameSize = options['frameSize']
    hopSize = options['hopSize']
    windowType = options['windowType']

    # frame algorithms
    frames = ess.FrameGenerator(audio=audio,
                                frameSize=frameSize,
                                hopSize=hopSize)
    window = ess.Windowing(size=frameSize, zeroPadding=0, type=windowType)
    spectrum = ess.Spectrum(size=frameSize)

    # pitch algorithm: sized from the analysis frame (instead of a hard-coded
    # 2048) so that it always matches the spectrum it is fed with
    pitch_detection = ess.PitchYinFFT(frameSize=frameSize,
                                      sampleRate=sampleRate)

    # sfx descriptors
    spectral_peaks = ess.SpectralPeaks(sampleRate=sampleRate,
                                       orderBy='frequency')
    harmonic_peaks = ess.HarmonicPeaks()
    inharmonicity = ess.Inharmonicity()
    odd2evenharmonicenergyratio = ess.OddToEvenHarmonicEnergyRatio()
    tristimulus = ess.Tristimulus()

    # used for a nice progress display
    progress = Progress(total=frames.num_frames())
    n_frames = 0

    for frame in frames:

        if options['skipSilence'] and es.isSilent(frame):
            continue

        frame_spectrum = spectrum(window(frame))

        # pitch descriptors (the confidence output is not used here)
        frame_pitch, _ = pitch_detection(frame_spectrum)

        # spectral peaks based descriptors
        frame_frequencies, frame_magnitudes = spectral_peaks(frame_spectrum)

        # Error correction (hoinx, 2015-12): discard peaks reported below
        # 1 Hz before handing them to HarmonicPeaks.
        keep = ~(frame_frequencies < 1)
        frame_frequencies = frame_frequencies[keep]
        frame_magnitudes = frame_magnitudes[keep]

        (frame_harmonic_frequencies,
         frame_harmonic_magnitudes) = harmonic_peaks(frame_frequencies,
                                                     frame_magnitudes,
                                                     frame_pitch)

        # the harmonic descriptors are only computed when there is more
        # than one harmonic peak
        if len(frame_harmonic_frequencies) > 1:
            pool.add(namespace + '.' + 'inharmonicity',
                     inharmonicity(frame_harmonic_frequencies,
                                   frame_harmonic_magnitudes))
            pool.add(namespace + '.' + 'tristimulus',
                     tristimulus(frame_harmonic_frequencies,
                                 frame_harmonic_magnitudes))
            pool.add(namespace + '.' + 'odd2evenharmonicenergyratio',
                     odd2evenharmonicenergyratio(frame_harmonic_frequencies,
                                                 frame_harmonic_magnitudes))

        # display of progress report
        progress.update(n_frames)
        n_frames += 1

    # descriptors below are computed once, from the whole-signal envelope
    file_envelope = ess.Envelope()(audio)

    # temporal statistics
    decrease = ess.Decrease()
    pool.add(namespace + '.' + 'temporal_decrease', decrease(file_envelope))

    centralmoments = ess.CentralMoments()
    distributionshape = ess.DistributionShape()
    (file_spread, file_skewness,
     file_kurtosis) = distributionshape(centralmoments(file_envelope))
    pool.add(namespace + '.' + 'temporal_spread', file_spread)
    pool.add(namespace + '.' + 'temporal_skewness', file_skewness)
    pool.add(namespace + '.' + 'temporal_kurtosis', file_kurtosis)

    centroid = ess.Centroid()
    pool.add(namespace + '.' + 'temporal_centroid', centroid(file_envelope))

    # effective duration
    effectiveduration = ess.EffectiveDuration()
    pool.add(namespace + '.' + 'effective_duration',
             effectiveduration(file_envelope))

    # log attack time (computed on the raw audio, not on the envelope)
    logattacktime = ess.LogAttackTime()
    pool.add(namespace + '.' + 'logattacktime', logattacktime(audio))

    # strong decay
    strongdecay = ess.StrongDecay()
    pool.add(namespace + '.' + 'strongdecay', strongdecay(file_envelope))

    # dynamic profile
    flatness = ess.FlatnessSFX()
    pool.add(namespace + '.' + 'flatness', flatness(file_envelope))

    # NOTE: the 'onsets_number' descriptor is disabled; it read
    # pool['rhythm.onset_times'], which this function does not produce.

    # morphological descriptors
    max_to_total = ess.MaxToTotal()
    pool.add(namespace + '.' + 'max_to_total', max_to_total(file_envelope))

    tc_to_total = ess.TCToTotal()
    pool.add(namespace + '.' + 'tc_to_total', tc_to_total(file_envelope))

    derivativeSFX = ess.DerivativeSFX()
    (der_av_after_max, max_der_before_max) = derivativeSFX(file_envelope)
    pool.add(namespace + '.' + 'der_av_after_max', der_av_after_max)
    pool.add(namespace + '.' + 'max_der_before_max', max_der_before_max)

    # NOTE: the pitch-profile descriptors (pitch_max_to_total,
    # pitch_min_to_total, pitch_centroid,
    # pitch_after_max_to_before_max_energy_ratio) are disabled; they read
    # pool['lowlevel.pitch'], which this function does not produce.

    progress.finish()
Example #17
0
def key_ecir(input_audio_file, output_text_file, **kwargs):
    """Estimate the key of an audio file and write it to a text file.

    The audio is analysed frame by frame (window, spectrum, spectral peaks,
    optional spectral whitening, HPCP); the HPCP vectors with a positive sum
    are averaged and the mean vector is passed to essentia's ``Key``
    algorithm.

    :param input_audio_file: path of the audio file to analyse
    :param output_text_file: path of the text file that receives one line
        with the first two outputs of ``Key`` separated by a tab
    :param kwargs: analysis settings; when none are given the module-level
        ``KEY_SETTINGS`` mapping is used
    :return: tuple ``(key, confidence)`` where ``key`` is the tab-separated
        string written to the file and ``confidence`` is a pair holding the
        third and fourth outputs of the ``Key`` algorithm
    :raises ValueError: if detuning correction is enabled with a scope other
        than 'frame' or 'average', or if no frame produced a non-zero HPCP
    """
    if not kwargs:
        kwargs = KEY_SETTINGS

    loader = estd.MonoLoader(filename=input_audio_file,
                             sampleRate=kwargs["SAMPLE_RATE"])
    cut = estd.FrameCutter(frameSize=kwargs["WINDOW_SIZE"],
                           hopSize=kwargs["HOP_SIZE"])
    window = estd.Windowing(size=kwargs["WINDOW_SIZE"],
                            type=kwargs["WINDOW_SHAPE"])
    rfft = estd.Spectrum(size=kwargs["WINDOW_SIZE"])
    sw = estd.SpectralWhitening(maxFrequency=kwargs["MAX_HZ"],
                                sampleRate=kwargs["SAMPLE_RATE"])
    speaks = estd.SpectralPeaks(
        magnitudeThreshold=kwargs["SPECTRAL_PEAKS_THRESHOLD"],
        maxFrequency=kwargs["MAX_HZ"],
        minFrequency=kwargs["MIN_HZ"],
        maxPeaks=kwargs["SPECTRAL_PEAKS_MAX"],
        sampleRate=kwargs["SAMPLE_RATE"])
    hpcp = estd.HPCP(bandPreset=kwargs["HPCP_BAND_PRESET"],
                     splitFrequency=kwargs["HPCP_SPLIT_HZ"],
                     harmonics=kwargs["HPCP_HARMONICS"],
                     maxFrequency=kwargs["MAX_HZ"],
                     minFrequency=kwargs["MIN_HZ"],
                     nonLinear=kwargs["HPCP_NON_LINEAR"],
                     normalized=kwargs["HPCP_NORMALIZE"],
                     referenceFrequency=kwargs["HPCP_REFERENCE_HZ"],
                     sampleRate=kwargs["SAMPLE_RATE"],
                     size=kwargs["HPCP_SIZE"],
                     weightType=kwargs["HPCP_WEIGHT_TYPE"],
                     windowSize=kwargs["HPCP_WEIGHT_WINDOW_SEMITONES"],
                     maxShifted=kwargs["HPCP_SHIFT"])

    key_estimator = estd.Key(numHarmonics=kwargs["KEY_HARMONICS"],
                             pcpSize=kwargs["HPCP_SIZE"],
                             profileType=kwargs["KEY_PROFILE"],
                             slope=kwargs["KEY_SLOPE"],
                             usePolyphony=kwargs["KEY_POLYPHONY"],
                             useThreeChords=kwargs["KEY_USE_THREE_CHORDS"])

    audio = loader()

    if kwargs["HIGHPASS_CUTOFF"] is not None:
        hpf = estd.HighPass(cutoffFrequency=kwargs["HIGHPASS_CUTOFF"],
                            sampleRate=kwargs["SAMPLE_RATE"])
        # the same high-pass filter is applied three times in a row
        audio = hpf(hpf(hpf(audio)))

    if kwargs["DURATION"] is not None:
        # NOTE(review): the slice ends at DURATION * SAMPLE_RATE, not at
        # (START_TIME + DURATION) * SAMPLE_RATE -- confirm this is intended
        # for a non-zero START_TIME.
        audio = audio[(kwargs["START_TIME"] *
                       kwargs["SAMPLE_RATE"]):(kwargs["DURATION"] *
                                               kwargs["SAMPLE_RATE"])]

    # The detuning settings do not change between frames, so decide once
    # how each HPCP vector has to be treated.
    detuning_correction = kwargs["DETUNING_CORRECTION"]
    detuning_scope = kwargs["DETUNING_CORRECTION_SCOPE"]
    keep_raw = not detuning_correction or detuning_scope == 'average'
    correct_each_frame = not keep_raw and detuning_scope == 'frame'

    number_of_frames = int(len(audio) / kwargs["HOP_SIZE"])
    chroma = []
    for _ in range(number_of_frames):
        # each call to the frame cutter yields the next frame of the audio
        spek = rfft(window(cut(audio)))
        frequencies, magnitudes = speaks(spek)
        if kwargs["SPECTRAL_WHITENING"]:
            magnitudes = sw(spek, frequencies, magnitudes)
        vector = hpcp(frequencies, magnitudes)

        # frames whose HPCP carries no energy are ignored
        if not np.sum(vector) > 0:
            continue

        if keep_raw:
            chroma.append(vector)
        elif correct_each_frame:
            chroma.append(_detuning_correction(vector, kwargs["HPCP_SIZE"]))
        else:
            raise ValueError(
                "SHIFT_SCOPE must be set to 'frame' or 'average'")

    if not chroma:
        # averaging an empty list would only produce NaN and fail later
        raise ValueError(
            "no frame with a non-zero HPCP found in " + str(input_audio_file))

    chroma = np.mean(chroma, axis=0)

    if detuning_correction and detuning_scope == 'average':
        chroma = _detuning_correction(chroma, kwargs["HPCP_SIZE"])

    estimation = key_estimator(chroma.tolist())
    confidence = (estimation[2], estimation[3])
    key = estimation[0] + '\t' + estimation[1]

    # the context manager closes the file even if the write fails
    with open(output_text_file, 'w') as textfile:
        textfile.write(key + '\n')

    return key, confidence
Example #18
0
def compute(audio, pool, options):
    """Compute low-level descriptors for ``audio`` and add them to ``pool``.

    Every frame contributes silence-rate flags; frames that are not skipped
    additionally contribute temporal, spectral, Bark-band, MFCC, spectral
    contrast, dissonance, pitch and spectral-complexity descriptors.  After
    the frame loop a MIDI-note histogram of the detected pitches and its
    spread are added.  All keys are prefixed with the module-level
    ``namespace``.

    :param audio: audio samples fed to the frame generator
    :param pool: pool object that receives the values through ``pool.add``
    :param options: mapping providing 'sampleRate', 'frameSize', 'hopSize',
        'windowType' and 'skipSilence'
    :raises ess.EssentiaError: if no frame was processed, i.e. the
        'zerocrossingrate' descriptor is missing from the pool
    """
    # analysis parameters
    sampleRate = options['sampleRate']
    frameSize = options['frameSize']
    hopSize = options['hopSize']
    windowType = options['windowType']

    # temporal descriptors
    lpc = ess.LPC(order=10, type='warped', sampleRate=sampleRate)
    zerocrossingrate = ess.ZeroCrossingRate()

    # frame algorithms
    frames = ess.FrameGenerator(audio=audio, frameSize=frameSize, hopSize=hopSize)
    window = ess.Windowing(size=frameSize, zeroPadding=0, type=windowType)
    spectrum = ess.Spectrum(size=frameSize)

    # spectral algorithms
    barkbands = ess.BarkBands(sampleRate=sampleRate)
    centralmoments = ess.CentralMoments()
    crest = ess.Crest()
    centroid = ess.Centroid()
    decrease = ess.Decrease()
    spectral_contrast = ess.SpectralContrast(frameSize=frameSize,
                                             sampleRate=sampleRate,
                                             numberBands=6,
                                             lowFrequencyBound=20,
                                             highFrequencyBound=11000,
                                             neighbourRatio=0.4,
                                             staticDistribution=0.15)
    distributionshape = ess.DistributionShape()
    energy = ess.Energy()
    # energyband_bass, energyband_middle and energyband_high parameters come from "standard" hi-fi equalizers
    energyband_bass = ess.EnergyBand(startCutoffFrequency=20.0, stopCutoffFrequency=150.0, sampleRate=sampleRate)
    energyband_middle_low = ess.EnergyBand(startCutoffFrequency=150.0, stopCutoffFrequency=800.0, sampleRate=sampleRate)
    energyband_middle_high = ess.EnergyBand(startCutoffFrequency=800.0, stopCutoffFrequency=4000.0,
                                            sampleRate=sampleRate)
    energyband_high = ess.EnergyBand(startCutoffFrequency=4000.0, stopCutoffFrequency=20000.0, sampleRate=sampleRate)
    flatnessdb = ess.FlatnessDB()
    flux = ess.Flux()
    hfc = ess.HFC()
    mfcc = ess.MFCC()
    rolloff = ess.RollOff()
    rms = ess.RMS()
    strongpeak = ess.StrongPeak()

    # pitch algorithms
    pitch_detection = ess.PitchYinFFT(frameSize=frameSize, sampleRate=sampleRate)
    pitch_salience = ess.PitchSalience()

    # dissonance
    spectral_peaks = ess.SpectralPeaks(sampleRate=sampleRate, orderBy='frequency')
    dissonance = ess.Dissonance()

    # spectral complexity
    # magnitudeThreshold = 0.005 is hardcoded for a "blackmanharris62" frame
    spectral_complexity = ess.SpectralComplexity(magnitudeThreshold=0.005)

    INFO('Computing Low-Level descriptors...')

    # used for a nice progress display
    progress = Progress(total=frames.num_frames())
    n_frames = 0

    # every detected pitch (valid or not) is kept for the histogram below
    pitches = []

    for frame in frames:

        # silence rate: recorded for every frame, including skipped ones
        pool.add(namespace + '.' + 'silence_rate_60dB', is_silent_threshold(frame, -60))
        pool.add(namespace + '.' + 'silence_rate_30dB', is_silent_threshold(frame, -30))
        pool.add(namespace + '.' + 'silence_rate_20dB', is_silent_threshold(frame, -20))

        if options['skipSilence'] and es.isSilent(frame):
            continue

        # temporal descriptors
        pool.add(namespace + '.' + 'zerocrossingrate', zerocrossingrate(frame))
        frame_lpc, _ = lpc(frame)  # the second LPC output is not stored
        pool.add(namespace + '.' + 'temporal_lpc', frame_lpc)

        frame_windowed = window(frame)
        frame_spectrum = spectrum(frame_windowed)

        # spectrum-based descriptors
        power_spectrum = frame_spectrum ** 2
        pool.add(namespace + '.' + 'spectral_centroid', centroid(power_spectrum))
        pool.add(namespace + '.' + 'spectral_decrease', decrease(power_spectrum))
        pool.add(namespace + '.' + 'spectral_energy', energy(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_low', energyband_bass(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_middle_low', energyband_middle_low(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_middle_high', energyband_middle_high(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_high', energyband_high(frame_spectrum))
        pool.add(namespace + '.' + 'hfc', hfc(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_rms', rms(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_flux', flux(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_rolloff', rolloff(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_strongpeak', strongpeak(frame_spectrum))

        # central moments descriptors
        frame_centralmoments = centralmoments(power_spectrum)
        (frame_spread, frame_skewness, frame_kurtosis) = distributionshape(frame_centralmoments)
        pool.add(namespace + '.' + 'spectral_kurtosis', frame_kurtosis)
        pool.add(namespace + '.' + 'spectral_spread', frame_spread)
        pool.add(namespace + '.' + 'spectral_skewness', frame_skewness)

        # dissonance
        (frame_frequencies, frame_magnitudes) = spectral_peaks(frame_spectrum)
        frame_dissonance = dissonance(frame_frequencies, frame_magnitudes)
        pool.add(namespace + '.' + 'dissonance', frame_dissonance)

        # mfcc (only the coefficients are stored, not the mel bands)
        (frame_melbands, frame_mfcc) = mfcc(frame_spectrum)
        pool.add(namespace + '.' + 'mfcc', frame_mfcc)

        # spectral contrast (only the coefficients are stored, not the valleys)
        (sc_coeffs, sc_valleys) = spectral_contrast(frame_spectrum)
        pool.add(namespace + '.' + 'spectral_contrast', sc_coeffs)

        # barkbands-based descriptors
        frame_barkbands = barkbands(frame_spectrum)
        pool.add(namespace + '.' + 'barkbands', frame_barkbands)
        pool.add(namespace + '.' + 'spectral_crest', crest(frame_barkbands))
        pool.add(namespace + '.' + 'spectral_flatness_db', flatnessdb(frame_barkbands))
        barkbands_centralmoments = ess.CentralMoments(range=len(frame_barkbands) - 1)
        (barkbands_spread, barkbands_skewness, barkbands_kurtosis) = distributionshape(
            barkbands_centralmoments(frame_barkbands))
        pool.add(namespace + '.' + 'barkbands_spread', barkbands_spread)
        pool.add(namespace + '.' + 'barkbands_skewness', barkbands_skewness)
        pool.add(namespace + '.' + 'barkbands_kurtosis', barkbands_kurtosis)

        # pitch descriptors: only pitches in (0, 20000] Hz go to the pool
        frame_pitch, frame_pitch_confidence = pitch_detection(frame_spectrum)
        if frame_pitch > 0 and frame_pitch <= 20000.:
            pool.add(namespace + '.' + 'pitch', frame_pitch)
        pitches.append(frame_pitch)
        pool.add(namespace + '.' + 'pitch_instantaneous_confidence', frame_pitch_confidence)

        # the last spectrum bin is dropped before computing the salience
        frame_pitch_salience = pitch_salience(frame_spectrum[:-1])
        pool.add(namespace + '.' + 'pitch_salience', frame_pitch_salience)

        # spectral complexity
        pool.add(namespace + '.' + 'spectral_complexity', spectral_complexity(frame_spectrum))

        # display of progress report
        progress.update(n_frames)
        n_frames += 1

    # 'zerocrossingrate' is added for every processed frame, so its absence
    # means that no frame was processed: treat the file as silent
    if 'zerocrossingrate' not in descriptorNames(pool.descriptorNames(), namespace):
        raise ess.EssentiaError('This is a silent file!')

    # build pitch value histogram
    midipitchhist = _midi_pitch_histogram(pitches)
    pool.add(namespace + '.' + 'spectral_pitch_histogram', midipitchhist)

    pitch_centralmoments = ess.CentralMoments(range=len(midipitchhist) - 1)
    (pitch_histogram_spread, pitch_histogram_skewness, pitch_histogram_kurtosis) = distributionshape(
        pitch_centralmoments(midipitchhist))
    pool.add(namespace + '.' + 'spectral_pitch_histogram_spread', pitch_histogram_spread)

    progress.finish()


def _midi_pitch_histogram(pitches):
    """Return a normalised MIDI-note histogram of ``pitches`` (given in Hz).

    The result is a list with at least 128 entries.  Bin 0 holds the share
    of pitches that could not be mapped to a MIDI note.
    """
    from math import log
    from numpy import bincount

    midipitches = []
    unknown = 0
    for freq in pitches:
        if freq > 0. and freq <= 12600:
            # Convert from Hz to a MIDI note number.  The value is truncated
            # to an integer because bincount rejects floating-point input.
            note = int(12 * (log(freq / 6.875) / 0.69314718055995) - 3.)
            if note < 0:
                # bincount cannot count negative values
                unknown += 1
            else:
                midipitches.append(note)
        else:
            unknown += 1

    if not midipitches:
        # no usable pitch at all: everything falls into the "unknown" bin
        histogram = [0.] * 128
        histogram[0] = 1.
        return histogram

    # compute histogram
    counts = bincount(midipitches)
    # set 0 midi pitch to be the number of pruned values
    counts[0] = unknown
    # normalise (the total is computed once, not once per bin)
    total = float(sum(counts))
    histogram = [val / total for val in counts]
    # zero pad up to 128 bins
    histogram.extend([0.0] * (128 - len(histogram)))
    return histogram
Example #19
0
def key_aes(input_audio_file, output_text_file, **kwargs):
    """
    Estimate the overall key of an audio track, optionally with extra
    modal information, and write it to a text file.

    The audio is analysed frame by frame (window, spectrum, spectral peaks,
    optional spectral whitening, HPCP); the HPCP vectors with a positive sum
    are summed, peak-normalised and passed to ``estimate_key``.

    :type input_audio_file: str
    :type output_text_file: str
    :param kwargs: analysis settings; when none are given the module-level
        ``KEY_SETTINGS`` mapping is used
    :return: tuple ``(key, correlation_value)``; note that the plain string
        ``'Silence'`` is returned instead (and no file is written) when no
        frame produced a non-zero HPCP
    :raises NameError: if detuning correction is enabled with a scope other
        than 'frame' or 'average'

    """
    if not kwargs:
        kwargs = KEY_SETTINGS

    loader = estd.MonoLoader(filename=input_audio_file,
                             sampleRate=kwargs["SAMPLE_RATE"])

    cut = estd.FrameCutter(frameSize=kwargs["WINDOW_SIZE"],
                           hopSize=kwargs["HOP_SIZE"])

    window = estd.Windowing(size=kwargs["WINDOW_SIZE"],
                            type=kwargs["WINDOW_SHAPE"])

    rfft = estd.Spectrum(size=kwargs["WINDOW_SIZE"])

    sw = estd.SpectralWhitening(maxFrequency=kwargs["MAX_HZ"],
                                sampleRate=kwargs["SAMPLE_RATE"])

    speaks = estd.SpectralPeaks(
        magnitudeThreshold=kwargs["SPECTRAL_PEAKS_THRESHOLD"],
        maxFrequency=kwargs["MAX_HZ"],
        minFrequency=kwargs["MIN_HZ"],
        maxPeaks=kwargs["SPECTRAL_PEAKS_MAX"],
        sampleRate=kwargs["SAMPLE_RATE"])

    hpcp = estd.HPCP(bandPreset=kwargs["HPCP_BAND_PRESET"],
                     splitFrequency=kwargs["HPCP_SPLIT_HZ"],
                     harmonics=kwargs["HPCP_HARMONICS"],
                     maxFrequency=kwargs["MAX_HZ"],
                     minFrequency=kwargs["MIN_HZ"],
                     nonLinear=kwargs["HPCP_NON_LINEAR"],
                     normalized=kwargs["HPCP_NORMALIZE"],
                     referenceFrequency=kwargs["HPCP_REFERENCE_HZ"],
                     sampleRate=kwargs["SAMPLE_RATE"],
                     size=kwargs["HPCP_SIZE"],
                     weightType=kwargs["HPCP_WEIGHT_TYPE"],
                     windowSize=kwargs["HPCP_WEIGHT_WINDOW_SEMITONES"],
                     maxShifted=kwargs["HPCP_SHIFT"])

    audio = loader()

    if kwargs["HIGHPASS_CUTOFF"] is not None:
        hpf = estd.HighPass(cutoffFrequency=kwargs["HIGHPASS_CUTOFF"],
                            sampleRate=kwargs["SAMPLE_RATE"])
        # the same high-pass filter is applied three times in a row
        audio = hpf(hpf(hpf(audio)))

    if kwargs["DURATION"] is not None:
        # NOTE(review): the slice ends at DURATION * SAMPLE_RATE, not at
        # (START_TIME + DURATION) * SAMPLE_RATE -- confirm this is intended
        # for a non-zero START_TIME.
        audio = audio[(kwargs["START_TIME"] *
                       kwargs["SAMPLE_RATE"]):(kwargs["DURATION"] *
                                               kwargs["SAMPLE_RATE"])]

    # The detuning settings do not change between frames, so decide once
    # how each HPCP vector has to be treated.
    detuning_correction = kwargs["DETUNING_CORRECTION"]
    detuning_scope = kwargs["DETUNING_CORRECTION_SCOPE"]
    keep_raw = not detuning_correction or detuning_scope == 'average'
    correct_each_frame = not keep_raw and detuning_scope == 'frame'

    number_of_frames = int(len(audio) / kwargs["HOP_SIZE"])
    chroma = []
    for _ in range(number_of_frames):
        # each call to the frame cutter yields the next frame of the audio
        spek = rfft(window(cut(audio)))
        frequencies, magnitudes = speaks(spek)
        if kwargs["SPECTRAL_WHITENING"]:
            magnitudes = sw(spek, frequencies, magnitudes)

        pcp = hpcp(frequencies, magnitudes)

        # frames whose HPCP carries no energy are ignored
        if not np.sum(pcp) > 0:
            continue

        if keep_raw:
            chroma.append(pcp)
        elif correct_each_frame:
            chroma.append(_detuning_correction(pcp, kwargs["HPCP_SIZE"]))
        else:
            raise NameError(
                "SHIFT_SCOPE must be set to 'frame' or 'average'.")

    if not chroma:
        return 'Silence'

    chroma = np.sum(chroma, axis=0)
    chroma = norm_peak(chroma)

    if kwargs["PCP_THRESHOLD"] is not None:
        chroma = vector_threshold(chroma, kwargs["PCP_THRESHOLD"])

    if detuning_correction and detuning_scope == 'average':
        chroma = _detuning_correction(chroma, kwargs["HPCP_SIZE"])

    # Adjust to essentia's HPCP calculation starting on A (pc = 9)
    chroma = np.roll(chroma, -3 * (kwargs["HPCP_SIZE"] // 12))

    estimation_1 = estimate_key(chroma,
                                kwargs["KEY_PROFILE"],
                                kwargs["PROFILE_INTERPOLATION"],
                                conf_thres=kwargs["NOKEY_THRESHOLD"],
                                vocabulary=kwargs["KEY_VOCABULARY"])

    key_1 = estimation_1[0]
    correlation_value = estimation_1[1]

    # the first estimation is the result unless the modal analysis below
    # overrides it
    key = key_1
    if kwargs["WITH_MODAL_DETAILS"]:
        estimation_2 = _key7(chroma, kwargs["PROFILE_INTERPOLATION"])
        key_2 = estimation_2[0] + '\t' + estimation_2[1]
        fields = (key_1 + '\t' + key_2).split('\t')

        # Assign monotonic track to minor:
        if fields[3] == 'monotonic' and fields[0] == fields[2]:
            key = '{0}\tminor'.format(fields[0])

    # the context manager closes the file even if the write fails
    with open(output_text_file, 'w') as textfile:
        textfile.write(key)

    return key, correlation_value
Example #20
0
# retrieve filenames from folder:
soundfiles = os.listdir(audio_folder)
if '.DS_Store' in soundfiles:
    soundfiles.remove('.DS_Store')
    
# ANALYSIS
print "\nANALYSIS..."
for item in soundfiles:
    loader = estd.MonoLoader(filename=audio_folder + '/' +item,
                             sampleRate=sample_rate)
    window = estd.Windowing(size=window_size, 
                            type="blackmanharris62")
    rfft = estd.Spectrum(size=window_size)
    speaks = estd.SpectralPeaks(orderBy="magnitude",
                                magnitudeThreshold=magnitude_threshold,
                                minFrequency=min_frequency,
                                maxFrequency=max_frequency,
                                maxPeaks=max_peaks,
                                sampleRate=sample_rate)
    hpcp = estd.HPCP(bandPreset=band_preset,
                     harmonics = harmonics,
                     minFrequency=min_frequency,
                     maxFrequency=max_frequency,
                     nonLinear=non_linear,
                     normalized=normalize,
                     sampleRate=sample_rate,
                     weightType=weight_type,
                     windowSize=weight_window_size)
    key = estd.Key(numHarmonics=harmonics_key,
                   slope=slope,
                   usePolyphony=polyphony,
                   useThreeChords=three_chords,
Example #21
0
def reComputeDescriptors(inputAudioFile, outputJsonFile):
    """
    Compute energy-thresholded mean descriptors of an audio file and dump
    them to a JSON file.

    The audio is cut into frames and the energy of each frame is divided by
    the maximum frame energy of the file.  One pool is kept per value in
    ``dscr.threshold``; a frame's descriptors are added to every pool whose
    threshold does not exceed the frame's normalised energy.  Each pool is
    then aggregated (mean only) and merged into a single feature dictionary
    through ``appendFeatures``.

    :param inputAudioFile: path of the audio file to analyse
    :param outputJsonFile: path of the JSON file to write
    :return: None
    """

    M = 2048  # frame size
    H = 1024  # hop size
    fs = 44100

    W = 'blackmanharris62'

    # Spectrum, Windowing, MFCC, PitchYinFFT and SpectralContrast are used
    # with their default size / sample-rate parameters.
    spectrum = ess.Spectrum()
    window = ess.Windowing(type=W)
    mfcc = ess.MFCC()

    spectral_peaks = ess.SpectralPeaks(minFrequency=1,
                                       maxFrequency=20000,
                                       maxPeaks=100,
                                       sampleRate=fs,
                                       magnitudeThreshold=0,
                                       orderBy="magnitude")

    dissonance = ess.Dissonance()
    pitch_detection = ess.PitchYinFFT()
    harmonic_peaks = ess.HarmonicPeaks()
    inharmonicity = ess.Inharmonicity()
    spectral_contrast = ess.SpectralContrast()
    centroid = ess.Centroid()
    log_attack_time = ess.LogAttackTime()
    hfc = ess.HFC()

    # magnitudeThreshold = 0.005 is hardcoded for a "blackmanharris62" frame, see lowlevel.py
    spectral_complexity = ess.SpectralComplexity(magnitudeThreshold=0.005)

    energy = ess.Energy()

    x = ess.MonoLoader(filename=inputAudioFile, sampleRate=fs)()

    # first pass: maximum frame energy, used to normalise the second pass
    frames = ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True)
    E_max = np.max([energy(frame) for frame in frames])

    # second pass: a fresh generator is created to iterate the frames again
    frames = ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True)

    pools = [(t, es.Pool()) for t in dscr.threshold]
    for frame in frames:

        eNorm = energy(frame) / E_max

        # pools whose energy threshold is reached by this frame
        threshPools = [pool for t, pool in pools if eNorm >= t]

        def add_to_pools(name, value):
            # store one descriptor value in every selected pool
            for pool in threshPools:
                pool.add(name, value)

        mX = spectrum(window(frame))
        mfcc_bands, mfcc_coeffs = mfcc(mX)
        add_to_pools('lowlevel.mfcc', mfcc_coeffs)

        pfreq, pmag = spectral_peaks(mX)

        # the peaks are requested ordered by magnitude; re-order them by
        # ascending frequency before they are used below
        inds = pfreq.argsort()
        pfreq_sorted = pfreq[inds]
        pmag_sorted = pmag[inds]

        add_to_pools('lowlevel.dissonance',
                     dissonance(pfreq_sorted, pmag_sorted))

        pitch, pitch_confidence = pitch_detection(mX)

        phfreq, phmag = harmonic_peaks(pfreq_sorted, pmag_sorted, pitch)
        # inharmonicity is only computed with more than one harmonic peak
        if len(phfreq) > 1:
            add_to_pools('sfx.inharmonicity', inharmonicity(phfreq, phmag))

        sc_coeffs, sc_valleys = spectral_contrast(mX)
        add_to_pools('lowlevel.spectral_contrast', sc_coeffs)

        add_to_pools('lowlevel.spectral_centroid', centroid(mX))
        add_to_pools('sfx.logattacktime', log_attack_time(frame))
        add_to_pools('lowlevel.hfc', hfc(mX))
        add_to_pools('lowlevel.spectral_complexity', spectral_complexity(mX))

    calc_Mean_Var = ess.PoolAggregator(defaultStats=['mean'])
    aggrPools = [calc_Mean_Var(pool) for t, pool in pools]

    features = {}
    # NOTE(review): the labels come from dscr.thresholdSelect while the pools
    # come from dscr.threshold; the two are assumed to be index-aligned --
    # confirm.
    for i, aggrPool in enumerate(aggrPools):
        appendFeatures(features, aggrPool, "ethc" + str(dscr.thresholdSelect[i]))

    # the context manager flushes and closes the output file
    with open(outputJsonFile, 'w') as jsonFile:
        json.dump(features, jsonFile)
Example #22
0
def key_detector():
    reloj()
    # create directory to write the results with an unique time id:
    if results_to_file or results_to_csv:
        uniqueTime = str(int(tiempo()))
        wd = os.getcwd()
        temp_folder = wd + '/KeyDetection_' + uniqueTime
        os.mkdir(temp_folder)
    if results_to_csv:
        import csv
        csvFile = open(temp_folder + '/Estimation_&_PCP.csv', 'w')
        lineWriter = csv.writer(csvFile, delimiter=',')
    # retrieve files and filenames according to the desired settings:
    if analysis_mode == 'title':
        allfiles = os.listdir(audio_folder)
        if '.DS_Store' in allfiles: allfiles.remove('.DS_Store')
        for item in collection:
            collection[collection.index(item)] = ' > ' + item + '.'
        for item in genre:
            genre[genre.index(item)] = ' < ' + item + ' > '
        for item in modality:
            modality[modality.index(item)] = ' ' + item + ' < '
        analysis_files = []
        for item in allfiles:
            if any(e1 for e1 in collection if e1 in item):
                if any(e2 for e2 in genre if e2 in item):
                    if any(e3 for e3 in modality if e3 in item):
                        analysis_files.append(item)
        song_instances = len(analysis_files)
        print song_instances, 'songs matching the selected criteria:'
        print collection, genre, modality
        if limit_analysis == 0:
            pass
        elif limit_analysis < song_instances:
            analysis_files = sample(analysis_files, limit_analysis)
            print "taking", limit_analysis, "random samples...\n"
    else:
        analysis_files = os.listdir(audio_folder)
        if '.DS_Store' in analysis_files:
            analysis_files.remove('.DS_Store')
        print len(analysis_files), '\nsongs in folder.\n'
        groundtruth_files = os.listdir(groundtruth_folder)
        if '.DS_Store' in groundtruth_files:
            groundtruth_files.remove('.DS_Store')
    # ANALYSIS
    # ========
    if verbose:
        print "ANALYSING INDIVIDUAL SONGS..."
        print "============================="
    if confusion_matrix:
        matrix = 24 * 24 * [0]
    mirex_scores = []
    for item in analysis_files:
        # INSTANTIATE ESSENTIA ALGORITHMS
        # ===============================
        loader = estd.MonoLoader(filename=audio_folder + '/' + item,
                                 sampleRate=sample_rate)
        cut = estd.FrameCutter(frameSize=window_size, hopSize=hop_size)
        window = estd.Windowing(size=window_size, type=window_type)
        rfft = estd.Spectrum(size=window_size)
        sw = estd.SpectralWhitening(maxFrequency=max_frequency,
                                    sampleRate=sample_rate)
        speaks = estd.SpectralPeaks(magnitudeThreshold=magnitude_threshold,
                                    maxFrequency=max_frequency,
                                    minFrequency=min_frequency,
                                    maxPeaks=max_peaks,
                                    sampleRate=sample_rate)
        hpcp = estd.HPCP(bandPreset=band_preset,
                         harmonics=harmonics,
                         maxFrequency=max_frequency,
                         minFrequency=min_frequency,
                         nonLinear=non_linear,
                         normalized=normalize,
                         referenceFrequency=reference_frequency,
                         sampleRate=sample_rate,
                         size=hpcp_size,
                         splitFrequency=split_frequency,
                         weightType=weight_type,
                         windowSize=weight_window_size)
        key = estd.Key(numHarmonics=num_harmonics,
                       pcpSize=hpcp_size,
                       profileType=profile_type,
                       slope=slope,
                       usePolyphony=use_polyphony,
                       useThreeChords=use_three_chords)
        # ACTUAL ANALYSIS
        # ===============
        audio = loader()
        duration = len(audio)
        if skip_first_minute and duration > (sample_rate * 60):
            audio = audio[sample_rate * 60:]
            duration = len(audio)
        if first_n_secs > 0:
            if duration > (first_n_secs * sample_rate):
                audio = audio[:first_n_secs * sample_rate]
                duration = len(audio)
        if avoid_edges > 0:
            initial_sample = (avoid_edges * duration) / 100
            final_sample = duration - initial_sample
            audio = audio[initial_sample:final_sample]
            duration = len(audio)
        number_of_frames = duration / hop_size
        chroma = []
        for bang in range(number_of_frames):
            spek = rfft(window(cut(audio)))
            p1, p2 = speaks(spek)  # p1 are frequencies; p2 magnitudes
            if spectral_whitening:
                p2 = sw(spek, p1, p2)
            vector = hpcp(p1, p2)
            sum_vector = np.sum(vector)
            if sum_vector > 0:
                if shift_spectrum == False or shift_scope == 'average':
                    chroma.append(vector)
                elif shift_spectrum and shift_scope == 'frame':
                    vector = shift_vector(vector, hpcp_size)
                    chroma.append(vector)
                else:
                    print "shift_scope must be set to 'frame' or 'average'"
        chroma = np.mean(chroma, axis=0)
        if shift_spectrum and shift_scope == 'average':
            chroma = shift_vector(chroma, hpcp_size)
        estimation = key(chroma.tolist())
        result = estimation[0] + ' ' + estimation[1]
        confidence = estimation[2]
        if results_to_csv:
            chroma = list(chroma)
        # MIREX EVALUATION:
        # ================
        if analysis_mode == 'title':
            ground_truth = item[item.find(' = ') + 3:item.rfind(' < ')]
            if verbose and confidence < confidence_threshold:
                print item[:item.rfind(' = ')]
                print 'G:', ground_truth, '|| P:',
            if results_to_csv:
                title = item[:item.rfind(' = ')]
                lineWriter.writerow([
                    title, ground_truth, chroma[0], chroma[1], chroma[2],
                    chroma[3], chroma[4], chroma[5], chroma[6], chroma[7],
                    chroma[8], chroma[9], chroma[10], chroma[11], chroma[12],
                    chroma[13], chroma[14], chroma[15], chroma[16], chroma[17],
                    chroma[18], chroma[19], chroma[20], chroma[21], chroma[22],
                    chroma[23], chroma[24], chroma[25], chroma[26], chroma[27],
                    chroma[28], chroma[29], chroma[30], chroma[31], chroma[32],
                    chroma[33], chroma[34], chroma[35], result
                ])
            ground_truth = key_to_list(ground_truth)
            estimation = key_to_list(result)
            score = mirex_score(ground_truth, estimation)
            mirex_scores.append(score)
        else:
            filename_to_match = item[:item.rfind('.')] + '.txt'
            print filename_to_match
            if filename_to_match in groundtruth_files:
                groundtruth_file = open(
                    groundtruth_folder + '/' + filename_to_match, 'r')
                ground_truth = groundtruth_file.readline()
                if "\t" in ground_truth:
                    ground_truth = re.sub("\t", " ", ground_truth)
                if results_to_csv:
                    lineWriter.writerow([
                        filename_to_match, chroma[0], chroma[1], chroma[2],
                        chroma[3], chroma[4], chroma[5], chroma[6], chroma[7],
                        chroma[8], chroma[9], chroma[10], chroma[11],
                        chroma[12], chroma[13], chroma[14], chroma[15],
                        chroma[16], chroma[17], chroma[18], chroma[19],
                        chroma[20], chroma[21], chroma[22], chroma[23],
                        chroma[24], chroma[25], chroma[26], chroma[27],
                        chroma[28], chroma[29], chroma[30], chroma[31],
                        chroma[32], chroma[33], chroma[34], chroma[35], result
                    ])
                ground_truth = key_to_list(ground_truth)
                estimation = key_to_list(result)
                score = mirex_score(ground_truth, estimation)
                mirex_scores.append(score)
            else:
                print "FILE NOT FOUND... Skipping it from evaluation.\n"
                continue
        # CONFUSION MATRIX:
        # ================
        if confusion_matrix:
            xpos = (ground_truth[0] +
                    (ground_truth[0] * 24)) + (-1 *
                                               (ground_truth[1] - 1) * 24 * 12)
            ypos = ((estimation[0] - ground_truth[0]) +
                    (-1 * (estimation[1] - 1) * 12))
            matrix[(xpos + ypos)] = +matrix[(xpos + ypos)] + 1
        if verbose and confidence < confidence_threshold:
            print result, '(%.2f)' % confidence, '|| SCORE:', score, '\n'
        # WRITE RESULTS TO FILE:
        # =====================
        if results_to_file:
            with open(temp_folder + '/' + item[:-3] + 'txt', 'w') as textfile:
                textfile.write(result)
                textfile.close()
    if results_to_csv:
        csvFile.close()
    print len(mirex_scores), "files analysed in", reloj(), "secs.\n"
    if confusion_matrix:
        matrix = np.matrix(matrix)
        matrix = matrix.reshape(24, 24)
        print matrix
        if results_to_file:
            np.savetxt(
                temp_folder + '/_confusion_matrix.csv',
                matrix,
                fmt='%i',
                delimiter=',',
                header=
                'C,C#,D,Eb,E,F,F#,G,G#,A,Bb,B,Cm,C#m,Dm,Ebm,Em,Fm,F#m,Gm,G#m,Am,Bbm,Bm'
            )
    # MIREX RESULTS
    # =============
    evaluation_results = mirex_evaluation(mirex_scores)
    # WRITE INFO TO FILE
    # ==================
    if results_to_file:
        settings = "SETTINGS\n========\nAvoid edges ('%' of duration disregarded at both ends (0 = complete)) = " + str(
            avoid_edges
        ) + "\nfirst N secs = " + str(
            first_n_secs
        ) + "\nshift spectrum to fit tempered scale = " + str(
            shift_spectrum
        ) + "\nspectral whitening = " + str(
            spectral_whitening
        ) + "\nsample rate = " + str(sample_rate) + "\nwindow size = " + str(
            window_size
        ) + "\nhop size = " + str(hop_size) + "\nmagnitude threshold = " + str(
            magnitude_threshold
        ) + "\nminimum frequency = " + str(
            min_frequency
        ) + "\nmaximum frequency = " + str(
            max_frequency
        ) + "\nmaximum peaks = " + str(max_peaks) + "\nband preset = " + str(
            band_preset
        ) + "\nsplit frequency = " + str(
            split_frequency
        ) + "\nharmonics = " + str(harmonics) + "\nnon linear = " + str(
            non_linear
        ) + "\nnormalize = " + str(
            normalize
        ) + "\nreference frequency = " + str(
            reference_frequency
        ) + "\nhpcp size = " + str(
            hpcp_size
        ) + "\nweigth type = " + weight_type + "\nweight window size in semitones = " + str(
            weight_window_size
        ) + "\nharmonics key = " + str(num_harmonics) + "\nslope = " + str(
            slope) + "\nprofile = " + profile_type + "\npolyphony = " + str(
                use_polyphony) + "\nuse three chords = " + str(
                    use_three_chords)
        results_for_file = "\n\nEVALUATION RESULTS\n==================\nCorrect: " + str(
            evaluation_results[0]) + "\nFifth:  " + str(
                evaluation_results[1]) + "\nRelative: " + str(
                    evaluation_results[2]) + "\nParallel: " + str(
                        evaluation_results[3]) + "\nError: " + str(
                            evaluation_results[4]) + "\nWeighted: " + str(
                                evaluation_results[5])
        write_to_file = open(temp_folder + '/_SUMMARY.txt', 'w')
        write_to_file.write(settings)
        write_to_file.write(results_for_file)
        if analysis_mode == 'title':
            corpus = "\n\nANALYSIS CORPUS\n===============\n" + str(
                collection) + '\n' + str(
                    genre) + '\n' + str(modality) + '\n\n' + str(
                        len(mirex_scores)) + " files analysed.\n"
            write_to_file.write(corpus)
        write_to_file.close()
Example #23
0
# Module-level Essentia algorithm instances, created once at import time and
# exposed under the names below.

# Temporal descriptors
power = es.InstantPower()
log_attack_time = es.LogAttackTime()
effective_duration = es.EffectiveDuration()
auto_correlation = es.AutoCorrelation()
zero_crossing_rate = es.ZeroCrossingRate()

# Spectral descriptors
peak_freq = es.MaxMagFreq()
roll_off = es.RollOff()
flux = es.Flux()
flatness = es.Flatness()

# Harmonic descriptors
pitch = es.PitchYin(frameSize=1024)
# NOTE(review): the tiny positive minFrequency presumably keeps a 0 Hz (DC)
# peak out of the list handed to HarmonicPeaks -- confirm.
spectral_peaks = es.SpectralPeaks(minFrequency=1e-5)
harmonic_peaks = es.HarmonicPeaks()
inharmonicity = es.Inharmonicity()
oer = es.OddToEvenHarmonicEnergyRatio()
tristimulus = es.Tristimulus()

# MFCC
# NOTE(review): inputSize=513 presumably matches the spectrum of a 1024-sample
# frame (1024 / 2 + 1 bins), consistent with PitchYin's frameSize -- confirm.
mfcc = es.MFCC(inputSize=513)


class Audio:
    """A mono audio signal read from disk, paired with an empty descriptor pool."""

    def __init__(self, path):
        """Load the file at ``path`` and set up the instance attributes.

        ``path`` must expose a ``name`` attribute (as ``pathlib.Path`` objects
        do); its string form is what gets handed to the loader.
        """
        load_mono = es.MonoLoader(filename=str(path))
        samples = load_mono()

        self.audio = samples
        self.name = path.name
        self.pool = essentia.Pool()
def computeLoudness(audioFile,
                    outputExt='.loudness',
                    f0=-1,
                    HopSize=0.01,
                    FrameSize=0.04643990929,
                    BinResolution=10,
                    GuessUnvoiced=True,
                    VoicingTolerance=0.2,
                    MaxFrequency=20000,
                    interpolateLoudness=0,
                    maxSilDurIntp=0.25,
                    smoothLoudness=0):
    """Compute the loudness (harmonic energy) of the predominant source.

    For every analysis frame the harmonic peaks of the given pitch are
    detected and the loudness is taken as the square root of the summed
    squared harmonic magnitudes. The result is written as two tab-separated
    columns (time stamp, loudness) to ``<audioFile without extension> +
    outputExt``.

    Args:
        audioFile: path of the audio file; it is loaded as mono at 44100 Hz.
        outputExt: extension appended to the audio file's base name to build
            the output path.
        f0: source of the pitch track:
            * an ``int`` (the default ``-1``): estimate the pitch with
              Essentia's ``PredominantPitchMelodia``;
            * a ``str``: path of a text file whose second column holds the
              pitch values (``<time stamp> <pitch value>`` per line);
            * a ``numpy.ndarray``: the pitch sequence itself.
        HopSize: hop size in seconds.
        FrameSize: frame size in seconds (rounded up to an even number of
            samples).
        BinResolution, GuessUnvoiced, VoicingTolerance, MaxFrequency:
            forwarded to ``PredominantPitchMelodia``; only used when the pitch
            is estimated here.
        interpolateLoudness: when ``1``, gaps in the harmonic magnitudes and
            in the loudness are passed through ``InterpolateSilence``. Use the
            same ``maxSilDurIntp`` that was used to interpolate the pitch
            sequence so short intra-pattern pauses are treated alike.
        maxSilDurIntp: silence-duration limit (seconds) handed to
            ``InterpolateSilence``.
        smoothLoudness: when ``1``, median-filter the loudness with a kernel
            of ``round(50 ms / HopSize)`` samples.

    Raises:
        TypeError: if ``f0`` is not an ``int``, ``str`` or ``numpy.ndarray``.

    Returns:
        None. The result is only written to disk.
    """
    # reading audio file
    fs = 44100.0  # ES.AudioLoader(filename = audioFile)()[1]
    audio = ES.MonoLoader(filename=audioFile, sampleRate=fs)()

    # obtaining just the file name (the extension is replaced by outputExt)
    fname, _ = os.path.splitext(audioFile)

    # Plain Python ints: the ``np.int`` alias used previously was removed in
    # NumPy 1.24 and raises AttributeError there.
    hopNSamples = int(np.round(HopSize * fs))
    frameNSamples = int(np.round(FrameSize * fs))
    frameNSamples += frameNSamples % 2  # force an even frame length

    # checking the cases, possible types of input parameter f0
    if isinstance(f0, str):
        # a pitch file stored in the format <time stamps><pitch value>
        pitch = np.loadtxt(f0)[:, 1]
    elif isinstance(f0, np.ndarray):
        # the given sequence is the pitch sequence to be used as is
        pitch = f0
    elif isinstance(f0, int):
        # an integer (normally the default -1) means no pitch was provided:
        # run the predominant melody estimation and use its pitch estimate
        pitch = ES.PredominantPitchMelodia(hopSize=hopNSamples,
                                           frameSize=frameNSamples,
                                           binResolution=BinResolution,
                                           guessUnvoiced=GuessUnvoiced,
                                           voicingTolerance=VoicingTolerance,
                                           maxFrequency=MaxFrequency,
                                           minFrequency=60)(audio)[0]
    else:
        # Fail early with a clear message instead of a NameError on ``pitch``.
        raise TypeError(
            "f0 must be an int, a pitch file path (str) or a numpy.ndarray, "
            "got %s" % type(f0).__name__)

    # creating algorithm objects to be used for harmonic detection for each
    # audio frame
    WIN = ES.Windowing()
    SPECTRUM = ES.Spectrum()
    EQUALLOUD = ES.EqualLoudness()
    SPECPEAKS = ES.SpectralPeaks(sampleRate=fs, maxFrequency=8000)
    HARMDET = ES.HarmonicPeaks(maxHarmonics=30)

    audio_in = EQUALLOUD(audio)

    harmWghts = []
    for cnt, frame in enumerate(
            ES.FrameGenerator(audio_in,
                              frameSize=frameNSamples,
                              hopSize=hopNSamples)):
        # Stop as soon as the pitch track runs out of values.
        if cnt >= len(pitch):
            break
        spec = SPECTRUM(WIN(frame))
        p_freq, p_mags = SPECPEAKS(spec)
        # sometimes the first frequency peak corresponds to 0Hz (DC offset),
        # adding correction for that.
        if len(p_freq) > 0 and p_freq[0] == 0:
            p_freq = p_freq[1:]
            p_mags = p_mags[1:]
        # Keep only the harmonic magnitudes (second output of HarmonicPeaks).
        harmWghts.append(HARMDET(p_freq, p_mags, pitch[cnt])[1])

    if interpolateLoudness == 1:
        # interpolating harmonic weights, one harmonic (column) at a time
        harmWghts = np.array(harmWghts)
        harmWghtsIntrp = np.zeros(harmWghts.shape)
        for ii in range(harmWghts.shape[1]):
            harmWghtsIntrp[:, ii] = InterpolateSilence(harmWghts[:, ii], 0,
                                                       HopSize, maxSilDurIntp)
    else:
        harmWghtsIntrp = harmWghts

    # Loudness of a frame = root of the summed squared positive magnitudes.
    loudness = []
    for wghtsLocal in harmWghtsIntrp:
        indValid = np.where(wghtsLocal > 0)[0]
        loudness.append(np.sqrt(np.sum(np.power(wghtsLocal[indValid], 2))))

    if interpolateLoudness == 1:
        loudness = InterpolateSilence(loudness, 0, HopSize, maxSilDurIntp)

    if smoothLoudness == 1:
        # 50 ms median filter expressed in hops.
        loudness = medfilt(loudness, int(np.round(50.0 / (HopSize * 1000))))

    # generating time stamps (because its equally hopped)
    TStamps = np.arange(len(loudness)) * float(HopSize)
    dump = np.array([TStamps, loudness]).transpose()
    np.savetxt(fname + outputExt, dump, delimiter="\t")
Example #25
0
    def chroma_hpcp(self,
                    frameSize=4096,
                    hopSize=2048,
                    windowType='blackmanharris62',
                    harmonicsPerPeak=8,
                    magnitudeThreshold=1e-05,
                    maxPeaks=1000,
                    whitening=True,
                    referenceFrequency=440,
                    minFrequency=40,
                    maxFrequency=5000,
                    nonLinear=False,
                    numBins=12,
                    display=False):
        '''
        Compute Harmonic Pitch Class Profiles (HPCP) of ``self.audio_vector``
        with Essentia's standard mode, one HPCP vector per analysis frame.

        Please refer to the following work for a detailed explanation of the
        algorithm:
        [1]. Gómez, E. (2006). Tonal Description of Polyphonic Audio for Music Content Processing.
        For the full list of HPCP parameters see
        http://essentia.upf.edu/documentation/reference/std_HPCP.html

        Parameters
            frameSize : (integer, default = 4096)
                frame length in samples.
            hopSize : (integer, default = 2048)
                hop between consecutive frames in samples.
            windowType : (string, default = 'blackmanharris62')
                window type handed to the Windowing algorithm.
            harmonicsPerPeak : (integer, default = 8)
                number of harmonics for frequency contribution (HPCP
                ``harmonics`` parameter); 0 indicates exclusive fundamental
                frequency contribution.
            magnitudeThreshold : (real, default = 1e-05)
                currently NOT used: the peak detector is always created with
                a threshold of 0 (see the note in the code).
            maxPeaks : (integer, default = 1000)
                maximum number of spectral peaks kept per frame.
            whitening : (boolean, default = True)
                whether to apply spectral whitening to the peak magnitudes
                before computing the HPCP.
            referenceFrequency : (real, default = 440)
                the reference frequency for semitone index calculation [Hz].
            minFrequency : (real, default = 40)
                the minimum frequency that contributes to the HPCP [Hz].
            maxFrequency : (real, default = 5000)
                the maximum frequency that contributes to the HPCP [Hz].
            nonLinear : (boolean, default = False)
                apply non-linear post-processing to the output.
            numBins : (integer, default = 12)
                the size of the output HPCP (must be a positive nonzero
                multiple of 12).
            display : (boolean, default = False)
                when True, pass the chroma with its two axes swapped to
                ``display_chroma``.

        The sample rate is taken from ``self.fs``.

        Returns
            the per-frame HPCP vectors stored in the pool under 'tonal.hpcp'.
        '''

        audio = array(self.audio_vector)

        frameGenerator = estd.FrameGenerator(audio,
                                             frameSize=frameSize,
                                             hopSize=hopSize)

        window = estd.Windowing(type=windowType)

        spectrum = estd.Spectrum()

        # Refer http://essentia.upf.edu/documentation/reference/std_SpectralPeaks.html
        # NOTE(review): the magnitudeThreshold argument of this method is not
        # forwarded here; 0 is hard-coded. Left untouched because forwarding it
        # would change the numerical output for existing callers -- confirm
        # which behaviour is intended.
        spectralPeaks = estd.SpectralPeaks(magnitudeThreshold=0,
                                           maxFrequency=maxFrequency,
                                           minFrequency=minFrequency,
                                           maxPeaks=maxPeaks,
                                           orderBy="frequency",
                                           sampleRate=self.fs)

        # http://essentia.upf.edu/documentation/reference/std_SpectralWhitening.html
        spectralWhitening = estd.SpectralWhitening(maxFrequency=maxFrequency,
                                                   sampleRate=self.fs)

        # http://essentia.upf.edu/documentation/reference/std_HPCP.html
        hpcp = estd.HPCP(sampleRate=self.fs,
                         maxFrequency=maxFrequency,
                         minFrequency=minFrequency,
                         referenceFrequency=referenceFrequency,
                         nonLinear=nonLinear,
                         harmonics=harmonicsPerPeak,
                         size=numBins)

        pool = Pool()

        # compute hpcp for each frame and add the results to the pool
        for frame in frameGenerator:
            spectrum_mag = spectrum(window(frame))
            frequencies, magnitudes = spectralPeaks(spectrum_mag)
            if whitening:
                magnitudes = spectralWhitening(spectrum_mag, frequencies,
                                               magnitudes)
            pool.add('tonal.hpcp', hpcp(frequencies, magnitudes))

        if display:
            # The axis arguments belong to np.swapaxes; with the previous
            # parenthesisation np.swapaxes received a single argument and
            # raised TypeError whenever display=True.
            display_chroma(np.swapaxes(pool['tonal.hpcp'], 0, 1))

        return pool['tonal.hpcp']
Example #26
0
import essentia.standard as ess
# matplotlib without any blocking GUI
import matplotlib as mpl

mpl.use('Agg')
import matplotlib.pyplot as plt
import numpy as np

# Script: compute the HPCP of every frame of a cello recording and start a
# two-row figure with the waveform on top.

M = 1024  # analysis frame / window length in samples
N = 1024  # size passed to the Spectrum algorithm
H = 512  # hop size in samples
fs = 44100  # sample rate in Hz
spectrum = ess.Spectrum(size=N)
window = ess.Windowing(size=M, type='hann')
spectralPeaks = ess.SpectralPeaks()
hpcp = ess.HPCP()
x = ess.MonoLoader(filename='../../../sounds/cello-double.wav',
                   sampleRate=fs)()
hpcps = []

# Frame-wise chain: window -> magnitude spectrum -> spectral peaks -> HPCP.
for frame in ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True):
    mX = spectrum(window(frame))
    spectralPeaks_freqs, spectralPeaks_mags = spectralPeaks(mX)
    hpcp_vals = hpcp(spectralPeaks_freqs, spectralPeaks_mags)
    hpcps.append(hpcp_vals)
# One row per frame, one column per HPCP bin.
hpcps = np.array(hpcps)

plt.figure(1, figsize=(9.5, 7))

# Top panel: waveform against time in seconds.
plt.subplot(2, 1, 1)
plt.plot(np.arange(x.size) / float(fs), x, 'b')
Example #27
0
from pylab import *
from numpy import *

from smst.models import stft

# Script: set up the Essentia algorithms for a pitch-salience / pitch-contour
# melody extraction chain and start iterating over the frames of the input.
# NOTE(review): `ess` and `essentia` are used below but are not imported in the
# visible import lines -- confirm they are brought into scope elsewhere.

filename = '../../../sounds/carnatic.wav'
hopSize = 128  # hop size in samples
frameSize = 2048  # frame size in samples
sampleRate = 44100  # sample rate in Hz, passed to SpectralPeaks
guessUnvoiced = True  # forwarded to PitchContoursMelody

# zeroPadding of 3 * frameSize makes each windowed frame 4 * frameSize samples
# long, which matches the size given to Spectrum on the next line.
run_windowing = ess.Windowing(type='hann', zeroPadding=3 * frameSize)  # Hann window with x4 zero padding
run_spectrum = ess.Spectrum(size=frameSize * 4)
run_spectral_peaks = ess.SpectralPeaks(minFrequency=50,
                                       maxFrequency=10000,
                                       maxPeaks=100,
                                       sampleRate=sampleRate,
                                       magnitudeThreshold=0,
                                       orderBy="magnitude")
run_pitch_salience_function = ess.PitchSalienceFunction(magnitudeThreshold=60)
run_pitch_salience_function_peaks = ess.PitchSalienceFunctionPeaks(minFrequency=90, maxFrequency=800)
run_pitch_contours = ess.PitchContours(hopSize=hopSize, peakFrameThreshold=0.7)
run_pitch_contours_melody = ess.PitchContoursMelody(guessUnvoiced=guessUnvoiced,
                                                    hopSize=hopSize)

# Pool created up front; nothing is added to it in the visible lines.
pool = essentia.Pool();

# Load the recording as mono and apply the equal-loudness filter before
# framing.
audio = ess.MonoLoader(filename=filename)()
audio = ess.EqualLoudness()(audio)

for frame in ess.FrameGenerator(audio, frameSize=frameSize, hopSize=hopSize):
    # Window (and zero-pad) the frame; the rest of the loop body is not
    # visible here.
    frame = run_windowing(frame)
    def __init__(self,
                 bassline_filename,
                 drum_filename,
                 grid,
                 frame_size,
                 hop_size,
                 fft_size,
                 sample_rate,
                 xlim,
                 beats=[],
                 onsets=[],
                 bassline_onsets=[],
                 midi_tracks=[],
                 drum_analysisResults=[],
                 parent=None,
                 group_title="TRANSCRIBER",
                 prefix_text=[],
                 PYIN_midi=[],
                 YIN_times=[]):
        self._width = 12
        self._height = 7
        self._dpi = 100

        #
        self.drum_analysisResults = drum_analysisResults
        self.grid = grid

        #   Create QGroupBox and set the parent canvas (if any)
        QtWidgets.QGroupBox.__init__(self, group_title)
        self.setParent(parent)

        self.tabs_widget = QtWidgets.QTabWidget(self)

        if bassline_filename:
            self.bassline_widget = QtWidgets.QWidget(self)
            self.chroma_widget = QtWidgets.QWidget(self)
            self.tabs_widget.addTab(self.bassline_widget, "Spectrogram")
            self.tabs_widget.addTab(self.chroma_widget, "Chroma")
            self.saveFileName = os.path.join(
                os.path.dirname(bassline_filename), prefix_text + ".txt")

        if drum_filename:
            self.drum_spectrogram_widget = QtWidgets.QWidget(self)
            self.drum_widget = QtWidgets.QWidget(self)
            self.tabs_widget.addTab(self.drum_spectrogram_widget,
                                    "drums spectrogram")
            self.tabs_widget.addTab(
                self.drum_widget, "drums transcription (onsets in band bands)")
            self.saveFileName = os.path.join(os.path.dirname(drum_filename),
                                             prefix_text + ".txt")

        self.main_layout = QtWidgets.QGridLayout(self)
        self.main_layout.setAlignment(Qt.AlignCenter)
        self.resize(1200, 800)

        self.main_layout.addWidget(self.tabs_widget, 0, 0)
        self.main_layout.setColumnStretch(0, 1)
        self.main_layout.setRowStretch(0, 1)

        #   ------ --------- --------- Drum Spectrogram ------ STARTS HERE
        if drum_filename:
            options = {
                "filename": drum_filename,
                "fft_size": fft_size,
                "frame_size": frame_size,
                "hop_size": hop_size,
                "sample_rate": sample_rate,
                "xlim": xlim,
                "ylim": (20, 500),
                "width": self._width,
                "height": self._height,
                "dpi": self._dpi,
                "y_isHz": False,
                "playable": False,
                "saveFilename": self.saveFileName
            }

            self.DrumSpectrogramCanvas = SpectrogramCanvas(
                parent=self.drum_spectrogram_widget, **options)

            # Draw Beats
            for beat in beats:
                self.DrumSpectrogramCanvas.get_stft_ax().axvline(x=beat,
                                                                 ymin=0,
                                                                 ymax=1000,
                                                                 color='g')

            # Draw Onsets
            for onset in onsets:
                self.DrumSpectrogramCanvas.get_stft_ax().scatter(onset,
                                                                 50,
                                                                 c='red',
                                                                 marker='o')

            #   ------ --------- --------- Drum Spectrogram ------ ENDS HERE

            #   ------ --------- --------- Drum Transcription ------ STARTS HERE

            self.drum_onset_dots = []
            self.DrumCanvas = DrumCanvas(parent=self.drum_widget, **options)

            # connect save button
            self.key_pressed_cid = self.DrumCanvas.get_fig(
            ).canvas.mpl_connect('key_press_event', self.on_drum_key_press)

            if self.drum_analysisResults:
                self.draw_drum_results(self.DrumCanvas.get_fig(),
                                       self.DrumCanvas.get_ax())

            for grid_line in self.grid:
                self.DrumCanvas.get_ax().axvline(x=grid_line,
                                                 ymin=0,
                                                 ymax=1000,
                                                 color='b')

            for beat in beats:
                self.DrumCanvas.get_ax().axvline(x=beat,
                                                 ymin=0,
                                                 ymax=1000,
                                                 color='g')

            for onset in onsets:
                self.DrumCanvas.get_ax().scatter(onset,
                                                 0.5,
                                                 c='red',
                                                 marker='o')

        self.bassline_filename = None
        #   ------ --------- --------- Drum Transcription ------ ENDS HERE

        #   ------ --------- --------- Bassline Transcription ------ STARTS HERE
        if bassline_filename:
            options = {
                "filename": bassline_filename,
                "fft_size": fft_size,
                "frame_size": frame_size,
                "hop_size": hop_size,
                "sample_rate": sample_rate,
                "xlim": xlim,
                "ylim": (20, 500),
                "width": self._width,
                "height": self._height,
                "dpi": self._dpi,
                "y_isHz": False,
                "playable": False,
                "saveFilename": self.saveFileName
            }

            self.BasslineCanvas = SpectrogramCanvas(
                parent=self.bassline_widget, **options)
            self.chromaCanvas = ChromaCanvas(parent=self.chroma_widget,
                                             **options)

            # Load Audio for chroma calculations
            loader = es.MonoLoader(filename=bassline_filename,
                                   sampleRate=sample_rate)
            self.audio = loader()
            xvals = np.arange(len(self.audio)) / float(sample_rate)
            xlim = [0, max(xvals) + .25]
            self.chromaCanvas.get_ax().set_xlim(xlim)

            # Calculate Chromagram
            self.chromagram = []
            hpcp = es.HPCP(
                size=12,  # we will need higher resolution for Key estimation
                referenceFrequency=440,  # assume tuning frequency is 44100.
                bandPreset=False,
                weightType='cosine',
                nonLinear=False,
                windowSize=1.,
                sampleRate=sample_rate)

            spectrum = es.Spectrum(size=fft_size)
            spectral_peaks = es.SpectralPeaks(sampleRate=sample_rate)

            for frame in es.FrameGenerator(self.audio,
                                           frameSize=8192,
                                           hopSize=hop_size,
                                           startFromZero=True):

                frame = array(frame * get_window("hann", 8192))
                freqs, mags = spectral_peaks(spectrum(frame))
                chroma = hpcp(freqs, mags)
                self.chromagram.append(chroma)

            self.chromagram = array(self.chromagram)

            self.timeAxSec = np.arange(len(
                self.chromagram)) * hop_size / (sample_rate)

            # plot chromagram
            pitchClasses = [
                "A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"
            ]
            self.chromaCanvas.get_ax().cla()
            self.chromaCanvas.get_ax().set_xlim(xlim)
            self.chromaCanvas.get_ax().set_ylim(*(-1, 13))
            y_ax = np.arange(13)
            self.chromaCanvas.get_ax().set_yticks(y_ax[:12] + .5)
            self.chromaCanvas.get_ax().set_yticklabels(pitchClasses)
            self.chromaCanvas.get_ax().pcolormesh(self.timeAxSec, y_ax,
                                                  self.chromagram.T)
            self.chromaCanvas.get_ax().set_ylabel("Pitch Class")
            self.chromaCanvas.get_fig().canvas.draw()

            self.InteractiveCanvas = MidiCanvas(
                parent=self.bassline_widget,
                ax=self.BasslineCanvas.get_stft_ax(),
                fig=self.BasslineCanvas.get_stft_fig(),
                horizontal_snap_grid=grid,
                snapVerticallyFlag=True,
                snap_offset_flag=True,
                doubleClickColor="y",
                xlim=xlim,
                ylim=(20, 500),
                width=self._width,
                height=self._height,
                dpi=self._dpi,
                x_sensitivity=.02,
                y_sensitivity=5,
                standalone=False,
                y_isHz=False,
                midi_tracks=midi_tracks,
                filename=bassline_filename,
                ax_chroma=self.chromaCanvas.get_ax(),
                fig_chroma=self.chromaCanvas.get_fig(),
                saveFileName=self.saveFileName)

            # Draw Beats
            for beat in beats:
                self.BasslineCanvas.get_stft_ax().axvline(x=beat,
                                                          ymin=0,
                                                          ymax=1000,
                                                          color='g')
                self.chromaCanvas.get_ax().axvline(x=beat,
                                                   ymin=0,
                                                   ymax=1000,
                                                   color='g')

            # Draw Onsets
            for onset in bassline_onsets:
                self.BasslineCanvas.get_stft_ax().scatter(onset,
                                                          50,
                                                          c='red',
                                                          marker='o')

            if PYIN_midi != []:
                self.BasslineCanvas.get_stft_ax().plot(YIN_times, PYIN_midi)

            self.BasslineCanvas.get_stft_ax().set_title(
                "Green: Beats, Red: Onsets, Blue: Grid")
            self.chromaCanvas.get_ax().set_title("Green: Beats")

            # show canvases
            self.BasslineCanvas.get_stft_fig().canvas.show()
            self.chromaCanvas.get_fig().canvas.show()
        #   ------ --------- --------- Bassline Transcription ------ ENDS HERE

        self.show()