Exemple #1
0
def compute_essentia_descriptors(audio_segment, actual_bar_beg,
                                 actual_bar_end):
    """Compute the selected descriptors for one audio segment.

    The function relies on names defined outside of it: ``frameSize``,
    ``hopSize`` and the callables ``window``, ``spectrum``, ``spectralPeaks``,
    ``fft``, ``c2p``, ``od``, ``dissonance``, ``mfcc``, ``barkbands``,
    ``onsets`` and ``mean``.

    Args:
        audio_segment: samples of the segment to analyse.
        actual_bar_beg: start of the segment; only the difference
            ``actual_bar_end - actual_bar_beg`` is used, as the divisor of
            the onset count.
        actual_bar_end: end of the segment, in the same unit as
            ``actual_bar_beg``.

    Returns:
        A tuple ``(mfccs_bar, bark_vector, onset_rate, bar_dissonance)``:
        the per-frame MFCC coefficients, the 27 bark-band values averaged
        over all frames, the number of detected onsets divided by the
        segment length, and the mean per-frame dissonance.
    """
    frames = FrameGenerator(audio_segment,
                            frameSize=frameSize,
                            hopSize=hopSize)
    total_frames = frames.num_frames()
    mfccs_bar = []
    bark_vector = [0] * 27
    pool = essentia.Pool()

    for frame in frames:
        frame_windowed = window(frame)
        frame_spectrum = spectrum(frame_windowed)
        (frame_frequencies, frame_magnitudes) = spectralPeaks(frame_spectrum)

        # Onset detection needs magnitude and phase, hence the complex FFT.
        mag, phase = c2p(fft(frame_windowed))
        pool.add('onsets.hfc', od(mag, phase))

        pool.add('dissonance',
                 dissonance(frame_frequencies, frame_magnitudes))

        # Reuse the spectrum computed above instead of windowing and
        # transforming the same frame a second time.
        _, mfcc_coeffs = mfcc(frame_spectrum)
        mfccs_bar.append(mfcc_coeffs)

        # Accumulate the running mean of every bark band.
        frame_barkbands = barkbands(frame_spectrum)
        for i in range(27):
            bark_vector[i] += frame_barkbands[i] / total_frames

    onsets_hfc = onsets(essentia.array([pool['onsets.hfc']]), [1])
    onset_rate = float(len(onsets_hfc)) / (actual_bar_end - actual_bar_beg)
    bar_dissonance = mean(pool["dissonance"])

    return mfccs_bar, bark_vector, onset_rate, bar_dissonance
Exemple #2
0
def essFalsestereoDetector(x: list,
                           frameSize=1024,
                           hopSize=512,
                           correlationThreshold=0.98,
                           percentageThreshold=90,
                           channels=2,
                           **kwargs):
    """Flag a stereo signal whose two channels carry the same information.

    Every pair of channel frames is passed to ``FalseStereoDetector``; the
    share of frames it flags is compared against ``percentageThreshold``.

    Args:
        x: (list) input stereo signal
        frameSize: (int) frame size for the analysis in falseStereoDetector
        hopSize: (int) hop size for the analysis in falseStereoDetector
        correlationThreshold: (float) forwarded to ``FalseStereoDetector``
        percentageThreshold: (number) percentage (0-100) of flagged frames
            above which the whole signal is reported as false stereo
        channels: (int) number of channels of ``x``; with fewer than two
            the analysis is skipped
        **kwargs: extra parameters forwarded to ``FalseStereoDetector``

    Returns:
        A tuple ``(conf, is_false_stereo, skipped)``:
        conf: (float) fraction (0-1) of frames flagged by the detector
        is_false_stereo: (bool) True when ``conf`` exceeds
            ``percentageThreshold / 100``
        skipped: (bool) True only when ``channels < 2``, in which case the
            tuple is ``(1, False, True)``
    """
    if channels < 2:
        return 1, False, True

    rx, lx = StereoDemuxer()(x)
    mux = StereoMuxer()
    falseStereoDetector = FalseStereoDetector(
        correlationThreshold=correlationThreshold, **kwargs)

    lfg = FrameGenerator(lx,
                         frameSize=frameSize,
                         hopSize=hopSize,
                         startFromZero=True)
    rfg = FrameGenerator(rx,
                         frameSize=frameSize,
                         hopSize=hopSize,
                         startFromZero=True)

    # The first output of the detector is the per-frame flag; summing the
    # flags gives the number of problematic frames directly.
    problematicFrames = sum(
        falseStereoDetector(mux(frameL, frameR))[0]
        for frameL, frameR in zip(lfg, rfg))

    falseStereoDetector.reset()

    # problematicFrames is already a scalar count (the original summed it a
    # second time, which raised TypeError). Guard against a signal that
    # produced no frames at all.
    totalFrames = lfg.num_frames()
    conf = float(problematicFrames) / float(totalFrames) if totalFrames else 0.0

    return conf, conf > percentageThreshold / 100, False
    def __call__(self, audio, SR, sumThreshold=1e-5):
        """Estimate the frequency above which the signal carries no energy.

        For every sufficiently loud frame the highest spectral bin ``j`` is
        searched for which ``sum(spectrum[j:] / j)`` reaches
        ``sumThreshold``; the collected bins are then reduced by
        ``self.__computeMeanFc``.

        Args:
            audio: numpy array of samples; arrays with more than one
                dimension are averaged over axis 1 first.
            SR: sample rate used to convert the selected bin to Hz.
            sumThreshold: minimum value of the normalised tail sum for a
                bin to be accepted.

        Returns:
            A tuple ``(frequency, conf, binary)`` where ``frequency`` is
            ``self.mostLikelyBin * SR / self.frameSize`` and ``conf`` and
            ``binary`` are passed through from ``self.__computeMeanFc``.

        Side effects:
            Updates ``self.hist``, ``self.avgFrames`` and
            ``self.mostLikelyBin``.
        """
        self.__reset__()

        if audio.ndim > 1:
            # Average the channels: divide by the channel count, not by the
            # number of array dimensions (the two only coincide for stereo).
            audio = np.sum(audio, axis=1) / audio.shape[1]

        fcIndexArr = []
        self.hist = np.zeros(int(self.frameSize / 2 + 1))
        fft = FFT(size=self.frameSize)  # declare FFT function
        window = Windowing(size=self.frameSize,
                           type="hann")  # declare windowing function
        self.avgFrames = np.zeros(int(self.frameSize / 2) + 1)

        # First pass: energy of the loudest frame, used as reference below.
        maxNrg = max([
            sum(abs(fft(window(frame)))**2)
            for frame in FrameGenerator(audio,
                                        frameSize=self.frameSize,
                                        hopSize=self.hopSize,
                                        startFromZero=True)
        ])

        # Second pass: only frames within 10 dB of the loudest one vote.
        for i, frame in enumerate(
                FrameGenerator(audio,
                               frameSize=self.frameSize,
                               hopSize=self.hopSize,
                               startFromZero=True)):

            frame = window(frame)  # apply window to the frame
            frameFft = abs(fft(frame))
            nrg = sum(frameFft**2)

            if nrg >= 0.1 * maxNrg:
                # Walk down from the top bin until the tail is loud enough.
                # NOTE(review): when no higher bin qualifies the loop
                # reaches j == 0 and divides by zero - confirm intended.
                for j in reversed(range(len(frameFft))):
                    if sum(frameFft[j:] / j) >= sumThreshold:
                        fcIndexArr.append(j)
                        self.hist[j] += nrg
                        break
                self.avgFrames = self.avgFrames + frameFft

        if len(fcIndexArr) == 0:
            # No frame voted: fall back to the top of the spectrum.
            fcIndexArr.append(int(self.frameSize / 2) + 1)
            self.hist[int(self.frameSize / 2)] += 1

        self.avgFrames /= (i + 1)
        self.mostLikelyBin, conf, binary = self.__computeMeanFc(
            fcIndexArr, np.arange(int(self.frameSize / 2) + 2), hist=self.hist)

        return self.mostLikelyBin * SR / self.frameSize, conf, binary
Exemple #4
0
 def analyzer(samples):
     """Return the log-mel features of ``samples``, one row per frame.

     Frames of 256 samples with a hop of 160 are passed through the
     ``window``, ``spectrum`` and ``mel`` callables from the enclosing
     scope; 1e-16 is added before the logarithm to avoid ``log(0)``.
     """
     log_mels = [
         np.log(mel(spectrum(window(frame))) + 1e-16)
         for frame in FrameGenerator(samples, 256, 160)
     ]
     return np.array(log_mels)
def essNoiseburstDetector(x: list, frameSize=1024, hopSize=512, detectionThreshold=0.05, percentageThrehold=5, **kwargs):
    """Run ``NoiseBurstDetector`` over ``x`` and report the affected frames.

    A frame counts as corrupted when the detector returns more than
    ``int(detectionThreshold * frameSize)`` sample indexes for it.

    Args:
        x: (list) input signal
        frameSize: (int) frame size for the analysis in Noise Burst Detector
        hopSize: (int) hopSize for the analysis in Noise Burst Detector
        detectionThreshold: (float) fraction of ``frameSize`` that the number
            of detected samples must exceed for a frame to be counted
        percentageThrehold: (number) percentage of corrupted frames above
            which the whole signal is flagged

    Kwargs:
        Forwarded to ``NoiseBurstDetector``.

    Returns:
        idxs: (list) indexes of the corrupted samples, offset by the start of
            their frame (``hopSize * frame_index``)
        percentage: (float) percentage of corrupted frames, 2 decimals
        flag: (bool) True when ``percentage`` exceeds ``percentageThrehold``
    """
    noiseBurstDetector = NoiseBurstDetector(**kwargs)

    minCorruptSamples = int(detectionThreshold * frameSize)
    idxs = []
    count = 0
    total = 0
    for i, frame in enumerate(FrameGenerator(x, frameSize=frameSize, hopSize=hopSize, startFromZero=True)):
        # The detector returns indexes relative to the frame; shift them to
        # positions in the whole signal.
        corrupt_samples = hopSize * i + noiseBurstDetector(frame)

        if len(corrupt_samples) > minCorruptSamples:
            count += 1
            idxs.extend(corrupt_samples)
        total += 1

    # An input that yields no frame has nothing corrupted; avoid dividing
    # by zero.
    percentage = round(100 * count / total, 2) if total else 0.0
    return idxs, percentage, percentage > percentageThrehold
Exemple #6
0
def extract_mel_feats(audio_fp,
                      analyzers,
                      fs=44100.0,
                      nhop=512,
                      nffts=[1024, 2048, 4096],
                      log_scale=True):
    """Extract mel features of an audio file at several frame sizes.

    Args:
        audio_fp: path of the audio file, loaded with ``MonoLoader``.
        analyzers: iterable of ``(window, spectrum, mel)`` callables, paired
            one-to-one with ``nffts``.
        fs: sample rate passed to the loader.
        nhop: hop size shared by all frame sizes.
        nffts: frame sizes, one per analyzer triple.
        log_scale: when true, return ``log(features + 1e-16)``.

    Returns:
        Array with the frame-size index ("channel") moved to the last axis,
        i.e. the stacked features transposed with ``(1, 2, 0)``.
    """
    samples = MonoLoader(filename=audio_fp, sampleRate=fs)()

    per_resolution = []
    for nfft, (window, spectrum, mel) in zip(nffts, analyzers):
        per_resolution.append([
            mel(spectrum(window(frame)))
            for frame in FrameGenerator(samples, nfft, nhop)
        ])

    # Move the channel axis from position 0 to position 2.
    stacked = np.transpose(np.stack(per_resolution), (1, 2, 0))

    if not log_scale:
        return stacked

    # Numerically-stable log-scaling: the 1e-16 offset was chosen from the
    # histogram of raw values (more than 2 std dev left of the mean).
    return np.log(stacked + 1e-16)
Exemple #7
0
def file_to_hpcp(filename):
    """Return the mean HPCP vector of an audio file.

    The file is loaded as mono, cut into frames of 1024 samples (hop 512),
    and for each frame the spectral peaks of the windowed spectrum are fed
    to ``HPCP``. The per-frame HPCP vectors are averaged bin by bin.

    Args:
        filename: path of the audio file to analyse.

    Returns:
        numpy array holding the mean of every HPCP bin over all frames.
    """
    audio = MonoLoader(filename=filename)()
    windowing = Windowing(type='blackmanharris62')
    spectrum = Spectrum()
    spectral_peaks = SpectralPeaks(orderBy='magnitude',
                                   magnitudeThreshold=0.001,
                                   maxPeaks=20,
                                   minFrequency=20,
                                   maxFrequency=8000)
    # TODO: check whether normalized='unitSum' is the sensible choice here.
    hpcp = HPCP(maxFrequency=8000)

    # Only the HPCP vectors are needed; the spectra themselves are not kept
    # (the original collected them in a list that was never read).
    hpcp_group = []
    for frame in FrameGenerator(audio, frameSize=1024, hopSize=512):
        frequencies, magnitudes = spectral_peaks(spectrum(windowing(frame)))
        hpcp_group.append(hpcp(frequencies, magnitudes))

    mean_hpcp = np.mean(np.array(hpcp_group).T, axis=1)
    return mean_hpcp
Exemple #8
0
def hfc(filename):
    """Detect onsets in an audio file with the HFC detection function.

    Args:
        filename: path of the audio file, loaded as mono at 44100 Hz.

    Returns:
        The output of ``Onsets`` applied to the per-frame HFC values
        (frames of 1024 samples, hop 512) with weight 1.
    """
    audio = MonoLoader(filename=filename, sampleRate=44100)()

    # Build the algorithms once instead of once per frame.
    window = Windowing(type='hann')
    fft = FFT()
    c2p = CartesianToPolar()
    onset_detection = OnsetDetection(method='hfc')

    features = []
    for frame in FrameGenerator(audio, frameSize = 1024, hopSize = 512):
        mag, phase = c2p(fft(window(frame)))
        features.append(onset_detection(mag, phase))
    return Onsets()(array([features]),[1])
Exemple #9
0
def noveltycurve(filename):
    """Detect onsets in an audio file from its novelty curve.

    Args:
        filename: path of the audio file, loaded as mono at 44100 Hz.

    Returns:
        The output of ``Onsets`` applied to the ``NoveltyCurve`` of the
        per-frame ``FrequencyBands`` energies (frames of 1024 samples,
        hop 512) with weight 1.
    """
    audio = MonoLoader(filename=filename, sampleRate=44100)()

    # Build the algorithms once instead of once per frame.
    window = Windowing(type='hann')
    fft = FFT()
    c2p = CartesianToPolar()
    frequency_bands = FrequencyBands()

    band_energy = []
    for frame in FrameGenerator(audio, frameSize = 1024, hopSize = 512):
        # Only the magnitude is needed; the phase is discarded.
        mag, _ = c2p(fft(window(frame)))
        band_energy.append(frequency_bands(mag))
    novelty = NoveltyCurve()(band_energy)
    return Onsets()(np.array([novelty]),[1])
Exemple #10
0
def outofPhaseDetector(x: list,
                       frameSize=1024,
                       hopSize=512,
                       correlationThreshold=-0.8,
                       percentageThreshold=90,
                       channels=2,
                       **kwargs):
    """Flag a stereo signal whose channels are out of phase in most frames.

    The per-frame correlation reported by ``FalseStereoDetector`` is compared
    with ``correlationThreshold``; frames below it count as out of phase.

    Args:
        x: (list) input stereo signal
        frameSize: (int) frame size used to cut both channels
        hopSize: (int) hop size used to cut both channels
        correlationThreshold: (float) frames whose correlation is lower than
            this value are counted
        percentageThreshold: (number) percentage (0-100) of counted frames
            above which the signal is flagged
        channels: (int) number of channels of ``x``; with fewer than two the
            analysis is skipped and ``(1, False, True)`` is returned
        **kwargs: forwarded to ``FalseStereoDetector``

    Returns:
        conf: fraction of frames counted as out of phase
        flag: (bool) True when ``conf`` exceeds ``percentageThreshold / 100``
        skipped: (bool) False when the analysis ran, True when it was skipped
    """
    if channels < 2:
        return 1, False, True

    first_channel, second_channel = StereoDemuxer()(x)
    muxer = StereoMuxer()
    detector = FalseStereoDetector(**kwargs)

    cutter_options = dict(frameSize=frameSize,
                          hopSize=hopSize,
                          startFromZero=True)
    second_frames = FrameGenerator(second_channel, **cutter_options)
    first_frames = FrameGenerator(first_channel, **cutter_options)

    # The second output of the detector is the correlation of the frame.
    out_of_phase = sum(
        detector(muxer(frame_a, frame_b))[1] < correlationThreshold
        for frame_a, frame_b in zip(second_frames, first_frames))
    detector.reset()

    conf = out_of_phase / second_frames.num_frames()
    is_out_of_phase = conf > percentageThreshold / 100

    return conf, is_out_of_phase, False
Exemple #11
0
def shared_main(source, dest, display_result):
    """Load two recordings and compare them frame by frame.

    Args:
        source: first recording, handed to ``_loader``.
        dest: second recording, handed to ``_loader``.
        display_result: forwarded unchanged to ``compare``.

    Returns:
        The ``(min_cost, match_result)`` pair returned by ``compare``.
    """
    source_audio = _loader(source)
    destination_audio = _loader(dest)

    source_frame = FrameGenerator(source_audio, frameSize=2048, hopSize=512)
    destination_frame = FrameGenerator(destination_audio,
                                       frameSize=2048,
                                       hopSize=512)

    window = Windowing(type='hann')  # window function
    spectrum = Spectrum()  # spectrum function
    pitch_yin_fft = PitchYinFFT()  # pitch extractor
    loudness = Loudness()

    # The unused PitchSalience instance of the original was dropped; it was
    # never passed to compare().
    # draw_plot(source_frame, window, spectrum, pitch_yin_fft)
    min_cost, match_result = compare(source_frame, destination_frame, window,
                                     spectrum, pitch_yin_fft, 5, 1, 1,
                                     display_result, loudness)

    return min_cost, match_result
    def calculateDownbeats(self, audio, bpm, phase):
        """Pick the downbeats among ``self.beats`` (experimental).

        The complex-spectral-difference onset function of ``audio`` is
        normalised and half-wave rectified. For each of the four possible
        beat offsets the mean of that function at every fourth beat is
        computed; the offset with the highest mean is taken as the downbeat.
        A figure with the four candidate grids is shown as a side effect.

        Args:
            audio: samples to analyse.
            bpm: tempo used to space the candidate grids.
            phase: beat phase in seconds; converted to frames assuming
                44100 Hz and a hop of 512 samples.

        Returns:
            Every fourth element of ``self.beats`` starting at the selected
            offset, multiplied by 1.0.
        """
        # Step 0: calculate the CSD (Complex Spectral Difference) features
        # and the associated onset detection function.
        # NOTE: the low-pass filter announced in the original comment is
        # currently disabled (``audio = lowpass(audio)`` was commented out).
        w = Windowing(type='hann')
        fft = FFT()
        c2p = CartesianToPolar()
        od_csd = OnsetDetection(method='complex')

        pool = Pool()

        # TODO test faster (numpy) way
        for frame in FrameGenerator(audio,
                                    frameSize=self.FRAME_SIZE,
                                    hopSize=self.HOP_SIZE):
            mag, ph = c2p(fft(w(frame)))
            pool.add('onsets.complex', od_csd(mag, ph))

        # Step 1: normalise the data using an adaptive mean threshold
        novelty_mean = self.adaptive_mean(pool['onsets.complex'], 16.0)

        # Step 2: half-wave rectify the result
        novelty_hwr = (pool['onsets.complex'] - novelty_mean).clip(min=0)

        # Step 7 (experimental): Determine downbeat locations as subsequence
        # with highest complex spectral difference.
        # Both values do not depend on the candidate offset, so they are
        # computed once.
        phase_frames = (phase * 44100.0) / (512.0)
        frames_per_beat = self.numFramesPerBeat(bpm)
        for i in range(4):
            # Discard last value to prevent reading beyond array (last value
            # rounded up for example).
            frames = np.round(
                np.arange(phase_frames + i * frames_per_beat,
                          np.size(novelty_hwr),
                          4 * frames_per_beat)).astype('int')[:-1]
            pool.add('output.downbeat',
                     np.sum(novelty_hwr[frames]) / np.size(frames))

            plt.subplot(4, 1, i + 1)
            plt.plot(novelty_hwr)
            for f in frames:
                plt.axvline(x=f)
        # Function-call form prints the same text on Python 2 and is valid
        # syntax on Python 3.
        print(pool['output.downbeat'])
        downbeatIndex = np.argmax(pool['output.downbeat'])
        plt.show()

        # experimental
        return 1.0 * self.beats[downbeatIndex::4]
Exemple #13
0
def essClickDetector(x,
                     frameSize=1024,
                     hopSize=512,
                     percentageThrehold=1,
                     **kwargs):
    """Break x into frames and collect the clicks found by ClickDetector.

    Args:
        x: (list) input signal
        frameSize: (int) frame size for the analysis in Click Detector
        hopSize: (int) hopSize for the analysis in Click Detector
        percentageThrehold: (number) percentage of affected frames above
            which the whole signal is flagged

    Kwargs:
        same **kwargs for ClickDetector

    Returns:
        starts: start values reported by the detector, in frame order
        ends: end values reported by the detector, in frame order
        percentage: (float) percentage of frames with at least one start or
            end, rounded to 2 decimals
        flag: (bool) True when ``percentage`` exceeds ``percentageThrehold``
    """

    clickDetector = ClickDetector(frameSize=frameSize,
                                  hopSize=hopSize,
                                  **kwargs)

    ends = []
    starts = []
    count = 0
    total = 0

    for frame in FrameGenerator(x,
                                frameSize=frameSize,
                                hopSize=hopSize,
                                startFromZero=True):
        frame_starts, frame_ends = clickDetector(frame)

        starts.extend(frame_starts)
        ends.extend(frame_ends)

        if len(frame_starts) + len(frame_ends) != 0:
            count += 1
        total += 1

    # An input that yields no frame has no affected frame; avoid dividing
    # by zero.
    percentage = round(100 * count / total, 2) if total else 0.0
    return starts, ends, percentage, percentage > percentageThrehold
Exemple #14
0
def rms_centroids(filename, frameSize=1024, hopSize=512, sampleRate=44100):
    """Compute the per-frame RMS and centroid of the spectrum of a file.

    Args:
        filename: path of the audio file, loaded as mono.
        frameSize: number of samples per frame.
        hopSize: number of samples between consecutive frames.
        sampleRate: rate the audio is loaded at; also sets the range of the
            centroid to ``int(sampleRate / 2)``.

    Returns:
        A pair of numpy arrays ``(rmss, cs)`` with one value per frame; both
        are computed on the magnitude spectrum of the windowed frame.
    """
    # Load at the requested rate so that it matches the centroid range
    # (the original always loaded at 44100 and ignored the parameter).
    audio = MonoLoader(filename=filename, sampleRate=sampleRate)()

    # create the necessary algorithms
    w = Windowing()
    spec = Spectrum()
    rms = RMS()
    centroid = Centroid(range=int(sampleRate / 2))
    cs = []
    rmss = []
    # compute the centroid and RMS for all frames in our audio
    for frame in FrameGenerator(audio, frameSize=frameSize, hopSize=hopSize):
        sf = spec(w(frame))
        cs.append(centroid(sf))
        rmss.append(rms(sf))
    return np.array(rmss), np.array(cs)
Exemple #15
0
def hcdf(filename):
    """Print the windowed frame and the 12-bin chromagram of every frame.

    The file is loaded as mono and cut into frames of 32768 samples with a
    hop of 4096. Nothing is returned; the results are only printed.

    Args:
        filename: path of the audio file to analyse.
    """
    audio = MonoLoader(filename=filename)()
    windowing = Windowing(type='hann')

    # 12 bin tuned Chromagram, built once instead of once per frame.
    # A ConstantQ transform (binsPerOctave=36, minFrequency=110,
    # maxFrequency=3520, sampleRate=11025) was tried here before.
    # TODO: maxFrequency=3520 is not set yet (ask for the option to be added).
    chroma = Chromagram(numberBins=12,
                        binsPerOctave=36,
                        minFrequency=110,
                        windowType='hann')

    for frame in FrameGenerator(audio, frameSize=32768, hopSize=4096):
        windowed = windowing(frame)
        print('window', windowed)

        # NOTE: the chromagram receives the raw frame, not ``windowed``;
        # it is configured with its own windowType.
        pitch_class_vectors = chroma(frame)
        print('pitch_class_vectors', pitch_class_vectors)
Exemple #16
0
def ninos(filename,gamma=0.94):
    """Detect onsets with the NINOS spectral-sparsity detection function.

    reference: Mounir, M., Karsmakers, P., & Van Waterschoot, T. (2016). Guitar note onset detection based on a spectral sparsity measure.
    European Signal Processing Conference. https://doi.org/10.1109/EUSIPCO.2016.7760394

    Args:
        filename: path of the audio file, loaded as mono at 44100 Hz.
        gamma: fraction that sets how many magnitude bins per frame are
            kept: ``J = int(N * gamma / 2)`` with ``N = 2048``.

    Returns:
        The output of ``OnsetPeakPickingProcessor`` (threshold 0.03) applied
        to the detection function.
    """
    N = 2048
    hopSize = int(N/10)
    J = int(N*gamma/2)
    audio = MonoLoader(filename=filename, sampleRate=44100)()

    # Build the algorithms once instead of once per frame.
    window = Windowing(type='hann')
    fft = FFT()
    c2p = CartesianToPolar()

    mag = []
    for frame in FrameGenerator(audio, frameSize = N, hopSize = hopSize):
        m = np.asarray(c2p(fft(window(frame)))[0])
        # Keep the J largest magnitudes of the frame, in descending order.
        idx = np.argsort(m)[::-1][:J]
        mag.append(m[idx])
    mag = np.asarray(mag)

    # Per frame: squared l2 norm divided by the l4 norm, then normalised by
    # J ** 0.25.
    x2 = mag*mag
    inos = np.sum(x2,axis=1)/(np.sum(x2*x2,axis=1)**(0.25))
    # Named differently from the function to avoid shadowing it.
    ninos_odf = inos/(J**(0.25))
    return OnsetPeakPickingProcessor(threshold=0.03,fps=44100/hopSize)(ninos_odf)
    def run(self, audio):
        """Track the beats of ``audio`` and store the results on ``self``.

        Computes the melflux onset detection function from the FFT of the
        Hann-windowed frames, derives tempo and phase from it and lays out a
        regular beat grid.

        Sets ``self.fft_mag_1024_512``, ``self.fft_phase_1024_512``,
        ``self.bpm``, ``self.phase``, ``self.beats`` and
        ``self.onset_curve``. Returns nothing.
        """
        pool = Pool()
        hann = Windowing(type='hann')
        melflux = OnsetDetection(method='melflux')

        # Collect all windowed frames so the FFT can run on them in one call.
        for frame in FrameGenerator(audio,
                                    frameSize=self.FRAME_SIZE,
                                    hopSize=self.HOP_SIZE):
            pool.add('audio.windowed_frames', hann(frame))

        spectra = np.fft.fft(pool['audio.windowed_frames']).astype('complex64')
        magnitudes = np.absolute(spectra)
        phases = np.angle(spectra)
        self.fft_mag_1024_512 = magnitudes
        self.fft_phase_1024_512 = phases

        # Onset detection function, one value per frame.
        for frame_mag, frame_phase in zip(magnitudes, phases):
            pool.add('onsets.complex', melflux(frame_mag, frame_phase))

        # Given the ODF, calculate the tempo and the phase.
        tempo, _, phase, _ = BeatTracker.get_tempo_and_phase_from_odf(
            pool['onsets.complex'], self.HOP_SIZE)

        # Beat annotations: one beat every `seconds_per_beat`, from `phase`
        # up to one beat before the end of the audio.
        seconds_per_beat = 60. / tempo
        duration = np.size(audio) / self.SAMPLE_RATE
        beat_times = np.arange(phase, duration - seconds_per_beat + phase,
                               seconds_per_beat)

        # Store all the results.
        self.bpm = tempo
        self.phase = phase
        self.beats = beat_times.astype('single')
        self.onset_curve = BeatTracker.hwr(pool['onsets.complex'])
Exemple #18
0
    def f_essentia_extract(Audio):
        """Return the onsets detected in ``Audio``.

        Library methods are used to locate the onsets: the 'complex' onset
        detection function is computed on frames of 1024 samples (hop 512)
        and passed to ``Onsets`` with weight 1.
        """
        # Algorithms needed for the detection function, plus a pool to store
        # the per-frame results.
        hann = Windowing(type='hann')
        complex_fft = FFT()  # complex FFT
        to_polar = CartesianToPolar()  # complex -> (magnitude, phase)
        complex_odf = OnsetDetection(method='complex')
        pool = essentia.Pool()

        # Compute the onset detection function frame by frame.
        for frame in FrameGenerator(Audio, frameSize=1024, hopSize=512):
            magnitude, phase = to_polar(complex_fft(hann(frame)))
            pool.add('features.complex', complex_odf(magnitude, phase))

        # Onsets computed from the detection function.
        odf_matrix = essentia.array([pool['features.complex']])
        return Onsets()(odf_matrix, [1])
Exemple #19
0
def detectBW(audio: list,
             SR: float,
             frame_size=256,
             hop_size=128,
             floor_db=-90,
             oversample_f=1):
    """Estimate the effective bandwidth (cut-off frequency) of ``audio``.

    For every frame the dB spectrum is reduced to a linear spectral
    envelope, floored at ``floor_db`` and handed to ``compute_fc``; the
    cut-off bins that pass ``energy_verification`` are combined by
    ``compute_mean_fc``.

    Args:
        audio: input samples.
        SR: sample rate, used to convert the resulting bin to Hz.
        frame_size: frame size before applying ``oversample_f``.
        hop_size: hop between consecutive frames.
        floor_db: floor passed to ``modify_floor``.
        oversample_f: factor the frame size is multiplied by.

    Returns:
        A tuple ``(frequency, conf, binary)`` where ``frequency`` is
        ``fc_bin * SR / frame_size`` and the other two values come from
        ``compute_mean_fc``.
    """
    frame_size *= oversample_f  # if an oversample factor is desired, apply it

    fc_index_arr = []
    fft = FFT(size=frame_size)  # declare FFT function
    window = Windowing(size=frame_size,
                       type="hann")  # declare windowing function

    # Number of spectrum bins. This default is only used when the audio
    # yields no frame at all (the original raised NameError in that case);
    # otherwise it is overwritten with the real spectrum length.
    n_bins = frame_size // 2 + 1

    for frame in FrameGenerator(audio,
                                frameSize=frame_size,
                                hopSize=hop_size,
                                startFromZero=True):

        frame_fft = abs(fft(window(frame)))
        n_bins = len(frame_fft)
        frame_fft_db = 20 * np.log10(
            frame_fft + eps)  # calculate frame fft values in db
        # compute the linear interpolation between the values of the maxima
        # of the spectrum
        interp_frame = compute_spectral_envelope(frame_fft_db, "linear")
        interp_frame = modify_floor(interp_frame, floor_db, log=True)
        fc_index = compute_fc(interp_frame)

        if energy_verification(frame_fft, fc_index):
            fc_index_arr.append(fc_index)

    if len(fc_index_arr) == 0:
        # No frame produced a valid cut-off: fall back to frame_size.
        fc_index_arr = [frame_size]

    fc_bin, conf, binary = compute_mean_fc(fc_index_arr, np.arange(n_bins),
                                           SR)

    return fc_bin * SR / frame_size, conf, binary
def essSaturationDetector(x: list, frameSize=1024, hopSize=512, percentageThrehold=5, **kwargs):
    """Break x into frames and collect what SaturationDetector reports.

    Args:
        x: (list) input signal
        frameSize: (int) frame size for the analysis in Saturation Detector
        hopSize: (int) hopSize for the analysis in Saturation Detector
        percentageThrehold: (int) percentage of affected frames above which
            the whole signal is flagged

    Kwargs:
        Same **kwargs than the ones for SaturationDetector

    Returns:
        starts: start values reported by the detector, in frame order
        ends: end values reported by the detector, in frame order
        percentage: (float) percentage of frames with at least one start or
            end, rounded to 2 decimals
        flag: (bool) True when ``percentage`` exceeds ``percentageThrehold``
    """
    saturationDetector = SaturationDetector(frameSize=frameSize, hopSize=hopSize, **kwargs)

    ends = []
    starts = []
    count = 0
    total = 0
    for frame in FrameGenerator(x, frameSize=frameSize, hopSize=hopSize, startFromZero=True):
        frame_starts, frame_ends = saturationDetector(frame)

        starts.extend(frame_starts)
        ends.extend(frame_ends)
        if len(frame_starts) + len(frame_ends) != 0:
            count += 1
        total += 1

    # An input that yields no frame has no affected frame; avoid dividing
    # by zero.
    percentage = round(100 * count / total, 2) if total else 0.0

    return starts, ends, percentage, percentage > percentageThrehold
# Convert the beat positions to frame units.
beats = beats * frames_per_second
spec = Spectrum(size=FRAME_SIZE - FRAME_SIZE % 2)
w = Windowing(type='hann')
spectrum = Spectrum()  # FFT would return complex FFT, we only want magnitude
mfcc = MFCC()
pool = Pool()

# Step 0: align audio with phase

beats = beats - 0.5

start_sample = int((phase) * (44100.0 * 60 / bpm))

# Step 1: Calculate framewise MFCC
for frame in FrameGenerator(audio[start_sample:],
                            frameSize=FRAME_SIZE,
                            hopSize=HOP_SIZE):
    # Drop the last sample of odd-sized frames so the length is even.
    mfcc_bands, mfcc_coeffs = mfcc(
        spectrum(w(frame[:FRAME_SIZE - (FRAME_SIZE % 2)])))
    pool.add('lowlevel.mfcc', mfcc_coeffs)
    pool.add('lowlevel.mfcc_bands', mfcc_bands)

# Step 2: correlate
# Function-call form prints the same text on Python 2 and is valid syntax on
# Python 3.
print(np.shape(pool['lowlevel.mfcc']))
# Cosine similarity between the MFCC vectors of every pair of frames.
matrix = 1 - pairwise_distances(pool['lowlevel.mfcc'], metric='cosine')

# Clip the colour range to the 1st-99th percentile of the similarities.
plt.imshow(matrix,
           aspect='auto',
           interpolation='nearest',
           vmin=np.percentile(matrix, 1.0),
           vmax=np.percentile(matrix, 99.0))
Exemple #22
0
            int(progress), '%', '\t Time elapsed: ', current - start,
            ' seconds')

    ainp = MonoLoader(filename=f, sampleRate=params['fs'])()
    fHz = dict_fmap[f.split('/')[-1].split('_')[0]]
    ccoeff = params['fs'] / (2 * fHz)
    params_ceps['ceps_coeffs'] = (int)(ccoeff)

    # Extract relevant information(name,midi)
    name = f.split('/')[-1][:-4]
    midival = (int)(69 + 12 * np.log2(fHz / 440))

    # Select the relevant portion of audio to compute cc using essentia's silence detection function
    s_3 = StartStopSilence(threshold=-30)
    for frame in FrameGenerator(ainp,
                                frameSize=params['N'],
                                hopSize=params['H']):
        sss = s_3(frame)
    start_frame = (int)(sss[0] * params['H'])
    stop_frame = (int)(sss[1] * params['H'])
    ainp = ainp[start_frame:stop_frame]

    # # Condition to ensure that each has at least num_frames!
    # if(ainp.shape[0] < num_frames):
    # 	continue

    # Compute the cc's
    op = cc_calc(ainp, params, params_ceps)

    # Store cc'c + other relevant parameters in dict
    results[name] = {}
    def run(self, audio):
        """Estimate tempo, phase, beats and downbeats of ``audio``.

        The complex-spectral-difference onset function is normalised,
        half-wave rectified and autocorrelated. The tempo is the candidate
        BPM whose beat-spaced autocorrelation lags have the highest mean;
        the phase is the offset (1 ms resolution) whose beat grid has the
        highest mean on the rectified onset function.

        Sets ``self.bpm``, ``self.phase``, ``self.beats`` and
        ``self.downbeats``. Returns nothing.

        Args:
            audio: samples to analyse; the beat grid and the phase
                conversion assume 44100 Hz and a hop of 512 samples.
        """
        # TODO put this in some util class

        # Step 0: calculate the CSD (Complex Spectral Difference) features
        # and the associated onset detection function
        w = Windowing(type='hann')
        fft = FFT()
        c2p = CartesianToPolar()
        od_csd = OnsetDetection(method='complex')

        pool = Pool()

        # TODO test faster (numpy) way
        for frame in FrameGenerator(audio,
                                    frameSize=self.FRAME_SIZE,
                                    hopSize=self.HOP_SIZE):
            mag, phase = c2p(fft(w(frame)))
            pool.add('onsets.complex', od_csd(mag, phase))

        # Step 1: normalise the data using an adaptive mean threshold
        novelty_mean = self.adaptive_mean(pool['onsets.complex'], 16.0)

        # Step 2: half-wave rectify the result
        novelty_hwr = (pool['onsets.complex'] - novelty_mean).clip(min=0)

        # Step 3: then calculate the autocorrelation of this signal
        novelty_autocorr = self.autocorr(novelty_hwr)

        # Step 4: Sum over constant intervals to detect most likely BPM
        valid_bpms = np.arange(self.minBpm, self.maxBpm, self.stepBpm)
        for bpm in valid_bpms:
            # Discard last value to prevent reading beyond array (last value
            # rounded up for example)
            frames = np.round(
                np.arange(0, np.size(novelty_autocorr),
                          self.numFramesPerBeat(bpm))).astype('int')[:-1]
            pool.add('output.bpm',
                     np.sum(novelty_autocorr[frames]) / np.size(frames))
        bpm = valid_bpms[np.argmax(pool['output.bpm'])]

        # Step 5: Calculate phase information
        valid_phases = np.arange(0.0, 60.0 / bpm,
                                 0.001)  # Valid phases in SECONDS
        for phase in valid_phases:
            # Convert phase from seconds to frames
            phase_frames = (phase * 44100.0) / (512.0)
            # Discard last value to prevent reading beyond array (last value
            # rounded up for example)
            frames = np.round(
                np.arange(phase_frames, np.size(novelty_hwr),
                          self.numFramesPerBeat(bpm))).astype('int')[:-1]
            pool.add('output.phase',
                     np.sum(novelty_hwr[frames]) / np.size(frames))
        phase = valid_phases[np.argmax(pool['output.phase'])]
        # Single pre-formatted argument: prints the same text as the former
        # Python 2 print statement and is valid syntax on Python 3.
        print('PHASE %s' % phase)
        # Step 6: Determine the beat locations
        spb = 60. / bpm  #seconds per beat
        beats = (np.arange(phase, (np.size(audio) / 44100) - spb + phase,
                           spb).astype('single'))

        # Store all the results
        self.bpm = bpm
        self.phase = phase
        self.beats = beats

        self.downbeats = self.calculateDownbeats(audio, bpm, phase)
Exemple #24
0
 def __iter__(self) -> Iterable['AudioSequence']:
     """Yield one ``self.new(frame)`` per frame of ``self.audio``.

     Frames are cut with ``FrameGenerator`` using ``self.fs`` as frame size
     and ``self.hs`` as hop size.
     """
     frames = FrameGenerator(self.audio, frameSize=self.fs, hopSize=self.hs)
     return (self.new(frame) for frame in frames)
Exemple #25
0
    def run(self, audio):
        """Estimate tempo, phase and beat positions of ``audio``.

        The melflux onset function, computed from the FFT of the
        Hann-windowed frames, is normalised, half-wave rectified and
        autocorrelated. The tempo is the candidate BPM whose beat-spaced
        autocorrelation lags have the highest mean; the phase is the offset
        (1 ms resolution) whose beat grid has the highest mean on the
        rectified onset function.

        Sets ``self.bpm``, ``self.phase`` and ``self.beats``. Returns
        nothing.

        Args:
            audio: samples to analyse; the beat grid and the phase
                conversion assume 44100 Hz and a hop of 512 samples.
        """
        def numFramesPerBeat(bpm):
            """Return the (fractional) number of hops in one beat."""
            return (60.0 * self.SAMPLE_RATE) / (self.HOP_SIZE * bpm)

        def autocorr(x):
            """Return the non-negative-lag half of the autocorrelation."""
            result = np.correlate(x, x, mode='full')
            # Integer division: a float slice index raises TypeError on
            # Python 3 (and gives the same value on Python 2).
            return result[result.size // 2:]

        def adaptive_mean(x, N):
            """Return the moving sum over int(N) samples divided by N."""
            return np.convolve(x, [1.0] * int(N), mode='same') / N

        # Step 0: calculate the onset detection function
        w = Windowing(type='hann')
        fft = np.fft.fft
        od_csd = OnsetDetection(method='melflux')

        pool = Pool()

        for frame in FrameGenerator(audio,
                                    frameSize=self.FRAME_SIZE,
                                    hopSize=self.HOP_SIZE):
            pool.add('audio.windowed_frames', w(frame))

        # One FFT call over all the windowed frames.
        fft_result = fft(pool['audio.windowed_frames']).astype('complex64')
        fft_result_mag = np.absolute(fft_result)
        fft_result_ang = np.angle(fft_result)

        for mag, phase in zip(fft_result_mag, fft_result_ang):
            pool.add('onsets.complex', od_csd(mag, phase))

        # Step 1: normalise the data using an adaptive mean threshold
        novelty_mean = adaptive_mean(pool['onsets.complex'], 16.0)

        # Step 2: half-wave rectify the result
        novelty_hwr = (pool['onsets.complex'] - novelty_mean).clip(min=0)

        # Step 3: then calculate the autocorrelation of this signal
        novelty_autocorr = autocorr(novelty_hwr)

        # Step 4: Sum over constant intervals to detect most likely BPM
        valid_bpms = np.arange(self.minBpm, self.maxBpm, self.stepBpm)
        for bpm in valid_bpms:
            # Discard last value to prevent reading beyond array (last value
            # rounded up for example)
            frames = np.round(
                np.arange(0, np.size(novelty_autocorr),
                          numFramesPerBeat(bpm))).astype('int')[:-1]
            pool.add('output.bpm',
                     np.sum(novelty_autocorr[frames]) / np.size(frames))
        bpm = valid_bpms[np.argmax(pool['output.bpm'])]

        # Step 5: Calculate phase information
        valid_phases = np.arange(0.0, 60.0 / bpm,
                                 0.001)  # Valid phases in SECONDS
        for phase in valid_phases:
            # Convert phase from seconds to frames
            phase_frames = (phase * 44100.0) / (512.0)
            # Discard last value to prevent reading beyond array (last value
            # rounded up for example)
            frames = np.round(
                np.arange(phase_frames, np.size(novelty_hwr),
                          numFramesPerBeat(bpm))).astype('int')[:-1]
            pool.add('output.phase',
                     np.sum(novelty_hwr[frames]) / np.size(frames))
        phase = valid_phases[np.argmax(pool['output.phase'])]

        # Step 6: Determine the beat locations
        spb = 60. / bpm  #seconds per beat
        beats = (np.arange(phase, (np.size(audio) / 44100) - spb + phase,
                           spb).astype('single'))

        # Store all the results
        self.bpm = bpm
        self.phase = phase
        self.beats = beats
Exemple #26
0
 def to_frames(self, audio: np.array) -> list:
     """Cut ``audio`` into frames and return them as a list.

     ``self.fs`` is used as frame size and ``self.hs`` as hop size.
     """
     frames = FrameGenerator(audio, frameSize=self.fs, hopSize=self.hs)
     return [frame for frame in frames]
# Load the song given on the command line together with its audio.
s1 = Song(sys.argv[1])

s1.open()
s1.openAudio()
audio = s1.audio
# One frame spans half a beat at the song tempo (44100 Hz assumed).
FRAME_SIZE = int(44100 * (60.0 / s1.tempo) / 2)
# Integer division keeps the hop size an int on Python 3 as well
# (plain ``/`` yields a float there).
HOP_SIZE = FRAME_SIZE // 2


def adaptive_mean(x, N):
    """Return the moving average of ``x`` over a window of ``N`` samples.

    ``x`` is convolved with a box of ``int(N)`` ones in ``'same'`` mode and
    the result is divided by ``N``.
    """
    box_kernel = [1.0] * int(N)
    windowed_sum = np.convolve(x, box_kernel, mode='same')
    return windowed_sum / N


# Collect one energy value per frame of the song.
pool = Pool()
for frame in FrameGenerator(audio, frameSize=FRAME_SIZE, hopSize=HOP_SIZE):
    # NOTE: this stores the mean of the squared samples (no square root),
    # even though the pool key is named 'rms'.
    pool.add('lowlevel.rms', np.average(frame**2))

# Smooth the per-frame energy with a 64-frame moving average.
adaptive_mean_rms = adaptive_mean(
    pool['lowlevel.rms'],
    64)  # Mean of rms in window of [-4 dbeats, + 4 dbeats]
mean_rms = np.mean(adaptive_mean_rms)
# Smooth the song's onset curve. The window is four times
# int((44100 * 60 / tempo) / 512) frames; presumably 512 is the hop size the
# onset curve was computed with -- confirm against Song.
adaptive_mean_odf = adaptive_mean(s1.onset_curve,
                                  int((44100 * 60 / s1.tempo) / 512) *
                                  4)  # -4 dbeats, +4 dbeats
# Second smoothing pass (8-sample window) over the already smoothed curve.
adaptive_mean_odf_2 = adaptive_mean(adaptive_mean_odf, 8)
mean_odf = np.mean(adaptive_mean_odf)

# Plot the smoothed energy, scaled to a maximum of 1, against a normalised
# 0..1 time axis, in red.
plt.plot(np.linspace(0.0, 1.0, adaptive_mean_rms.size),
         adaptive_mean_rms / max(adaptive_mean_rms),
         c='r')
Exemple #28
0
    def run(self):
        """Analyse the file named by ``sys.argv[1]`` chunk by chunk and send beat cues over UDP.

        For every ``CHUNK``-sized frame the method:

        1. publishes the frame in the module global ``frame_g`` (presumably
           read by the worker threads -- confirm against ``OnsetThread`` /
           ``BeatThread``),
        2. stops if ``self.stoprequest`` is set, otherwise blocks on
           ``self.play_started.get()``,
        3. triggers both workers and waits for their result pools,
        4. on every ``N_BEATS``-th tick sends two datagrams to each
           ``(UDP_IP, UDP_PORT[i])`` target, formatted as
           ``"<timestamp>,<frame energy>,<onset rate>"``: one
           ``look_ahead_n / 2`` beats and one ``look_ahead_n`` beats after
           the tick,
        5. puts ``True`` on ``self.extract_done``.

        The worker threads are stopped and the sockets closed when the loop
        ends, including when it ends with an exception.
        """
        audio = essentia.standard.MonoLoader(filename=sys.argv[1])()

        N_BEATS = 4  # only every N_BEATS-th tick triggers a message
        look_ahead_n = 16  # how many beats ahead the cue is scheduled
        beat_count = 0
        global frame_g

        # One UDP socket per entry of UDP_PORT that is addressed below.
        socks = [
            socket.socket(socket.AF_INET, socket.SOCK_DGRAM),
            socket.socket(socket.AF_INET, socket.SOCK_DGRAM),
            socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        ]

        frame_q_o = Queue.Queue()
        result_q_o = Queue.Queue()
        frame_q_b = Queue.Queue()
        result_q_b = Queue.Queue()
        ot = OnsetThread(frame_q_o, result_q_o)
        bt = BeatThread(frame_q_b, result_q_b)
        ot.daemon = True
        bt.daemon = True
        ot.start()
        bt.start()

        # The algorithm object is reusable; build it once, not per frame.
        energy = Energy()

        try:
            for frame in FrameGenerator(audio,
                                        frameSize=CHUNK,
                                        hopSize=CHUNK,
                                        startFromZero=True):
                frame_g = frame
                if self.stoprequest.isSet():
                    break
                self.play_started.get()

                start_time = time.time()

                # Wake both workers, then wait for both results.
                frame_q_b.put(True)
                frame_q_o.put(True)

                frame_energy = energy(frame)
                onset_pool = result_q_o.get()
                beat_pool = result_q_b.get()

                bpm = beat_pool['Rhythm.bpm']
                spb = 60.0 / bpm if bpm > 0 else 0.0  # seconds per beat
                beats = beat_pool['Rhythm.ticks']
                onset = onset_pool['Rhythm.onsetRate']

                # Shared tail of both messages sent for a tick.
                payload_tail = "," + str(frame_energy) + "," + str(onset)

                for b in beats:
                    beat_count += 1
                    if beat_count % N_BEATS != 0:
                        continue

                    half_beat = start_time + b + spb * (look_ahead_n // 2)
                    next_beat = start_time + b + spb * look_ahead_n

                    for port_idx, sock in enumerate(socks):
                        target = (UDP_IP, UDP_PORT[port_idx])
                        sock.sendto(str(half_beat) + payload_tail, target)
                        sock.sendto(str(next_beat) + payload_tail, target)

                self.extract_done.put(True)
        finally:
            ot.stop()
            bt.stop()
            for sock in socks:
                sock.close()
def feature_allframes(audio, beats, frame_indexer=None):
    """Compute onset-curve features for the beats selected by ``frame_indexer``.

    A spectral-flux onset detection curve is computed over ``audio`` from
    Hann-windowed frames (size 1024, hop 512). It is normalised by
    subtracting a 16-frame moving average, half-wave rectified and divided
    by its mean. Beat positions are mapped onto the curve with
    ``beat * 44100 / 512``, i.e. ``beats`` is treated as times in seconds of
    44.1 kHz audio.

    Every selected beat ``i`` gets 21 features:

    * column 0: peak cross-correlation of the curve over beat ``i - 1`` with
      the curve over beat ``i``
    * columns 1-3: peak cross-correlation of beat ``i`` with beats
      ``i + 1``, ``i + 2`` and ``i + 3``
    * column 4: integral of the first half of beat ``i`` minus the integral
      of the second half of beat ``i - 1``
    * column 5: integral of beat ``i + 1`` minus integral of beat ``i - 1``
    * columns 6-20: integral of each of the next 15 half-beats minus the
      integral of the first half of beat ``i``

    Args:
        audio: audio samples passed to ``FrameGenerator``.
        beats: sequence of beat positions (see above for the unit assumed).
        frame_indexer: indices into ``beats`` to compute features for. Each
            index ``i`` must satisfy ``1 <= i <= len(beats) - 9`` so that all
            neighbouring beats it reads exist. Defaults to
            ``range(4, len(beats) - 8)``.

    Returns:
        ``preprocessing.scale`` applied to the
        ``(len(frame_indexer), 21)`` feature matrix.
    """
    # Initialise the algorithms
    FRAME_SIZE = 1024
    HOP_SIZE = 512
    w = Windowing(type='hann')
    od_flux = OnsetDetection(method='flux')

    pool = Pool()

    # Calculate the onset detection curve on the audio
    for frame in FrameGenerator(audio, frameSize=FRAME_SIZE, hopSize=HOP_SIZE):
        pool.add('windowed_frames', w(frame))

    fft_result = np.fft.fft(pool['windowed_frames']).astype('complex64')
    fft_result_mag = np.absolute(fft_result)
    fft_result_ang = np.angle(fft_result)

    for mag, phase in zip(fft_result_mag, fft_result_ang):
        pool.add('onsets.flux', od_flux(mag, phase))

    # Normalise and half-wave rectify the onset detection curve
    def adaptive_mean(x, N):
        return np.convolve(x, [1.0] * int(N), mode='same') / N

    novelty_mean = adaptive_mean(pool['onsets.flux'], 16.0)
    novelty_hwr = (pool['onsets.flux'] - novelty_mean).clip(min=0)
    novelty_hwr = novelty_hwr / np.average(novelty_hwr)

    if frame_indexer is None:
        # Beat i reads frame_i[i - 1] and the half-beat integrals up to beat
        # i + 7, which themselves need frame_i[i + 8]. Stopping at
        # len(beats) - 9 keeps every lookup inside the arrays; the previous
        # upper bound of len(beats) - 2 always ran past the end.
        frame_indexer = list(range(4, len(beats) - 8))

    # Two integrals per beat: [2*i] is the first half, [2*i + 1] the second.
    onset_integrals = np.zeros(2 * len(beats))
    frame_i = (np.array(beats) * 44100.0 / HOP_SIZE).astype('int')
    onset_correlations = np.zeros((len(beats), 21))

    # Step 1: integrate the curve over both halves of every beat that some
    # selected beat will read: the beat before it, the beat itself and the
    # seven beats after it.
    selected = set(frame_indexer)
    for i in range(len(beats)):
        if not any((i - offset) in selected for offset in range(-1, 8)):
            continue
        half_i = int((frame_i[i] + frame_i[i + 1]) / 2)
        onset_integrals[2 * i] = np.sum(novelty_hwr[frame_i[i]:half_i])
        onset_integrals[2 * i + 1] = np.sum(novelty_hwr[half_i:frame_i[i + 1]])

    def beat_curve(k):
        """Slice of the onset curve covering beat k."""
        return novelty_hwr[frame_i[k]:frame_i[k + 1]]

    # Step 2: build the feature row of every selected beat
    for i in frame_indexer:
        current = beat_curve(i)

        # Peak cross-correlation with the previous and the next three beats
        onset_correlations[i][0] = max(
            np.correlate(beat_curve(i - 1), current, mode='valid'))
        for step in (1, 2, 3):
            onset_correlations[i][step] = max(
                np.correlate(current, beat_curve(i + step), mode='valid'))

        # Differences in integrals of the novelty curve between (half-)beats.
        # Quantifies the change in number and prominence of onsets.
        onset_correlations[i][4] = (onset_integrals[2 * i]
                                    - onset_integrals[2 * i - 1])
        onset_correlations[i][5] = (onset_integrals[2 * i + 2]
                                    + onset_integrals[2 * i + 3]
                                    - onset_integrals[2 * i - 1]
                                    - onset_integrals[2 * i - 2])
        for j in range(1, 16):
            onset_correlations[i][5 + j] = (onset_integrals[2 * i + j]
                                            - onset_integrals[2 * i])

    # Keep only the rows of the selected beats and standardise the columns
    result = onset_correlations[frame_indexer]
    return preprocessing.scale(result)