import logging

import essentia.standard
import numpy as np
from essentia.standard import MonoLoader

logger = logging.getLogger(__name__)

# Module-level cache so the result of 'estimate_number_of_events' can be reused
# across calls to 'is_single_event'
_is_single_event_cache = None


def is_single_event(audiofile, max_duration=7):
    '''
    Estimate whether the audio signal contains one single event using the
    'estimate_number_of_events' function defined below. We store the result in a
    global variable so it can be reused across different calls of 'is_single_event'.
    '''
    global _is_single_event_cache
    if _is_single_event_cache is None:
        sample_rate = 44100
        try:
            audio_file = MonoLoader(filename=audiofile, sampleRate=sample_rate)
        except RuntimeError as e:
            if MORE_THAN_2_CHANNELS_EXCEPTION_MATCH_TEXT in str(e):
                # Files with more than 2 channels can't be loaded directly, so convert them first
                converted_audiofile = convert_to_wav(audiofile)
                audio_file = MonoLoader(filename=converted_audiofile, sampleRate=sample_rate)
            else:
                raise
        audio = audio_file.compute()
        if len(audio) / sample_rate > max_duration:
            # If the file is longer than max_duration, we don't consider it to be a single event
            _is_single_event_cache = False
        else:
            _is_single_event_cache = estimate_number_of_events(
                audiofile, audio, sample_rate=sample_rate) == 1
    return _is_single_event_cache
def estimate_number_of_events(audiofile, audio=None, sample_rate=44100,
                              region_energy_thr=2, silence_thr_scale=4, group_regions_ms=100):
    """
    Estimate the number of sound events in an audio signal by finding regions of activity
    in its energy envelope. This is more like "activity detection" than "onset detection".
    If 'audio' is not provided, the file is loaded from 'audiofile'.
    """
    logger.debug('{0}: estimating number of sound events'.format(audiofile))

    def group_regions(regions, group_regions_ms):
        """
        Group together regions which are very close in time (i.e. the end of a region is
        very close to the start of the following one).
        """
        if len(regions) <= 1:
            # Don't do anything if there is only one region or no regions at all
            grouped_regions = regions[:]
        else:
            # Iterate over regions and mark which ones should be grouped with the following region
            to_group = []
            for (at0, at1, a_energy), (bt0, bt1, b_energy) in zip(regions[:-1], regions[1:]):
                if bt0 - at1 < group_regions_ms / 1000:
                    to_group.append(1)
                else:
                    to_group.append(0)
            to_group.append(0)  # The last region is never grouped with a next one (there is no "next region")

            # Now generate the grouped list of regions based on the marks in 'to_group'
            grouped_regions = []
            i = 0
            while i < len(to_group):
                x = to_group[i]
                if x == 1:
                    # Start a group at the current region and extend it while regions are marked for grouping
                    current_group_start = i
                    while x == 1:
                        i += 1
                        x = to_group[i]
                    current_group_end = i
                    grouped_regions.append((
                        regions[current_group_start][0],
                        regions[current_group_end][1],
                        sum([z for x, y, z in regions[current_group_start:current_group_end + 1]])
                    ))
                else:
                    grouped_regions.append(regions[i])
                i += 1
        return grouped_regions

    # Load the audio file if the caller did not already provide the decoded samples
    if audio is None:
        audio_file = MonoLoader(filename=audiofile, sampleRate=sample_rate)
        audio = audio_file.compute()
    t = np.linspace(0, len(audio) / sample_rate, num=len(audio))

    # Compute envelope and average signal energy
    env_algo = essentia.standard.Envelope(
        attackTime=15,
        releaseTime=50,
    )
    envelope = env_algo(audio)
    average_signal_energy = np.sum(np.array(envelope) ** 2) / len(envelope)
    silence_thr = average_signal_energy * silence_thr_scale

    # Get energy regions above the silence threshold
    # Implementation based on https://stackoverflow.com/questions/43258896/extract-subarrays-of-numpy-array-whose-values-are-above-a-threshold
    mask = np.concatenate(([False], envelope > silence_thr, [False]))
    idx = np.flatnonzero(mask[1:] != mask[:-1])
    idx -= 1  # Avoid index out of bounds (0-index)
    regions = [(t[idx[i]], t[idx[i + 1]], np.sum(envelope[idx[i]:idx[i + 1]] ** 2))
               for i in range(0, len(idx), 2)]  # 'regions' is a list of tuples like (start_time, end_time, energy)
    regions = [region for region in regions if region[2] > region_energy_thr]  # Discard regions below region_energy_thr

    # Group detected regions that happen close together in time
    regions = group_regions(regions, group_regions_ms)

    return len(regions)  # Return the number of sound events detected
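
# The block below is only a minimal usage sketch, not part of the module's logic. It assumes the
# module is run directly and uses a hypothetical file path ('example.wav') purely to illustrate
# how 'estimate_number_of_events' and 'is_single_event' are meant to be called.
if __name__ == '__main__':
    import sys

    # Take the audio file path from the command line, or fall back to the hypothetical example path
    path = sys.argv[1] if len(sys.argv) > 1 else 'example.wav'
    n_events = estimate_number_of_events(path)
    print('{0}: detected {1} sound event(s), single event: {2}'.format(
        path, n_events, is_single_event(path)))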