class IPUsAudio( object ): """ @author: Brigitte Bigi @organization: Laboratoire Parole et Langage, Aix-en-Provence, France @contact: [email protected] @license: GPL, v3 @copyright: Copyright (C) 2011-2016 Brigitte Bigi @summary: An IPUs segmenter from audio. IPUs - Inter-Pausal Units are blocks of speech bounded by silent pauses of more than X ms, and time-aligned on the speech signal. """ MIN_SIL_DUR = 0.08 MIN_IPU_DUR = 0.08 def __init__(self, channel): """ Creates a new IPUsAudio instance. """ super(IPUsAudio, self).__init__() self.reset() self.set_channel(channel) # ------------------------------------------------------------------ def reset(self): """ Set default values. """ self.min_sil_dur = 0.250 self.min_ipu_dur = 0.300 self.vol_threshold = 0 self.shift_start = 0.010 self.shift_end = 0.020 self.win_lenght = 0.020 self.auto_vol = True self.bornestart = False self.borneend = False # ------------------------------------------------------------------ # Manage Channel # ------------------------------------------------------------------ def get_channel(self): """ Return the channel. """ return self.chansil.get_channel() # ------------------------------------------------------------------ def set_channel(self, channel): """ Set a new Channel. """ if channel is not None: self.chansil = ChannelSilence( channel, self.win_lenght ) else: self.chansil = None # ------------------------------------------------------------------ def reset_silences(self): """ Reset the list of silences. """ if self.chansil is not None: self.chansil.reset_silences() # ------------------------------------------------------------------ def set_silences(self, silences): """ Fix the list of silences. """ if self.chansil is not None: self.chansil.set_silences( silences ) # ------------------------------------------------------------------ # Setters for members # ------------------------------------------------------------------ def set_vol_threshold(self, vol_threshold): """ Fix the default minimum volume value to find silences. @param vol_threshold (int) RMS value """ self.vol_threshold = int(vol_threshold) if vol_threshold == 0: self.auto_vol = True else: self.auto_vol = False # ------------------------------------------------------------------ def set_min_silence(self, min_sil_dur): """ Fix the default minimum duration of a silence. @param min_sil_dur (float) Duration in seconds. """ self.min_sil_dur = float(min_sil_dur) # ------------------------------------------------------------------ def set_min_speech(self, min_ipu_dur): """ Fix the default minimum duration of an IPU. @param min_ipu_dur (float) Duration in seconds. """ self.min_ipu_dur = float(min_ipu_dur) # ------------------------------------------------------------------ def set_vol_win_lenght(self, winlength): """ Fix the default windows length for RMS estimations. @param winlength (float) Duration in seconds. """ self.win_lenght = max(winlength, 0.005) # ------------------------------------------------------------------ def set_shift(self, s): """ Fix the default minimum boundary shift value. @param s (float) Duration in seconds. """ self.shift_start = float(s) self.shift_end = float(s) # ------------------------------------------------------------------ def set_shift_start(self, s): """ Fix the default minimum boundary shift value. @param s (float) Duration in seconds. """ self.shift_start = float(s) # ------------------------------------------------------------------ def set_shift_end(self,s): """ Fix the default minimum boundary shift value. @param s (float) Duration in seconds. """ self.shift_end = float(s) # ------------------------------------------------------------------ def min_channel_duration(self): """ Return the minimum duration we expect for a channel. """ d1 = self.min_sil_dur+self.shift_start+self.shift_end d2 = self.min_ipu_dur+self.shift_start+self.shift_end return max(d1,d2) # ------------------------------------------------------------------ def set_bound_start(self, sil=False): """ Fix if it is expected (or not) to find a silence at the beginning of the channel. """ self.bornestart = sil # ------------------------------------------------------------------ def set_bound_end(self, sil=False): """ Fix if it is expected (or not) to find a silence at the end of the channel. """ self.borneend = sil # ------------------------------------------------------------------ # Silence/Speech segmentation # ------------------------------------------------------------------ def extract_tracks(self, min_ipu_dur=None, shift_start=None, shift_end=None): """ Return a list of tuples (from_pos,to_pos) of tracks. The tracks are found from the current list of silences. @param min_ipu_dur (float) The minimum duration for a track (in seconds) @param shiftdurstart (float) The time to remove to the start boundary (in seconds) @param shiftdurend (float) The time to add to the end boundary (in seconds) @return (list of tuples) """ if self.chansil is None: return [] if min_ipu_dur is None: min_ipu_dur=self.min_ipu_dur if shift_start is None: shift_start=self.shift_start if shift_end is None: shift_end=self.shift_end return self.chansil.extract_tracks(min_ipu_dur, shift_start, shift_end) # ------------------------------------------------------------------ def search_tracks(self, volume): """ Return the tracks if volume is used as threshold. """ if self.chansil is None: return [] self.chansil.search_silences(volume, mintrackdur=IPUsAudio.MIN_IPU_DUR) self.chansil.filter_silences(self.min_sil_dur) return self.extract_tracks() # ------------------------------------------------------------------ def check_boundaries(self, tracks): """ Check if silences at start and end are as expected. @return bool """ if len(tracks) == 0: return False if self.chansil is None: return False if self.bornestart is False and self.borneend is False: # we do not know anything about silences at start and end # then, everything is ALWAYS OK! return True first_from_pos = tracks[0][0] last_to_pos = tracks[len(tracks)-1][1] # If I expected a silence at start... and I found a track if self.bornestart is True and first_from_pos==0: return False # If I expected a silence at end... and I found a track if self.borneend is True and last_to_pos==self.chansil.get_channel().get_nframes(): return False return True # ------------------------------------------------------------------ def split_into_vol(self, nbtracks): """ Try various volume values to estimate silences then get the expected number of tracks. @param nbtracks is the expected number of IPUs @return number of tracks """ if self.chansil is None: return 0 volstats = self.chansil.get_volstats() # Min volume in the speech vmin = volstats.min() # Max is set to the mean vmax = volstats.mean() # Step is necessary to not exaggerate a detailed search! # step is set to 5% of the volume between min and mean. step = int( (vmax - vmin) / 20.0 ) # Min and max are adjusted vmin += step vmax -= step # First Test !!! self.vol_threshold = vmin tracks = self.search_tracks(vmin) n = len(tracks) b = self.check_boundaries(tracks) while (n != nbtracks or b is False): # We would never be done anyway. if (vmax==vmin) or (vmax-vmin) < step: return n # Try with the middle volume value vmid = int(vmin + (vmax - vmin) / 2.0) if n > nbtracks: # We split too often. Need to consider less as silence. vmax = vmid elif n < nbtracks: # We split too seldom. Need to consider more as silence. vmin = vmid else: # We did not find start/end silence. vmin += step # Find silences with these parameters self.vol_threshold = int(vmid) tracks = self.search_tracks(vmid) n = len(tracks) b = self.check_boundaries(tracks) return n # ------------------------------------------------------------------ def split_into(self, nbtracks): """ Try various volume values, pause durations and silence duration to get silences. @param nbtracks is the expected number of IPUs. 0=auto. """ if self.chansil is None: raise Exception('No audio data.') if self.auto_vol is True: self.vol_threshold = self.chansil.search_threshold_vol() if nbtracks == 0: self.search_tracks( self.vol_threshold ) return 0 # Try with default parameters: tracks = self.search_tracks( self.vol_threshold ) n = len(tracks) b = self.check_boundaries(tracks) if n == nbtracks and b is True: return n # Try with default lengths (change only volume): n = self.split_into_vol( nbtracks ) if n > nbtracks: # We split too often. Try with larger' values. while n > nbtracks: self.min_sil_dur += self.win_lenght self.min_ipu_dur += self.win_lenght n = self.split_into_vol( nbtracks ) elif n < nbtracks: # We split too seldom. Try with shorter' values of silences p = self.min_sil_dur m = self.min_ipu_dur while n < nbtracks and self.min_sil_dur > IPUsAudio.MIN_SIL_DUR: self.min_sil_dur -= self.win_lenght n = self.split_into_vol( nbtracks ) # we failed... try with shorter' values of ipus if n < nbtracks: self.min_sil_dur = p while n < nbtracks and self.min_ipu_dur > IPUsAudio.MIN_IPU_DUR: self.min_ipu_dur -= self.win_lenght n = self.split_into_vol( nbtracks ) # we failed... try with shorter' values of both sil/ipus if n < nbtracks: self.min_ipu_dur = m while n < nbtracks and self.min_sil_dur > IPUsAudio.MIN_SIL_DUR and self.min_ipu_dur > IPUsAudio.MIN_IPU_DUR: self.min_ipu_dur -= self.win_lenght self.min_sil_dur -= self.win_lenght n = self.split_into_vol( nbtracks ) return n