Example #1
0
    def extract(self, src_file, tgt_file):
        '''
        Feature extraction with uncertainty propagation.

        Reads the audio in `src_file`, sums channels (trivial frontal
        beamformer), enhances the STFT with the configured MMSE
        estimator, extracts MFCC + CMS + delta/acceleration features
        (optionally appending propagated variances) and writes them to
        `tgt_file` in HTK format.

        Raises ValueError for an unsupported mmse_method or targetformat.
        '''

        # Read this audio file, resampling from in_fs to work_fs
        y_t = ia.read(src_file,
                      in_fs=self.config['in_fs'],
                      out_fs=self.config['work_fs'])[0]

        #
        # BEAMFORMING
        #

        # Beamformer pointing at the frontal direction by default
        # (plain channel sum); mono 1-D signals pass through unchanged
        if len(y_t.shape) > 1 and y_t.shape[1] > 1:
            y_t = y_t.sum(1)

        #
        # SPEECH ENHANCEMENT
        #

        # Pre-emphasis, STFT
        y_t = sip.preemphasis(y_t, coef=self.config['preemcoef'])
        Y = sip.stft(y_t, self.config['windowsize'], self.config['shift'],
                     self.config['nfft'])
        # Update IMCRA noise estimate; returns the LSA point estimate
        hat_X_LSA = self.se.update(Y)

        if self.config['mmse_method'] in ('MFCC', 'Wiener'):

            # Get a priori SNR and noise variance
            xi, Lambda_D = self.se.get_param(['xi', 'Lambda_D'])
            # Wiener gain
            G = xi / (1 + xi)
            # Use the posterior associated to the Wiener filter
            hat_X = G * Y
            if self.config['mmse_method'] == 'MFCC':
                # Residual MSE of the Wiener estimate
                MSE = G * Lambda_D
            else:
                # Plain Wiener: no uncertainty propagated
                MSE = np.zeros(G.shape)

        elif self.config['mmse_method'] == 'LSA':

            # Use LSA point estimate, no uncertainty
            hat_X = hat_X_LSA
            MSE = np.zeros(hat_X.shape)

        else:
            # Fail early with a clear message instead of hitting a
            # NameError on hat_X/MSE below
            raise ValueError("MMSE_METHOD = %s Not supported" %
                             self.config['mmse_method'])

        #
        # FEATURE EXTRACTION / UNCERTAINTY PROPAGATION
        #

        # MFCC with uncertainty propagation
        mu_x, Sigma_x = self.mfcc.extract_up(hat_X, MSE)
        # CMS
        mu_x, Sigma_x = self.mfcc.cms_up(mu_x, Sigma_x)
        # Deltas, Accelerations
        mu_d, Sigma_d = fe.deltas_up(mu_x, Sigma_x)
        mu_a, Sigma_a = fe.deltas_up(mu_d, Sigma_d)
        mu_x = np.concatenate((mu_x, mu_d, mu_a))
        Sigma_x = np.concatenate((Sigma_x, Sigma_d, Sigma_a))

        # Append the variances when uncertainty propagation is requested
        if self.config['do_up']:
            x = np.concatenate((mu_x, Sigma_x))
        else:
            x = mu_x

        # Write features to the target file
        if self.config['targetformat'] == 'HTK':
            htk.writehtkfeats(tgt_file, x, self.config['fp'],
                              self.config['tc'])
        else:
            # Py2/Py3-compatible raise (was: raise ValueError, (...))
            raise ValueError("TARGETFORMAT = %s Not supported" %
                             self.config['targetformat'])
    def extract(self, src_file, tgt_file):
        '''
        Feature extraction with uncertainty propagation.

        Reads the audio in `src_file`, sums channels (trivial frontal
        beamformer), enhances the STFT with the configured MMSE
        estimator, extracts MFCC + CMS + delta/acceleration features
        (optionally appending propagated variances) and writes them to
        `tgt_file` in HTK format.

        Raises ValueError for an unsupported mmse_method or targetformat.
        '''

        # Read this audio file, resampling from in_fs to work_fs
        y_t = ia.read(src_file, in_fs=self.config['in_fs'],
                      out_fs=self.config['work_fs'])[0]

        #
        # BEAMFORMING
        #

        # Beamformer pointing at the frontal direction by default
        # (plain channel sum); mono 1-D signals pass through unchanged
        if len(y_t.shape) > 1 and y_t.shape[1] > 1:
            y_t = y_t.sum(1)

        #
        # SPEECH ENHANCEMENT
        #

        # Pre-emphasis, STFT
        y_t = sip.preemphasis(y_t, coef=self.config['preemcoef'])
        Y = sip.stft(y_t, self.config['windowsize'], self.config['shift'],
                     self.config['nfft'])
        # Update IMCRA noise estimate; returns the LSA point estimate
        hat_X_LSA = self.se.update(Y)

        if self.config['mmse_method'] in ('MFCC', 'Wiener'):

            # Get a priori SNR and noise variance
            xi, Lambda_D = self.se.get_param(['xi', 'Lambda_D'])
            # Wiener gain
            G = xi / (1 + xi)
            # Use the posterior associated to the Wiener filter
            hat_X = G * Y
            if self.config['mmse_method'] == 'MFCC':
                # Residual MSE of the Wiener estimate
                MSE = G * Lambda_D
            else:
                # Plain Wiener: no uncertainty propagated
                MSE = np.zeros(G.shape)

        elif self.config['mmse_method'] == 'LSA':

            # Use LSA point estimate, no uncertainty
            hat_X = hat_X_LSA
            MSE = np.zeros(hat_X.shape)

        else:
            # Fail early with a clear message instead of hitting a
            # NameError on hat_X/MSE below
            raise ValueError("MMSE_METHOD = %s Not supported" %
                             self.config['mmse_method'])

        #
        # FEATURE EXTRACTION / UNCERTAINTY PROPAGATION
        #

        # MFCC with uncertainty propagation
        mu_x, Sigma_x = self.mfcc.extract_up(hat_X, MSE)
        # CMS
        mu_x, Sigma_x = self.mfcc.cms_up(mu_x, Sigma_x)
        # Deltas, Accelerations
        mu_d, Sigma_d = fe.deltas_up(mu_x, Sigma_x)
        mu_a, Sigma_a = fe.deltas_up(mu_d, Sigma_d)
        mu_x = np.concatenate((mu_x, mu_d, mu_a))
        Sigma_x = np.concatenate((Sigma_x, Sigma_d, Sigma_a))

        # Append the variances when uncertainty propagation is requested
        if self.config['do_up']:
            x = np.concatenate((mu_x, Sigma_x))
        else:
            x = mu_x

        # Write features to the target file
        if self.config['targetformat'] == 'HTK':
            htk.writehtkfeats(tgt_file, x, self.config['fp'],
                              self.config['tc'])
        else:
            # Py2/Py3-compatible raise (was: raise ValueError, (...))
            raise ValueError("TARGETFORMAT = %s Not supported"
                             % self.config['targetformat'])
Example #3
0
    def extract(self, src_file, tgt_file):
        '''
        Feature extraction driven by external VAD information.

        Segments `src_file` into speech events — from an STM-based VAD
        when 'stm_vad' is configured, otherwise a single event starting
        after 'init_time' — and for each event enhances the STFT with a
        Wiener filter, extracts MFCC + CMS + delta/acceleration features
        and writes them to `tgt_file` in HTK format.

        Raises EnvironmentError if the STM has no transcription for
        `src_file`.
        '''
        # Get indices for the position of speech and background based on
        # external info. If MLF or STM provided for VAD use them.
        if 'stm_vad' in self.config:

            # VAD SPECIFIED BY A STM
            if src_file not in self.config['stm_trans']:
                # Py2/Py3-compatible raise (was: raise EnvironmentError, ...)
                raise EnvironmentError("stm file %s has not transcription "
                                       "for %s" %
                                       (self.config['stm_vad'], src_file))
            # Collect speech events and preceding backgrounds
            events = []
            backgs = []
            for tr in self.config['stm_trans'][src_file]:
                # Preceding background; None when the event starts at
                # the very beginning of the file
                if not tr[2]:
                    backgs.append(None)
                else:
                    backgs.append((0, tr[2] * self.config['in_fs']))
                # Speech event as (file, start, end) in samples
                # NOTE(review): boundaries are scaled by in_fs but index
                # the work_fs-resampled signal below — confirm
                # in_fs == work_fs or rescale accordingly
                events.append((src_file, tr[2] * self.config['in_fs'],
                               tr[3] * self.config['in_fs']))

        else:

            # ONE SINGLE EVENT IN PRESENT MICROPHONE
            T = int(self.config['work_fs'] * self.config['init_time'])
            events = [(src_file, T, -1)]
            backgs = [(0, T)]

        # Loop over events in the scene
        for backg, event in zip(backgs, events):

            # Read this audio file, resampling from in_fs to work_fs
            y_t = ia.read(src_file,
                          in_fs=self.config['in_fs'],
                          out_fs=self.config['work_fs'])[0]

            #
            # BEAMFORMING
            #

            # Beamformer pointing at the frontal direction by default
            # (plain channel sum); guard added so mono 1-D signals do
            # not raise IndexError, consistent with the other variants
            if len(y_t.shape) > 1 and y_t.shape[1] > 1:
                y_t = y_t.sum(1)

            #
            # SPEECH ENHANCEMENT
            #

            # Select segment of background preceding speech
            # NOTE(review): d_t is currently unused — presumably meant to
            # initialize the noise estimator; confirm intent
            if backg:
                d_t = y_t[backg[0]:backg[1]]
            else:
                d_t = None

            # Select segment of speech
            # NOTE(review): end = -1 (single-event case) drops the last
            # sample; use None if the full tail is intended
            y_t = y_t[event[1]:event[2]]

            # Pre-emphasis, STFT
            y_t = sip.preemphasis(y_t, coef=self.config['preemcoef'])
            Y = sip.stft(y_t, self.config['windowsize'], self.config['shift'],
                         self.config['nfft'])
            # Update IMCRA noise estimate
            self.se.update(Y)

            # Get a priori SNR up to the last processed frame self.se.l
            xi = self.se.store['xi'][:, :self.se.l]
            # Wiener gain
            G = xi / (1 + xi)
            # Get Wiener estimate and residual MSE
            hat_X_W = G * Y
            MSE = G * self.se.store['Lambda_D'][:, :self.se.l]

            # (removed leftover set_trace() debugger breakpoint)

            # MFCC with uncertainty propagation
            mu_x, Sigma_x = self.mfcc.extract_up(hat_X_W, MSE)
            # CMS
            mu_x, Sigma_x = self.mfcc.cms_up(mu_x, Sigma_x)
            # Deltas, Accelerations
            mu_d, Sigma_d = fe.deltas_up(mu_x, Sigma_x)
            mu_a, Sigma_a = fe.deltas_up(mu_d, Sigma_d)
            mu_x = np.concatenate((mu_x, mu_d, mu_a))
            Sigma_x = np.concatenate((Sigma_x, Sigma_d, Sigma_a))

            # Append variances when uncertainty propagation is requested
            if self.config['unc_prop']:
                x = np.concatenate((mu_x, Sigma_x))
            else:
                x = mu_x

            # Write features in HTK format
            # (fixed: original referenced undefined name `target_file`)
            htk.writehtkfeats(tgt_file, x, self.config['fp'],
                              self.config['tc'])
    def extract(self, src_file, tgt_file):
        '''
        Feature extraction driven by external VAD information.

        Segments `src_file` into speech events — from an STM-based VAD
        when 'stm_vad' is configured, otherwise a single event starting
        after 'init_time' — and for each event enhances the STFT with a
        Wiener filter, extracts MFCC + CMS + delta/acceleration features
        and writes them to `tgt_file` in HTK format.

        Raises EnvironmentError if the STM has no transcription for
        `src_file`.
        '''
        # Get indices for the position of speech and background based on
        # external info. If MLF or STM provided for VAD use them.
        if 'stm_vad' in self.config:

            # VAD SPECIFIED BY A STM
            if src_file not in self.config['stm_trans']:
                # Py2/Py3-compatible raise (was: raise EnvironmentError, ...)
                raise EnvironmentError("stm file %s has not transcription "
                                       "for %s" % (self.config['stm_vad'],
                                       src_file))
            # Collect speech events and preceding backgrounds
            events = []
            backgs = []
            for tr in self.config['stm_trans'][src_file]:
                # Preceding background; None when the event starts at
                # the very beginning of the file
                if not tr[2]:
                    backgs.append(None)
                else:
                    backgs.append((0, tr[2] * self.config['in_fs']))
                # Speech event as (file, start, end) in samples
                # NOTE(review): boundaries are scaled by in_fs but index
                # the work_fs-resampled signal below — confirm
                # in_fs == work_fs or rescale accordingly
                events.append((src_file, tr[2] * self.config['in_fs'],
                               tr[3] * self.config['in_fs']))

        else:

            # ONE SINGLE EVENT IN PRESENT MICROPHONE
            T = int(self.config['work_fs'] * self.config['init_time'])
            events = [(src_file, T, -1)]
            backgs = [(0, T)]

        # Loop over events in the scene
        for backg, event in zip(backgs, events):

            # Read this audio file, resampling from in_fs to work_fs
            y_t = ia.read(src_file, in_fs=self.config['in_fs'],
                          out_fs=self.config['work_fs'])[0]

            #
            # BEAMFORMING
            #

            # Beamformer pointing at the frontal direction by default
            # (plain channel sum); guard added so mono 1-D signals do
            # not raise IndexError, consistent with the other variants
            if len(y_t.shape) > 1 and y_t.shape[1] > 1:
                y_t = y_t.sum(1)

            #
            # SPEECH ENHANCEMENT
            #

            # Select segment of background preceding speech
            # NOTE(review): d_t is currently unused — presumably meant to
            # initialize the noise estimator; confirm intent
            if backg:
                d_t = y_t[backg[0]:backg[1]]
            else:
                d_t = None

            # Select segment of speech
            # NOTE(review): end = -1 (single-event case) drops the last
            # sample; use None if the full tail is intended
            y_t = y_t[event[1]:event[2]]

            # Pre-emphasis, STFT
            y_t = sip.preemphasis(y_t, coef=self.config['preemcoef'])
            Y = sip.stft(y_t, self.config['windowsize'], self.config['shift'],
                         self.config['nfft'])
            # Update IMCRA noise estimate
            self.se.update(Y)

            # Get a priori SNR up to the last processed frame self.se.l
            xi = self.se.store['xi'][:, :self.se.l]
            # Wiener gain
            G = xi / (1 + xi)
            # Get Wiener estimate and residual MSE
            hat_X_W = G * Y
            MSE = G * self.se.store['Lambda_D'][:, :self.se.l]

            # (removed leftover set_trace() debugger breakpoint)

            # MFCC with uncertainty propagation
            mu_x, Sigma_x = self.mfcc.extract_up(hat_X_W, MSE)
            # CMS
            mu_x, Sigma_x = self.mfcc.cms_up(mu_x, Sigma_x)
            # Deltas, Accelerations
            mu_d, Sigma_d = fe.deltas_up(mu_x, Sigma_x)
            mu_a, Sigma_a = fe.deltas_up(mu_d, Sigma_d)
            mu_x = np.concatenate((mu_x, mu_d, mu_a))
            Sigma_x = np.concatenate((Sigma_x, Sigma_d, Sigma_a))

            # Append variances when uncertainty propagation is requested
            if self.config['unc_prop']:
                x = np.concatenate((mu_x, Sigma_x))
            else:
                x = mu_x

            # Write features in HTK format
            # (fixed: original referenced undefined name `target_file`)
            htk.writehtkfeats(tgt_file, x, self.config['fp'], self.config['tc'])