Code example #1
    def get_frame(self, file_name, frame_id):
        """Returns a frame from a specific param file."""
        if self.last_file_name != file_name:
            self.last_file_name = file_name

            # print "FN", file_name

            # find matching param file
            param_file_name = self.get_param_file_name(file_name)
            if param_file_name is None:
                raise Exception("MLFMFCCOnlineAlignedArray: param_file_name cannot be None, file_name: " + file_name)
            # print "PFN", param_file_name

            # open the param file
            try:
                self.last_param_file_features = wave.open(param_file_name, 'r')
            except AttributeError:
                print "Error opening file:", param_file_name
                raise

            if self.last_param_file_features.getnchannels() != 1:
                raise Exception('Input wave is not in mono')

            if self.last_param_file_features.getsampwidth() != 2:
                raise Exception('Input wave is not in 16bit')

            sample_rate = self.last_param_file_features.getframerate()
            self.frame_size = int(sample_rate * self.windowsize / 10000000)
            if self.frame_size > 1024:
                self.frame_size = 2048
            elif self.frame_size > 512:
                self.frame_size = 1024
            elif self.frame_size > 256:
                self.frame_size = 512
            elif self.frame_size > 128:
                self.frame_size = 256
            elif self.frame_size > 64:
                self.frame_size = 128

            self.frame_shift = int(sample_rate * self.targetrate / 10000000)
            self.mfcc_front_end = MFCCFrontEnd(sample_rate, self.frame_size, usec0=self.usec0,
                                               usedelta=self.usedelta, useacc=self.useacc,
                                               n_last_frames=self.n_last_frames, mel_banks_only=self.mel_banks_only)

        # print "FS", self.frame_size
        self.last_param_file_features.setpos(max(frame_id * self.frame_shift - int(self.frame_size / 2), 0))
        frame = self.last_param_file_features.readframes(self.frame_size)
        # print "LN", len(frame)

        frame = numpy.frombuffer(frame, dtype=numpy.int16)

        try:
            mfcc_params = self.mfcc_front_end.param(frame)
        except ValueError:
            print file_name, frame_id, len(frame)
            raise
            
        return mfcc_params
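
A note on the arithmetic above: windowsize and targetrate are expressed in HTK's 100 ns units, so dividing by 10,000,000 converts them to seconds before multiplying by the sample rate, and the if/elif ladder then rounds the window up to the next power of two (capped at 2048) so the FFT inside MFCCFrontEnd gets a convenient size. A minimal standalone sketch of the same computation; the 16 kHz sample rate is an assumed example, while 250000 and 100000 are the constructor defaults shown in code example #8:

# Sketch: frame_size / frame_shift derivation from HTK's 100 ns units.
sample_rate = 16000   # assumed example rate (Hz)
windowsize = 250000   # 25 ms window in 100 ns units (constructor default)
targetrate = 100000   # 10 ms shift in 100 ns units (constructor default)

frame_size = int(sample_rate * windowsize / 10000000)    # 400 samples
frame_shift = int(sample_rate * targetrate / 10000000)   # 160 samples

# Round up to the next power of two, as the ladder in get_frame() does
# (values <= 64 are left unchanged, values > 1024 all map to 2048).
for pow2 in (128, 256, 512, 1024):
    if pow2 // 2 < frame_size <= pow2:
        frame_size = pow2
        break
else:
    if frame_size > 1024:
        frame_size = 2048

print("frame_size=%d frame_shift=%d" % (frame_size, frame_shift))   # 512, 160
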
Code example #2
File: htk.py  Project: tkraut/alex
    def get_frame(self, file_name, frame_id):
        """Returns a frame from a specific param file."""
        if self.last_file_name != file_name:
            self.last_file_name = file_name

            # print "FN", file_name

            # find matching param file
            param_file_name = self.get_param_file_name(file_name)

            # print "PFN", param_file_name

            # open the param file
            self.last_param_file_features = wave.open(param_file_name, 'r')

            if self.last_param_file_features.getnchannels() != 1:
                raise Exception('Input wave is not in mono')

            if self.last_param_file_features.getsampwidth() != 2:
                raise Exception('Input wave is not in 16bit')

            sample_rate = self.last_param_file_features.getframerate()
            self.frame_size = int(sample_rate * self.windowsize / 10000000)
            if self.frame_size > 1024:
                self.frame_size = 2048
            elif self.frame_size > 512:
                self.frame_size = 1024
            elif self.frame_size > 256:
                self.frame_size = 512
            elif self.frame_size > 128:
                self.frame_size = 256
            elif self.frame_size > 64:
                self.frame_size = 128

            self.frame_shift = int(sample_rate * self.targetrate / 10000000)
            self.mfcc_front_end = MFCCFrontEnd(
                sample_rate,
                self.frame_size,
                usec0=self.usec0,
                usedelta=self.usedelta,
                useacc=self.useacc,
                n_last_frames=self.n_last_frames)

        # print "FS", self.frame_size
        self.last_param_file_features.setpos(
            max(frame_id * self.frame_shift - int(self.frame_size / 2), 0))
        frame = self.last_param_file_features.readframes(self.frame_size)
        # print "LN", len(frame)

        frame = numpy.frombuffer(frame, dtype=numpy.int16)

        mfcc_params = self.mfcc_front_end.param(frame)

        return mfcc_params
Code example #3
File: ffnn.py  Project: UFAL-DSG/alex
    def __init__(self, model, filter_length, sample_rate, framesize, frameshift,
                 usehamming, preemcoef, numchans, ceplifter, numceps,
                 enormalise, zmeansource, usepower, usec0, usecmn, usedelta,
                 useacc, n_last_frames, n_prev_frames, lofreq, hifreq,
                 mel_banks_only):
        self.audio_recorded_in = []

        self.ffnn = TheanoFFNN()
        self.ffnn.load(model)

        self.log_probs_speech = deque(maxlen=filter_length)
        self.log_probs_sil = deque(maxlen=filter_length)

        self.last_decision = 0.0

        self.front_end = MFCCFrontEnd(
            sample_rate, framesize,
            usehamming, preemcoef,
            numchans, ceplifter,
            numceps, enormalise,
            zmeansource, usepower,
            usec0, usecmn,
            usedelta, useacc,
            n_last_frames + n_prev_frames,
            lofreq, hifreq,
            mel_banks_only)

        self.framesize = framesize
        self.frameshift = frameshift
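
This variant (the FFNNVADGeneral constructor; the full class appears in code examples #9 and #11) takes every front-end parameter explicitly instead of reading a config dictionary. A hypothetical instantiation; the model path and all numeric values are illustrative assumptions, not the project's defaults:

# Hypothetical instantiation of FFNNVADGeneral; all values are examples.
vad = FFNNVADGeneral('vad_model.tffnn',          # assumed model path
                     filter_length=2, sample_rate=16000, framesize=512,
                     frameshift=160, usehamming=True, preemcoef=0.97,
                     numchans=24, ceplifter=22, numceps=12,
                     enormalise=True, zmeansource=True, usepower=True,
                     usec0=False, usecmn=False, usedelta=False, useacc=False,
                     n_last_frames=0, n_prev_frames=15,
                     lofreq=125, hifreq=3800, mel_banks_only=True)
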
Code example #4
File: ffnn.py  Project: beka-evature/alex
    def __init__(self, cfg):
        self.cfg = cfg

        self.audio_recorded_in = []

        self.ffnn = TheanoFFNN()
        self.ffnn.load(self.cfg['VAD']['ffnn']['model'])

        self.log_probs_speech = deque(maxlen=self.cfg['VAD']['ffnn']['filter_length'])
        self.log_probs_sil = deque(maxlen=self.cfg['VAD']['ffnn']['filter_length'])

        self.last_decision = 0.0

        if self.cfg['VAD']['ffnn']['frontend'] == 'MFCC':
            self.front_end = MFCCFrontEnd(
                self.cfg['Audio']['sample_rate'], self.cfg['VAD']['ffnn']['framesize'],
                self.cfg['VAD']['ffnn']['usehamming'], self.cfg['VAD']['ffnn']['preemcoef'],
                self.cfg['VAD']['ffnn']['numchans'], self.cfg['VAD']['ffnn']['ceplifter'],
                self.cfg['VAD']['ffnn']['numceps'], self.cfg['VAD']['ffnn']['enormalise'],
                self.cfg['VAD']['ffnn']['zmeansource'], self.cfg['VAD']['ffnn']['usepower'],
                self.cfg['VAD']['ffnn']['usec0'], self.cfg['VAD']['ffnn']['usecmn'],
                self.cfg['VAD']['ffnn']['usedelta'], self.cfg['VAD']['ffnn']['useacc'],
                self.cfg['VAD']['ffnn']['n_last_frames'] + self.cfg['VAD']['ffnn']['n_prev_frames'],
                self.cfg['VAD']['ffnn']['lofreq'], self.cfg['VAD']['ffnn']['hifreq'],
                self.cfg['VAD']['ffnn']['mel_banks_only'])
        else:
            raise ASRException('Unsupported frontend: %s' % (self.cfg['VAD']['ffnn']['frontend'], ))
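
The cfg-driven constructor above implies a nested dictionary layout. A minimal sketch of the shape it reads; the key paths come straight from the code, but every concrete value below is an illustrative assumption:

# Hypothetical minimal config for the cfg-driven FFNNVAD constructor.
cfg = {
    'Audio': {'sample_rate': 16000},
    'VAD': {
        'ffnn': {
            'frontend': 'MFCC',
            'model': 'vad_model.tffnn',   # assumed model path
            'filter_length': 2,
            'framesize': 512,
            'frameshift': 160,
            'usehamming': True,
            'preemcoef': 0.97,
            'numchans': 24,
            'ceplifter': 22,
            'numceps': 12,
            'enormalise': True,
            'zmeansource': True,
            'usepower': True,
            'usec0': False,
            'usecmn': False,
            'usedelta': False,
            'useacc': False,
            'n_last_frames': 0,
            'n_prev_frames': 15,
            'lofreq': 125,
            'hifreq': 3800,
            'mel_banks_only': True,
        },
    },
}

# vad = FFNNVAD(cfg)
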
Code example #5
    def __init__(self, cfg):
        self.cfg = cfg

        self.audio_recorded_in = []

        self.ffnn = TheanoFFNN()
        self.ffnn.load(self.cfg['VAD']['ffnn']['model'])

        self.log_probs_speech = deque(
            maxlen=self.cfg['VAD']['ffnn']['filter_length'])
        self.log_probs_sil = deque(
            maxlen=self.cfg['VAD']['ffnn']['filter_length'])

        self.last_decision = 0.0

        if self.cfg['VAD']['ffnn']['frontend'] == 'MFCC':
            self.front_end = MFCCFrontEnd(
                self.cfg['Audio']['sample_rate'],
                self.cfg['VAD']['ffnn']['framesize'],
                self.cfg['VAD']['ffnn']['usehamming'],
                self.cfg['VAD']['ffnn']['preemcoef'],
                self.cfg['VAD']['ffnn']['numchans'],
                self.cfg['VAD']['ffnn']['ceplifter'],
                self.cfg['VAD']['ffnn']['numceps'],
                self.cfg['VAD']['ffnn']['enormalise'],
                self.cfg['VAD']['ffnn']['zmeansource'],
                self.cfg['VAD']['ffnn']['usepower'],
                self.cfg['VAD']['ffnn']['usec0'],
                self.cfg['VAD']['ffnn']['usecmn'],
                self.cfg['VAD']['ffnn']['usedelta'],
                self.cfg['VAD']['ffnn']['useacc'],
                self.cfg['VAD']['ffnn']['n_last_frames'] +
                self.cfg['VAD']['ffnn']['n_prev_frames'],
                self.cfg['VAD']['ffnn']['lofreq'],
                self.cfg['VAD']['ffnn']['hifreq'],
                self.cfg['VAD']['ffnn']['mel_banks_only'])
        else:
            raise ASRException('Unsupported frontend: %s' %
                               (self.cfg['VAD']['ffnn']['frontend'], ))
Code example #6
File: htk.py  Project: AoJ/alex
    def get_frame(self, file_name, frame_id):
        """Returns a frame from a specific param file."""
        if self.last_file_name != file_name:
            self.last_file_name = file_name

#      print "FN", file_name

            # find matching param file
            param_file_name = self.get_param_file_name(file_name)

#      print "PFN", param_file_name

            # open the param file
            self.last_param_file_features = wave.open(param_file_name, 'r')

            if self.last_param_file_features.getnchannels() != 1:
                raise Exception('Input wave is not in mono')

            if self.last_param_file_features.getsampwidth() != 2:
                raise Exception('Input wave is not in 16bit')

            sample_rate = self.last_param_file_features.getframerate()
            self.frame_size = int(sample_rate * self.windowsize / 10000000)
            if self.frame_size > 1024:
                self.frame_size = 2048
            elif self.frame_size > 512:
                self.frame_size = 1024
            elif self.frame_size > 256:
                self.frame_size = 512
            elif self.frame_size > 128:
                self.frame_size = 256
            elif self.frame_size > 64:
                self.frame_size = 128

            self.frame_shift = int(sample_rate * self.targetrate / 10000000)
            self.mfcc_front_end = MFCCFrontEnd(sample_rate, self.frame_size, usec0=self.usec0,
                                               usedelta=self.usedelta, useacc=self.useacc,
                                               n_last_frames=self.n_last_frames)

#    print "FS", self.frame_size
        self.last_param_file_features.setpos(max(frame_id * self.frame_shift - int(self.frame_size / 2), 0))
        frame = self.last_param_file_features.readframes(self.frame_size)
#    print "LN", len(frame)

        frame = numpy.frombuffer(frame, dtype=numpy.int16)

        mfcc_params = self.mfcc_front_end.param(frame)

        return mfcc_params
Code example #7
File: ffnn.py  Project: henrypig/alex-1
    def __init__(self, model, filter_length, sample_rate, framesize,
                 frameshift, usehamming, preemcoef, numchans, ceplifter,
                 numceps, enormalise, zmeansource, usepower, usec0, usecmn,
                 usedelta, useacc, n_last_frames, n_prev_frames, lofreq,
                 hifreq, mel_banks_only):
        self.audio_recorded_in = []

        self.ffnn = TheanoFFNN()
        self.ffnn.load(model)

        self.log_probs_speech = deque(maxlen=filter_length)
        self.log_probs_sil = deque(maxlen=filter_length)

        self.last_decision = 0.0

        self.front_end = MFCCFrontEnd(sample_rate, framesize, usehamming,
                                      preemcoef, numchans, ceplifter, numceps,
                                      enormalise, zmeansource, usepower, usec0,
                                      usecmn, usedelta, useacc,
                                      n_last_frames + n_prev_frames, lofreq,
                                      hifreq, mel_banks_only)

        self.framesize = framesize
        self.frameshift = frameshift
Code example #8
class MLFMFCCOnlineAlignedArray(MLFFeaturesAlignedArray):

    """This is an extension of MLFFeaturesAlignedArray which computes the features on the fly from
    the input wav files.

    It uses our own implementation of the MFCC computation. As a result it does not give the same results
    as the HTK HCopy.

    The experience suggests that our MFFC features are worse than the features generated by HCopy.

    """

    def __init__(self, windowsize=250000, targetrate=100000, filter=None,
                 usec0=False, usedelta=True, useacc=True,
                 n_last_frames=0, mel_banks_only=False):
        """Initialise the MFCC front-end.

        windowsize - defines the length of the window (frame) in the HTK's 100ns units
        targetrate - defines the period with which new coefficients should be generated (again in 100ns units)
        """
        MLFFeaturesAlignedArray.__init__(self, filter)

        self.windowsize = windowsize
        self.targetrate = targetrate
        self.usec0 = usec0
        self.usedelta = usedelta
        self.useacc = useacc
        self.n_last_frames = n_last_frames
        self.mel_banks_only = mel_banks_only

        self.mfcc_front_end = None

    def get_frame(self, file_name, frame_id):
        """Returns a frame from a specific param file."""
        if self.last_file_name != file_name:
            self.last_file_name = file_name

            # print "FN", file_name

            # find matching param file
            param_file_name = self.get_param_file_name(file_name)
            if param_file_name is None:
                raise Exception("MLFMFCCOnlineAlignedArray: param_file_name cannot be None, file_name: " + file_name)
            # print "PFN", param_file_name

            # open the param file
            try:
                self.last_param_file_features = wave.open(param_file_name, 'r')
            except AttributeError:
                print "Error opening file:", param_file_name
                raise

            if self.last_param_file_features.getnchannels() != 1:
                raise Exception('Input wave is not in mono')

            if self.last_param_file_features.getsampwidth() != 2:
                raise Exception('Input wave is not in 16bit')

            sample_rate = self.last_param_file_features.getframerate()
            self.frame_size = int(sample_rate * self.windowsize / 10000000)
            if self.frame_size > 1024:
                self.frame_size = 2048
            elif self.frame_size > 512:
                self.frame_size = 1024
            elif self.frame_size > 256:
                self.frame_size = 512
            elif self.frame_size > 128:
                self.frame_size = 256
            elif self.frame_size > 64:
                self.frame_size = 128

            self.frame_shift = int(sample_rate * self.targetrate / 10000000)
            self.mfcc_front_end = MFCCFrontEnd(sample_rate, self.frame_size, usec0=self.usec0,
                                               usedelta=self.usedelta, useacc=self.useacc,
                                               n_last_frames=self.n_last_frames, mel_banks_only=self.mel_banks_only)

        # print "FS", self.frame_size
        self.last_param_file_features.setpos(max(frame_id * self.frame_shift - int(self.frame_size / 2), 0))
        frame = self.last_param_file_features.readframes(self.frame_size)
        # print "LN", len(frame)

        frame = numpy.frombuffer(frame, dtype=numpy.int16)

        try:
            mfcc_params = self.mfcc_front_end.param(frame)
        except ValueError:
            print file_name, frame_id, len(frame)
            raise
            
        return mfcc_params
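
A hedged usage sketch for the class above: 'utt0001' is a placeholder logical name, and resolving it to a wav path is the job of get_param_file_name() inherited from MLFFeaturesAlignedArray (not shown here):

# Hypothetical driver; assumes get_param_file_name() can resolve 'utt0001'.
fe = MLFMFCCOnlineAlignedArray(windowsize=250000, targetrate=100000,
                               usec0=False, usedelta=True, useacc=True)
first = fe.get_frame('utt0001', 0)    # MFCC vector for frame 0
tenth = fe.get_frame('utt0001', 10)   # same file: the opened wave is reused
print("feature dimensionality: %d" % len(first))
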
Code example #9
File: ffnn.py  Project: henrypig/alex-1
class FFNNVADGeneral(object):
    """ This is implementation of a FFNN based voice activity detector.

    It only implements decisions whether input frame is speech of non speech.
    It returns the posterior probability of speech for N last input frames.
    """
    def __init__(self, model, filter_length, sample_rate, framesize,
                 frameshift, usehamming, preemcoef, numchans, ceplifter,
                 numceps, enormalise, zmeansource, usepower, usec0, usecmn,
                 usedelta, useacc, n_last_frames, n_prev_frames, lofreq,
                 hifreq, mel_banks_only):
        self.audio_recorded_in = []

        self.ffnn = TheanoFFNN()
        self.ffnn.load(model)

        self.log_probs_speech = deque(maxlen=filter_length)
        self.log_probs_sil = deque(maxlen=filter_length)

        self.last_decision = 0.0

        self.front_end = MFCCFrontEnd(sample_rate, framesize, usehamming,
                                      preemcoef, numchans, ceplifter, numceps,
                                      enormalise, zmeansource, usepower, usec0,
                                      usecmn, usedelta, useacc,
                                      n_last_frames + n_prev_frames, lofreq,
                                      hifreq, mel_banks_only)

        self.framesize = framesize
        self.frameshift = frameshift

    def decide(self, data):
        """Processes the input frame whether the input segment is speech or non speech.

        The returned values can be in range from 0.0 to 1.0.
        It returns 1.0 for 100% speech segment and 0.0 for 100% non speech segment.
        """

        data = struct.unpack('%dh' % (len(data) / 2, ), data)
        self.audio_recorded_in.extend(data)

        while len(self.audio_recorded_in) > self.framesize:
            frame = self.audio_recorded_in[:self.framesize]
            self.audio_recorded_in = self.audio_recorded_in[self.frameshift:]

            mfcc = self.front_end.param(frame)

            prob_sil, prob_speech = self.ffnn.predict_normalise(
                mfcc.reshape(1, len(mfcc)))[0]

            # print prob_sil, prob_speech

            self.log_probs_speech.append(log(prob_speech))
            self.log_probs_sil.append(log(prob_sil))

            log_prob_speech_avg = 0.0
            for log_prob_speech, log_prob_sil in zip(self.log_probs_speech,
                                                     self.log_probs_sil):
                log_prob_speech_avg += log_prob_speech - logsumexp(
                    [log_prob_speech, log_prob_sil])
            log_prob_speech_avg /= len(self.log_probs_speech)

            prob_speech_avg = np.exp(log_prob_speech_avg)

            # print 'prob_speech_avg: %5.3f' % prob_speech_avg

            self.last_decision = prob_speech_avg

        # return the latest speech / non-speech decision
        return self.last_decision
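
The smoothing loop in decide() renormalises each (log p_sil, log p_speech) pair with logsumexp, averages the normalised log posteriors of speech over the last filter_length frames, and exponentiates the mean; the returned value is therefore the geometric mean of the per-frame speech posteriors. That step in isolation, with made-up example probabilities:

# Sketch: the posterior smoothing from decide(), isolated.
from math import exp, log
from scipy.special import logsumexp   # older scipy: scipy.misc.logsumexp

# made-up per-frame (prob_sil, prob_speech) pairs
frames = [(0.2, 0.8), (0.1, 0.9), (0.4, 0.6)]

log_prob_speech_avg = 0.0
for prob_sil, prob_speech in frames:
    lsp, lsl = log(prob_speech), log(prob_sil)
    # renormalise the pair so it sums to one in the probability domain
    log_prob_speech_avg += lsp - logsumexp([lsp, lsl])
log_prob_speech_avg /= len(frames)

# geometric mean of 0.8, 0.9 and 0.6, i.e. about 0.756
print("smoothed speech posterior: %.3f" % exp(log_prob_speech_avg))
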
Code example #10
File: ffnn.py  Project: tkraut/alex
class FFNNVAD():
    """ This is implementation of a FFNN based voice activity detector.

    It only implements decisions whether input frame is speech of non speech.
    It returns the posterior probability of speech for N last input frames.
    """
    def __init__(self, cfg):
        self.cfg = cfg

        self.audio_recorded_in = []

        self.ffnn = FFNN()
        self.ffnn.load(self.cfg['VAD']['ffnn']['model'])

        self.log_probs_speech = deque(
            maxlen=self.cfg['VAD']['ffnn']['filter_length'])
        self.log_probs_sil = deque(
            maxlen=self.cfg['VAD']['ffnn']['filter_length'])

        self.last_decision = 0.0

        if self.cfg['VAD']['ffnn']['frontend'] == 'MFCC':
            self.front_end = MFCCFrontEnd(
                self.cfg['Audio']['sample_rate'],
                self.cfg['VAD']['ffnn']['framesize'],
                self.cfg['VAD']['ffnn']['usehamming'],
                self.cfg['VAD']['ffnn']['preemcoef'],
                self.cfg['VAD']['ffnn']['numchans'],
                self.cfg['VAD']['ffnn']['ceplifter'],
                self.cfg['VAD']['ffnn']['numceps'],
                self.cfg['VAD']['ffnn']['enormalise'],
                self.cfg['VAD']['ffnn']['zmeansource'],
                self.cfg['VAD']['ffnn']['usepower'],
                self.cfg['VAD']['ffnn']['usec0'],
                self.cfg['VAD']['ffnn']['usecmn'],
                self.cfg['VAD']['ffnn']['usedelta'],
                self.cfg['VAD']['ffnn']['useacc'],
                self.cfg['VAD']['ffnn']['n_last_frames'],
                self.cfg['VAD']['ffnn']['lofreq'],
                self.cfg['VAD']['ffnn']['hifreq'])
        else:
            raise ASRException('Unsupported frontend: %s' %
                               (self.cfg['VAD']['ffnn']['frontend'], ))

    def decide(self, data):
        """Processes the input frame whether the input segment is speech or non speech.

        The returned values can be in range from 0.0 to 1.0.
        It returns 1.0 for 100% speech segment and 0.0 for 100% non speech segment.
        """

        data = struct.unpack('%dh' % (len(data) / 2, ), data)
        self.audio_recorded_in.extend(data)

        while len(self.audio_recorded_in) > self.cfg['VAD']['ffnn']['framesize']:
            frame = self.audio_recorded_in[:self.cfg['VAD']['ffnn']['framesize']]
            self.audio_recorded_in = self.audio_recorded_in[self.cfg['VAD']['ffnn']['frameshift']:]

            mfcc = self.front_end.param(frame)

            prob_sil, prob_speech = self.ffnn.predict(mfcc)

            self.log_probs_speech.append(log(prob_speech))
            self.log_probs_sil.append(log(prob_sil))

            log_prob_speech_avg = 0.0
            for log_prob_speech, log_prob_sil in zip(self.log_probs_speech,
                                                     self.log_probs_sil):
                log_prob_speech_avg += log_prob_speech - logsumexp(
                    [log_prob_speech, log_prob_sil])
            log_prob_speech_avg /= len(self.log_probs_speech)

            prob_speech_avg = np.exp(log_prob_speech_avg)

            # print 'prob_speech_avg: %5.3f' % prob_speech_avg

            self.last_decision = prob_speech_avg

        # return the latest speech / non-speech decision
        return self.last_decision
Code example #11
File: ffnn.py  Project: UFAL-DSG/alex
class FFNNVADGeneral(object):
    """ This is implementation of a FFNN based voice activity detector.

    It only implements decisions whether input frame is speech of non speech.
    It returns the posterior probability of speech for N last input frames.
    """
    def __init__(self, model, filter_length, sample_rate, framesize, frameshift,
                 usehamming, preemcoef, numchans, ceplifter, numceps,
                 enormalise, zmeansource, usepower, usec0, usecmn, usedelta,
                 useacc, n_last_frames, n_prev_frames, lofreq, hifreq,
                 mel_banks_only):
        self.audio_recorded_in = []

        self.ffnn = TheanoFFNN()
        self.ffnn.load(model)

        self.log_probs_speech = deque(maxlen=filter_length)
        self.log_probs_sil = deque(maxlen=filter_length)

        self.last_decision = 0.0

        self.front_end = MFCCFrontEnd(
            sample_rate, framesize,
            usehamming, preemcoef,
            numchans, ceplifter,
            numceps, enormalise,
            zmeansource, usepower,
            usec0, usecmn,
            usedelta, useacc,
            n_last_frames + n_prev_frames,
            lofreq, hifreq,
            mel_banks_only)

        self.framesize = framesize
        self.frameshift = frameshift

    def decide(self, data):
        """Processes the input frame whether the input segment is speech or non speech.

        The returned values can be in range from 0.0 to 1.0.
        It returns 1.0 for 100% speech segment and 0.0 for 100% non speech segment.
        """

        data = struct.unpack('%dh' % (len(data) / 2, ), data)
        self.audio_recorded_in.extend(data)

        while len(self.audio_recorded_in) > self.framesize:
            frame = self.audio_recorded_in[:self.framesize]
            self.audio_recorded_in = self.audio_recorded_in[self.frameshift:]

            mfcc = self.front_end.param(frame)

            prob_sil, prob_speech = self.ffnn.predict_normalise(mfcc.reshape(1, len(mfcc)))[0]

            # print prob_sil, prob_speech

            self.log_probs_speech.append(log(prob_speech))
            self.log_probs_sil.append(log(prob_sil))

            log_prob_speech_avg = 0.0
            for log_prob_speech, log_prob_sil in zip(self.log_probs_speech, self.log_probs_sil):
                log_prob_speech_avg += log_prob_speech - logsumexp([log_prob_speech, log_prob_sil])
            log_prob_speech_avg /= len(self.log_probs_speech)

            prob_speech_avg = np.exp(log_prob_speech_avg)

            # print 'prob_speech_avg: %5.3f' % prob_speech_avg

            self.last_decision = prob_speech_avg

        # return the latest speech / non-speech decision
        return self.last_decision
Code example #12
File: ffnn.py  Project: beka-evature/alex
class FFNNVAD():
    """ This is implementation of a FFNN based voice activity detector.

    It only implements decisions whether input frame is speech of non speech.
    It returns the posterior probability of speech for N last input frames.
    """
    def __init__(self, cfg):
        self.cfg = cfg

        self.audio_recorded_in = []

        self.ffnn = TheanoFFNN()
        self.ffnn.load(self.cfg['VAD']['ffnn']['model'])

        self.log_probs_speech = deque(maxlen=self.cfg['VAD']['ffnn']['filter_length'])
        self.log_probs_sil = deque(maxlen=self.cfg['VAD']['ffnn']['filter_length'])

        self.last_decision = 0.0

        if self.cfg['VAD']['ffnn']['frontend'] == 'MFCC':
            self.front_end = MFCCFrontEnd(
                self.cfg['Audio']['sample_rate'], self.cfg['VAD']['ffnn']['framesize'],
                self.cfg['VAD']['ffnn']['usehamming'], self.cfg['VAD']['ffnn']['preemcoef'],
                self.cfg['VAD']['ffnn']['numchans'], self.cfg['VAD']['ffnn']['ceplifter'],
                self.cfg['VAD']['ffnn']['numceps'], self.cfg['VAD']['ffnn']['enormalise'],
                self.cfg['VAD']['ffnn']['zmeansource'], self.cfg['VAD']['ffnn']['usepower'],
                self.cfg['VAD']['ffnn']['usec0'], self.cfg['VAD']['ffnn']['usecmn'],
                self.cfg['VAD']['ffnn']['usedelta'], self.cfg['VAD']['ffnn']['useacc'],
                self.cfg['VAD']['ffnn']['n_last_frames'] + self.cfg['VAD']['ffnn']['n_prev_frames'],
                self.cfg['VAD']['ffnn']['lofreq'], self.cfg['VAD']['ffnn']['hifreq'],
                self.cfg['VAD']['ffnn']['mel_banks_only'])
        else:
            raise ASRException('Unsupported frontend: %s' % (self.cfg['VAD']['ffnn']['frontend'], ))

    def decide(self, data):
        """Processes the input frame whether the input segment is speech or non speech.

        The returned values can be in range from 0.0 to 1.0.
        It returns 1.0 for 100% speech segment and 0.0 for 100% non speech segment.
        """

        data = struct.unpack('%dh' % (len(data) / 2, ), data)
        self.audio_recorded_in.extend(data)

        while len(self.audio_recorded_in) > self.cfg['VAD']['ffnn']['framesize']:
            frame = self.audio_recorded_in[:self.cfg['VAD']['ffnn']['framesize']]
            self.audio_recorded_in = self.audio_recorded_in[self.cfg['VAD']['ffnn']['frameshift']:]

            mfcc = self.front_end.param(frame)

            prob_sil, prob_speech = self.ffnn.predict_normalise(mfcc.reshape(1, len(mfcc)))[0]

            # print prob_sil, prob_speech

            self.log_probs_speech.append(log(prob_speech))
            self.log_probs_sil.append(log(prob_sil))

            log_prob_speech_avg = 0.0
            for log_prob_speech, log_prob_sil in zip(self.log_probs_speech, self.log_probs_sil):
                log_prob_speech_avg += log_prob_speech - logsumexp([log_prob_speech, log_prob_sil])
            log_prob_speech_avg /= len(self.log_probs_speech)

            prob_speech_avg = np.exp(log_prob_speech_avg)

            # print 'prob_speech_avg: %5.3f' % prob_speech_avg

            self.last_decision = prob_speech_avg

        # return the latest speech / non-speech decision
        return self.last_decision
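
Driving any of the decide() variants is the same: feed raw 16-bit little-endian mono PCM in arbitrary chunk sizes, and the detector buffers internally until a full frame is available. A hedged driver sketch using the standard wave module; the file name, chunk size, and 0.5 threshold are illustrative:

# Hypothetical driver: stream a wav file through the VAD chunk by chunk.
import wave

w = wave.open('input.wav', 'r')   # assumed 16 kHz, 16-bit, mono wav
vad = FFNNVAD(cfg)                # cfg as sketched after code example #4

chunk = 160                       # 10 ms at 16 kHz; illustrative
while True:
    data = w.readframes(chunk)    # raw bytes, 2 bytes per sample
    if not data:
        break
    p_speech = vad.decide(data)   # smoothed posterior in [0.0, 1.0]
    print("p(speech) = %.2f -> %s"
          % (p_speech, "speech" if p_speech > 0.5 else "non-speech"))
w.close()
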