Example #1
    # GMM is the project's Gaussian mixture model class; `test` is assumed
    # to be a sequence of (frame, label) pairs prepared earlier.
    print('-' * 120)
    gmm_speech = GMM(n_features=0)
    gmm_speech.load_model('model_voip/vad_speech_sds_mfcc.gmm')
    gmm_sil = GMM(n_features=0)
    gmm_sil.load_model('model_voip/vad_sil_sds_mfcc.gmm')

    vta = test

    print('Length of test data:', len(vta))
    print(datetime.datetime.now())

    # Classify each frame by the sign of the log-likelihood ratio and
    # count the proportion of correct decisions.
    accuracy = 0.0
    n = 0
    for frame, label in vta:
        log_prob_speech = gmm_speech.score(frame)
        log_prob_sil = gmm_sil.score(frame)

        ratio = log_prob_speech - log_prob_sil
        if ratio >= 0:
            rec_label = 'speech'
        else:
            rec_label = 'sil'

        if rec_label == label:
            accuracy += 1.0

        n += 1

    # Final accuracy as a percentage of correctly classified frames.
    accuracy = accuracy * 100.0 / n
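
The loop above is a frame-level log-likelihood-ratio test: score returns a log likelihood, so ratio >= 0 is equivalent to p(frame | speech) >= p(frame | sil), i.e. a maximum-likelihood classifier with equal class priors. Below is a minimal self-contained sketch of the same rule, using scikit-learn's GaussianMixture as a stand-in for the project's GMM class (an assumption; the real models are trained offline and loaded from disk):

import numpy as np
from sklearn.mixture import GaussianMixture

# Toy stand-ins for the speech / silence models: one GMM per class,
# fitted on synthetic 2-D "features" instead of real MFCC vectors.
rng = np.random.RandomState(0)
speech_feats = rng.normal(loc=2.0, size=(500, 2))
sil_feats = rng.normal(loc=-2.0, size=(500, 2))

gmm_speech = GaussianMixture(n_components=2, random_state=0).fit(speech_feats)
gmm_sil = GaussianMixture(n_components=2, random_state=0).fit(sil_feats)

def classify_frame(frame):
    # score_samples returns per-sample log likelihoods, so the sign of
    # their difference implements the same ratio >= 0 rule as above.
    log_prob_speech = gmm_speech.score_samples(frame[None, :])[0]
    log_prob_sil = gmm_sil.score_samples(frame[None, :])[0]
    return 'speech' if log_prob_speech - log_prob_sil >= 0 else 'sil'

test = [(speech_feats[0], 'speech'), (sil_feats[0], 'sil')]
accuracy = 100.0 * sum(classify_frame(f) == l for f, l in test) / len(test)
print('accuracy: %.1f%%' % accuracy)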
Example #2
File: gmm.py Project: AoJ/alex
# Imports used by this snippet (GMM, MFCCFrontEnd and ASRException come
# from the surrounding project and are not shown here):
import struct
from collections import deque

import numpy as np
from scipy.special import logsumexp


class GMMVAD:
    """An implementation of a GMM-based voice activity detector.

    It decides whether each input frame is speech or non-speech and
    returns the posterior probability of speech averaged over the last
    N input frames.
    """
    def __init__(self, cfg):
        self.cfg = cfg

        # Buffer of raw samples that have not been framed yet.
        self.audio_recorded_in = []

        self.gmm_speech = GMM()
        self.gmm_speech.load_model(self.cfg['VAD']['gmm']['speech_model'])
        self.gmm_sil = GMM()
        self.gmm_sil.load_model(self.cfg['VAD']['gmm']['sil_model'])

        # Sliding windows of the last filter_length per-frame log likelihoods.
        self.log_probs_speech = deque(
            maxlen=self.cfg['VAD']['gmm']['filter_length'])
        self.log_probs_sil = deque(
            maxlen=self.cfg['VAD']['gmm']['filter_length'])

        self.last_decision = 0.0

        if self.cfg['VAD']['gmm']['frontend'] == 'MFCC':
            self.front_end = MFCCFrontEnd(
                self.cfg['Audio']['sample_rate'],
                self.cfg['VAD']['gmm']['framesize'],
                self.cfg['VAD']['gmm']['usehamming'],
                self.cfg['VAD']['gmm']['preemcoef'],
                self.cfg['VAD']['gmm']['numchans'],
                self.cfg['VAD']['gmm']['ceplifter'],
                self.cfg['VAD']['gmm']['numceps'],
                self.cfg['VAD']['gmm']['enormalise'],
                self.cfg['VAD']['gmm']['zmeansource'],
                self.cfg['VAD']['gmm']['usepower'],
                self.cfg['VAD']['gmm']['usec0'],
                self.cfg['VAD']['gmm']['usecmn'],
                self.cfg['VAD']['gmm']['usedelta'],
                self.cfg['VAD']['gmm']['useacc'],
                self.cfg['VAD']['gmm']['n_last_frames'],
                self.cfg['VAD']['gmm']['lofreq'],
                self.cfg['VAD']['gmm']['hifreq'])
        else:
            raise ASRException('Unsupported frontend: %s' %
                               (self.cfg['VAD']['gmm']['frontend'], ))

    def decide(self, data):
        """Processes the input audio and decides whether it is speech or non-speech.

        The returned value lies between 0.0 and 1.0: 1.0 for a 100% speech
        segment and 0.0 for a 100% non-speech segment.
        """

        # Interpret the raw byte string as signed 16-bit samples.
        data = struct.unpack('%dh' % (len(data) // 2, ), data)
        self.audio_recorded_in.extend(data)

        frame_size = self.cfg['VAD']['gmm']['framesize']
        frame_shift = self.cfg['VAD']['gmm']['frameshift']

        # Consume the buffered samples one frame at a time.
        while len(self.audio_recorded_in) > frame_size:
            frame = self.audio_recorded_in[:frame_size]
            self.audio_recorded_in = self.audio_recorded_in[frame_shift:]

            mfcc = self.front_end.param(frame)

            log_prob_speech = self.gmm_speech.score(mfcc)
            log_prob_sil = self.gmm_sil.score(mfcc)

            self.log_probs_speech.append(log_prob_speech)
            self.log_probs_sil.append(log_prob_sil)

            # Per-frame log posterior of speech (equal priors), averaged
            # over the filter window.
            log_prob_speech_avg = 0.0
            for log_prob_speech, log_prob_sil in zip(self.log_probs_speech,
                                                     self.log_probs_sil):
                log_prob_speech_avg += log_prob_speech - logsumexp(
                    [log_prob_speech, log_prob_sil])
            log_prob_speech_avg /= len(self.log_probs_speech)

            prob_speech_avg = np.exp(log_prob_speech_avg)

            # print('prob_speech_avg: %5.3f' % prob_speech_avg)

            self.last_decision = prob_speech_avg

        # Return the latest speech / non-speech decision.
        return self.last_decision
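
The smoothing in decide() first turns each pair of log likelihoods into a per-frame log posterior of speech, log P(speech | x) = log p(x | speech) - logsumexp([log p(x | speech), log p(x | sil)]), which assumes equal priors, and then averages these log posteriors over the last filter_length frames before exponentiating. A self-contained sketch of just that computation, with made-up log-likelihood values (np.logaddexp is logsumexp for exactly two terms):

from collections import deque

import numpy as np

filter_length = 5  # stand-in for cfg['VAD']['gmm']['filter_length']
log_probs_speech = deque(maxlen=filter_length)
log_probs_sil = deque(maxlen=filter_length)

def smoothed_speech_posterior(log_prob_speech, log_prob_sil):
    log_probs_speech.append(log_prob_speech)
    log_probs_sil.append(log_prob_sil)
    # Per-frame log posterior of speech under equal priors.
    log_posts = [ls - np.logaddexp(ls, ln)
                 for ls, ln in zip(log_probs_speech, log_probs_sil)]
    # Average in the log domain, then exponentiate, as decide() does.
    return np.exp(np.mean(log_posts))

# Made-up log likelihoods: two speech-like frames, then a silence-like one.
for ls, ln in [(-10.0, -14.0), (-11.0, -12.5), (-13.0, -9.0)]:
    print('P(speech) = %5.3f' % smoothed_speech_posterior(ls, ln))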
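
A usage sketch for GMMVAD: the configuration keys below are exactly those read in __init__() and decide(), but every value is an illustrative assumption (the model paths are borrowed from the first example), and decide() is fed raw 16-bit PCM bytes:

import struct

cfg = {
    'Audio': {'sample_rate': 16000},
    'VAD': {'gmm': {
        'frontend': 'MFCC',
        'speech_model': 'model_voip/vad_speech_sds_mfcc.gmm',
        'sil_model': 'model_voip/vad_sil_sds_mfcc.gmm',
        'filter_length': 5,
        'framesize': 512, 'frameshift': 160,
        'usehamming': True, 'preemcoef': 0.97,
        'numchans': 24, 'ceplifter': 22, 'numceps': 12,
        'enormalise': True, 'zmeansource': True, 'usepower': True,
        'usec0': False, 'usecmn': False,
        'usedelta': False, 'useacc': False,
        'n_last_frames': 0,
        'lofreq': 125, 'hifreq': 3800,
    }},
}

vad = GMMVAD(cfg)

# One buffer of all-zero 16-bit samples stands in for captured audio.
pcm = struct.pack('1024h', *([0] * 1024))
print('P(speech) = %5.3f' % vad.decide(pcm))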