Example #1
    def __init__(self, vad_cfg, speech_thresh=0.7, non_speech_thresh=0.1):
        self.vad_cfg = vad_cfg

        # Smoothed-probability thresholds for switching into and out of speech.
        self.speech_thresh = speech_thresh
        self.non_speech_thresh = non_speech_thresh

        logging.info('Loading VAD model.')
        self.vad = FFNNVAD(**vad_cfg)
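
The two thresholds implement hysteresis: the smoothed speech ratio must rise above speech_thresh (0.7) to enter speech, but must fall below non_speech_thresh (0.1) to leave it again, which prevents rapid toggling around a single boundary. A minimal sketch of that idea in isolation (a hypothetical helper, not part of the class):

def hysteresis_step(ratio, in_speech, speech_thresh=0.7, non_speech_thresh=0.1):
    # Enter speech on a high smoothed ratio, leave only on a very low one;
    # anywhere in between, keep the previous state.
    if not in_speech and ratio > speech_thresh:
        return True
    if in_speech and ratio < non_speech_thresh:
        return False
    return in_speech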
Example #2
import logging
import os
import wave
from collections import deque

# FFNNVAD is a feed-forward neural-network voice activity detector supplied
# by the surrounding project; import it from wherever it lives in your
# codebase, e.g.:
# from alex.components.vad.ffnn import FFNNVAD


class RecordingSplitter(object):
    # Markers returned by the decision smoother when the VAD state flips.
    CHANGE_TO_NON_SPEECH = 2
    CHANGE_TO_SPEECH = 1

    # Default smoothed-probability thresholds for entering/leaving speech.
    speech_thresh = 0.7
    non_speech_thresh = 0.1

    # Number of frames read from the wav file per VAD decision.
    read_buffer_size = 128

    def __init__(self, vad_cfg, speech_thresh=0.7, non_speech_thresh=0.1):
        self.vad_cfg = vad_cfg

        self.speech_thresh = speech_thresh
        self.non_speech_thresh = non_speech_thresh

        logging.info('Loading VAD model.')
        self.vad = FFNNVAD(**vad_cfg)

    def split_single_channel_wav(self, file_name, out_dir, out_prefix):
        logging.info('Splitting %s' % file_name)
        wave_in = wave.open(file_name)

        sample_rate = wave_in.getframerate()
        sample_width = wave_in.getsampwidth()

        # Mono audio: each frame is sample_width bytes.
        bytes_per_second = sample_rate * sample_width

        # readframes() counts frames, so the number of reads (one VAD
        # decision each) per second follows from the frame rate.
        reads_per_second = sample_rate / float(self.read_buffer_size)

        (detection_window_sil, detection_window_speech,
         pre_detection_buffer) = self._initialize_buffers(reads_per_second)

        res_files = []
        res_file_cntr = 0

        frames = []

        is_speech = False
        n_read = 0          # bytes consumed from the input so far
        n_read_beg = None   # byte offset where the current segment began

        while True:
            audio_data = wave_in.readframes(self.read_buffer_size)
            if len(audio_data) == 0:
                break
            # Track the position in bytes so segment boundaries can later be
            # converted to seconds via bytes_per_second.
            n_read += len(audio_data)

            # Smooth the raw per-buffer decision over two sliding windows
            # (hysteresis: separate thresholds for entering/leaving speech).
            raw_vad_decision = self.vad.decide(audio_data)
            is_speech, change = self._smoothe_decison(raw_vad_decision,
                                                      is_speech,
                                                      detection_window_speech,
                                                      detection_window_sil)

            if not is_speech:
                pre_detection_buffer.append(audio_data)

            if change == self.CHANGE_TO_SPEECH:
                # Speech starts at the beginning of the current buffer.
                n_read_beg = n_read - len(audio_data)
                frames = []
            elif change == self.CHANGE_TO_NON_SPEECH:
                # Prepend the pre-detection buffer so the saved segment keeps
                # a bit of leading context before the detected speech onset.
                self._save_part(res_file_cntr,
                                list(pre_detection_buffer) + frames, out_dir,
                                res_files, wave_in, out_prefix, n_read_beg,
                                n_read, bytes_per_second)
                res_file_cntr += 1
                pre_detection_buffer.extend(
                    frames[-pre_detection_buffer.maxlen:])

            if is_speech:
                frames.append(audio_data)

        # Flush a segment that is still open at end of file. The explicit
        # 'is not None' matters because a segment can start at offset 0, and
        # the is_speech guard avoids re-saving an already finished segment.
        if n_read_beg is not None and is_speech:
            self._save_part(res_file_cntr, frames, out_dir, res_files, wave_in,
                            out_prefix, n_read_beg, n_read, bytes_per_second)

        return res_files

    def _initialize_buffers(self, reads_per_second):
        # Window sizes are expressed in numbers of reads (VAD decisions):
        # ~0.5 s of leading context, ~0.2 s of smoothing in each direction.
        pre_detection_buffer_frames = int(reads_per_second * 0.5)
        smoothe_decision_window_sil = int(reads_per_second * 0.2)
        smoothe_decision_window_speech = int(reads_per_second * 0.2)

        detection_window_speech = deque(maxlen=smoothe_decision_window_speech)
        detection_window_sil = deque(maxlen=smoothe_decision_window_sil)
        pre_detection_buffer = deque(maxlen=pre_detection_buffer_frames)

        return detection_window_sil, detection_window_speech, pre_detection_buffer

    def _smoothe_decison(self, decision, last_vad, detection_window_speech,
                         detection_window_sil):
        detection_window_speech.append(decision)
        detection_window_sil.append(decision)

        # Fraction of positive decisions in each window; the +1.0 biases the
        # ratio downwards while the windows are still filling up.
        speech = float(sum(detection_window_speech)) / (
            len(detection_window_speech) + 1.0)
        sil = float(
            sum(detection_window_sil)) / (len(detection_window_sil) + 1.0)

        vad = last_vad
        change = None
        if last_vad:
            # Last state was speech: leave only when activity drops very low.
            if sil < self.non_speech_thresh:
                vad = False
                change = self.CHANGE_TO_NON_SPEECH
        else:
            # Last state was silence: enter only when activity is high.
            if speech > self.speech_thresh:
                vad = True
                change = self.CHANGE_TO_SPEECH

        return vad, change

    def _save_part(self, cntr, frames, out_dir, res_files, wave_in, out_prefix,
                   n_read_beg, n_read_end, bytes_per_second):
        content = b''.join(frames)
        logging.info('Saving part %d (%.1f s).' %
                     (cntr, len(content) * 1.0 / bytes_per_second))

        res_file = os.path.join(out_dir, 'part.%s.%.3d.wav' % (
            out_prefix,
            cntr,
        ))
        # Write the segment with the same audio parameters as the input.
        wf = wave.open(res_file, 'wb')
        wf.setnchannels(wave_in.getnchannels())
        wf.setsampwidth(wave_in.getsampwidth())
        wf.setframerate(wave_in.getframerate())
        wf.writeframes(content)
        wf.close()

        # Record the (start, end) timestamps in seconds alongside the path.
        res_files.append(((n_read_beg * 1.0 / bytes_per_second,
                           n_read_end * 1.0 / bytes_per_second), res_file))
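
A minimal usage sketch, assuming a mono wav file and a hypothetical vad_cfg (the real configuration keys are defined by the FFNNVAD implementation in your project):

import logging

logging.basicConfig(level=logging.INFO)

vad_cfg = {'model': 'vad.tffnn'}  # hypothetical; depends on FFNNVAD

splitter = RecordingSplitter(vad_cfg)
# out_dir must already exist; segments are written as part.call.000.wav, ...
parts = splitter.split_single_channel_wav('call.wav', 'out', 'call')

for (start, end), path in parts:
    print('%6.2f-%6.2f s -> %s' % (start, end, path))

Each returned entry pairs the (start, end) timestamps in seconds with the path of the written wav segment.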