import logging
import os
import wave
from collections import deque

# FFNNVADGeneral is the feed-forward NN voice activity detector from the
# surrounding project (the Alex dialogue framework); adjust the import path
# to wherever the class lives in your tree.
from alex.components.vad.ffnn import FFNNVADGeneral


class RecordingSplitter(object):
    """Splits a single-channel WAV file into speech segments using a VAD."""

    CHANGE_TO_NON_SPEECH = 2
    CHANGE_TO_SPEECH = 1

    speech_thresh = 0.7
    non_speech_thresh = 0.1

    read_buffer_size = 128  # number of frames read per chunk

    def __init__(self, vad_cfg, speech_thresh=0.7, non_speech_thresh=0.1):
        self.vad_cfg = vad_cfg
        self.speech_thresh = speech_thresh
        self.non_speech_thresh = non_speech_thresh

        logging.info("Loading VAD model.")
        self.vad = FFNNVADGeneral(**vad_cfg)

    def split_single_channel_wav(self, file_name, out_dir, out_prefix):
        logging.info("Splitting %s", file_name)

        wave_in = wave.open(file_name)

        sample_rate = wave_in.getframerate()
        sample_width = wave_in.getsampwidth()

        bytes_per_second = sample_rate * sample_width
        # How many read chunks make up one second of audio; each chunk is
        # read_buffer_size frames of sample_width bytes.
        chunks_per_second = bytes_per_second / (self.read_buffer_size * sample_width)

        (detection_window_sil,
         detection_window_speech,
         pre_detection_buffer) = self._initialize_buffers(chunks_per_second)

        res_files = []
        res_file_cntr = 0

        frames = []
        is_speech = False
        n_read = 0  # bytes read so far
        n_read_beg = None  # byte offset where the current speech segment began

        while True:
            audio_data = wave_in.readframes(self.read_buffer_size)
            if len(audio_data) == 0:
                break
            n_read += len(audio_data)

            raw_vad_decision = self.vad.decide(audio_data)
            is_speech, change = self._smooth_decision(
                raw_vad_decision, is_speech,
                detection_window_speech, detection_window_sil)

            if not is_speech:
                pre_detection_buffer.append(audio_data)

            if change == self.CHANGE_TO_SPEECH:
                n_read_beg = n_read - len(audio_data)
                frames = []
            elif change == self.CHANGE_TO_NON_SPEECH:
                self._save_part(res_file_cntr,
                                list(pre_detection_buffer) + frames,
                                out_dir, res_files, wave_in, out_prefix,
                                n_read_beg, n_read, bytes_per_second)
                res_file_cntr += 1
                pre_detection_buffer.extend(frames[-pre_detection_buffer.maxlen:])

            if is_speech:
                frames.append(audio_data)

        # Flush a segment that was still open when the file ended.  The
        # explicit checks matter: speech may start at byte offset 0, and
        # without the is_speech guard the previously saved segment would be
        # written out a second time.
        if is_speech and n_read_beg is not None:
            self._save_part(res_file_cntr, frames, out_dir, res_files, wave_in,
                            out_prefix, n_read_beg, n_read, bytes_per_second)

        return res_files

    def _initialize_buffers(self, chunks_per_second):
        # Keep 0.5 s of audio before each detected segment and smooth the raw
        # VAD decisions over 0.2 s windows.
        pre_detection_buffer_frames = int(chunks_per_second * 0.5)
        smooth_decision_window_sil = int(chunks_per_second * 0.2)
        smooth_decision_window_speech = int(chunks_per_second * 0.2)

        detection_window_speech = deque(maxlen=smooth_decision_window_speech)
        detection_window_sil = deque(maxlen=smooth_decision_window_sil)
        pre_detection_buffer = deque(maxlen=pre_detection_buffer_frames)

        return detection_window_sil, detection_window_speech, pre_detection_buffer

    def _smooth_decision(self, decision, last_vad,
                         detection_window_speech, detection_window_sil):
        detection_window_speech.append(decision)
        detection_window_sil.append(decision)

        speech = float(sum(detection_window_speech)) / (len(detection_window_speech) + 1.0)
        sil = float(sum(detection_window_sil)) / (len(detection_window_sil) + 1.0)

        vad = last_vad
        change = None
        if last_vad:
            # The last decision was speech; switch only once the window is
            # almost silent.
            if sil < self.non_speech_thresh:
                vad = False
                change = self.CHANGE_TO_NON_SPEECH
        else:
            # The last decision was non-speech; switch only on clear speech.
            if speech > self.speech_thresh:
                vad = True
                change = self.CHANGE_TO_SPEECH

        return vad, change

    def _save_part(self, cntr, frames, out_dir, res_files, wave_in, out_prefix,
                   n_read_beg, n_read_end, bytes_per_second):
        content = b"".join(frames)
        logging.info("Saving part %d (%.1f s).", cntr,
                     len(content) * 1.0 / bytes_per_second)

        res_file = os.path.join(out_dir, "part.%s.%.3d.wav" % (out_prefix, cntr))

        wf = wave.open(res_file, "wb")
        wf.setnchannels(wave_in.getnchannels())
        wf.setsampwidth(wave_in.getsampwidth())
        wf.setframerate(wave_in.getframerate())
        wf.writeframes(content)
        wf.close()

        res_files.append(((n_read_beg * 1.0 / bytes_per_second,
                           n_read_end * 1.0 / bytes_per_second),
                          res_file))
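

# --- Usage sketch (illustrative, not part of the original module) -----------
# A minimal driver showing how the splitter is meant to be called.  The
# vad_cfg dictionary below is a hypothetical placeholder: the real keys are
# whatever FFNNVADGeneral's constructor expects in your project (model path,
# frame sizes, etc.), so adjust them to your VAD configuration.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    vad_cfg = {
        # hypothetical key; replace with FFNNVADGeneral's actual arguments
        "model": "vad/model.tffnn",
    }

    out_dir = "out"
    os.makedirs(out_dir, exist_ok=True)

    splitter = RecordingSplitter(vad_cfg, speech_thresh=0.7,
                                 non_speech_thresh=0.1)

    # Split one mono WAV file; each returned item is ((start_s, end_s), path).
    parts = splitter.split_single_channel_wav("call.wav", out_dir, "call")
    for (start_s, end_s), path in parts:
        print("%.2f-%.2f s -> %s" % (start_s, end_s, path))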