async def recognize(websocket, path):
    """Serve one websocket client.

    Accepts optional JSON "config" text frames that tune per-connection
    settings, then streams binary audio chunks through a lazily-created
    KaldiRecognizer until the processing helper signals a stop.
    """
    global model
    global spk_model
    global args
    global loop
    global pool

    recognizer = None
    phrase_list = None
    # Per-connection settings start from the server-wide defaults.
    sample_rate = args.sample_rate
    show_words = args.show_words
    max_alternatives = args.max_alternatives

    logging.info('Connection from %s', websocket.remote_address)

    while True:
        message = await websocket.recv()

        # Text frames mentioning 'config' carry JSON options for this connection.
        if isinstance(message, str) and 'config' in message:
            options = json.loads(message)['config']
            logging.info("Config %s", options)
            if 'phrase_list' in options:
                phrase_list = options['phrase_list']
            if 'sample_rate' in options:
                sample_rate = float(options['sample_rate'])
            if 'words' in options:
                show_words = bool(options['words'])
            if 'max_alternatives' in options:
                max_alternatives = int(options['max_alternatives'])
            continue

        # Build the recognizer on the first audio chunk so that any config
        # frames received beforehand are taken into account.
        # NOTE: a phrase list is only honored by models that support grammars.
        if recognizer is None:
            if phrase_list:
                grammar = json.dumps(phrase_list, ensure_ascii=False)
                recognizer = KaldiRecognizer(model, sample_rate, grammar)
            else:
                recognizer = KaldiRecognizer(model, sample_rate)
            recognizer.SetWords(show_words)
            recognizer.SetMaxAlternatives(max_alternatives)
            if spk_model:
                recognizer.SetSpkModel(spk_model)

        # Run the heavy decoding in the thread pool to keep the event loop free.
        response, stop = await loop.run_in_executor(pool, process_chunk,
                                                    recognizer, message)
        await websocket.send(response)
        if stop:
            break
"Please download the speaker model from https://alphacephei.com/vosk/models and unpack as {} in the current folder." .format(spk_model_path)) exit(1) wf = wave.open(sys.argv[1], "rb") if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype( ) != "NONE": print("Audio file must be WAV format mono PCM.") exit(1) # Large vocabulary free form recognition model = Model(lang="en-us") spk_model = SpkModel(spk_model_path) #rec = KaldiRecognizer(model, wf.getframerate(), spk_model) rec = KaldiRecognizer(model, wf.getframerate()) rec.SetSpkModel(spk_model) # We compare speakers with cosine distance. We can keep one or several fingerprints for the speaker in a database # to distingusih among users. spk_sig = [ -1.110417, 0.09703002, 1.35658, 0.7798632, -0.305457, -0.339204, 0.6186931, -0.4521213, 0.3982236, -0.004530723, 0.7651616, 0.6500852, -0.6664245, 0.1361499, 0.1358056, -0.2887807, -0.1280468, -0.8208137, -1.620276, -0.4628615, 0.7870904, -0.105754, 0.9739769, -0.3258137, -0.7322628, -0.6212429, -0.5531687, -0.7796484, 0.7035915, 1.056094, -0.4941756, -0.6521456, -0.2238328, -0.003737517, 0.2165709, 1.200186, -0.7737719, 0.492015, 1.16058, 0.6135428, -0.7183084, 0.3153541, 0.3458071, -1.418189, -0.9624157, 0.4168292, -1.627305, 0.2742135, -0.6166027, 0.1962581, -0.6406527, 0.4372789, -0.4296024, 0.4898657, -0.9531326, -0.2945702, 0.7879696, -1.517101, -0.9344181, -0.5049928, -0.005040941, -0.4637912, 0.8223695, -1.079849, 0.8871287, -0.9732434, -0.5548235, 1.879138,
class VoskProcessor(EngineInterface):
    """Process chunks with Vosk.

    Wraps a Vosk KaldiRecognizer behind the project's 'EngineInterface':
    audio chunks go in via 'process', partial/final results are pushed out
    through the inherited 'send_transcript' callback.
    """

    def __init__(self, send_message, options: dict = None):
        """Create Vosk processor.

        Args:
            send_message: Callback handed to 'EngineInterface' (used to
                deliver results to the client).
            options: Optional settings dict. Keys read here: "samplerate",
                "language", "model", "continuous", "optimizeFinalResult",
                "alternatives", "words", "speaker", "phrases".

        Raises:
            RuntimeError: If the configured ASR or speaker model path is
                unknown or does not exist on disk.
        """
        super().__init__(send_message)
        # Options
        if not options:
            options = {}
        # Common options - See 'EngineInterface'
        self._sample_rate = options.get("samplerate", float(16000))
        self._language = options.get("language")
        if self._language:
            self._language = self._language.replace(
                "_", "-")  # make sure we have xx-XX format
            self.language_code_short = re.split("[-]",
                                                self._language)[0].lower()
        else:
            self.language_code_short = None
        self._asr_model_path = options.get("model", None)
        self._continuous_mode = options.get("continuous", False)
        self._optimize_final_result = options.get("optimizeFinalResult", False)
        # Specific options
        self._alternatives = options.get("alternatives", int(1))
        self._return_words = options.get("words", False)
        try_speaker_detection = options.get("speaker", False)
        self._phrase_list = options.get("phrases")
        # example: self._phrase_list = ["hallo", "kannst du mich hören", "[unk]"]
        # NOTE: speaker detection does not work in all configurations
        if try_speaker_detection:
            # Requires a configured speaker model AND alternatives disabled
            # (alternatives == 0); otherwise speaker detection stays off.
            self._speaker_detection = (settings.has_speaker_detection_model
                                       and self._alternatives == 0)
        else:
            self._speaker_detection = False
        # Recognizer
        if self._asr_model_path:
            # Reset language because model has higher priority
            if self._asr_model_path in settings.asr_model_paths:
                model_index = settings.asr_model_paths.index(
                    self._asr_model_path)
                self._language = settings.asr_model_languages[model_index]
            else:
                self._language = ""
        elif not self._language or self._language not in settings.asr_model_languages:
            # No (known) language requested: fall back to the first
            # configured model/language pair.
            self._asr_model_path = settings.asr_model_paths[0]
            self._language = settings.asr_model_languages[0]
        else:
            model_index = settings.asr_model_languages.index(self._language)
            self._asr_model_path = settings.asr_model_paths[model_index]
        asr_model_path = settings.asr_models_folder + self._asr_model_path
        # Speaker model - NOTE: only the first configured speaker model is used.
        spk_model_path = settings.speaker_models_folder + settings.speaker_model_paths[0]
        # Make sure paths exist and load models
        if self._asr_model_path not in settings.asr_model_paths:
            raise RuntimeError(
                "ASR model path is not defined in available paths")
        if not os.path.exists(asr_model_path):
            raise RuntimeError("ASR model path seems to be wrong")
        if self._speaker_detection and not os.path.exists(spk_model_path):
            raise RuntimeError("Speaker model path seems to be wrong")
        self._model = Model(asr_model_path)
        if self._speaker_detection:
            self._spk_model = SpkModel(spk_model_path)
        # Use phrase list? (passed as JSON grammar to the recognizer)
        if self._phrase_list and len(self._phrase_list) > 0:
            self._recognizer = KaldiRecognizer(
                self._model, self._sample_rate,
                json.dumps(self._phrase_list, ensure_ascii=False))
        else:
            self._recognizer = KaldiRecognizer(self._model, self._sample_rate)
        self._recognizer.SetMaxAlternatives(self._alternatives)
        if self._return_words:
            self._recognizer.SetWords(True)
        if self._speaker_detection:
            self._recognizer.SetSpkModel(self._spk_model)
        # Result buffers
        self._partial_result = {}
        self._last_partial_str = ""
        self._final_result = {}
        # states - 0: waiting for input, 1: got partial result, 2: got final result, 3: closing
        self._state = 0
        #
        # TODO: GPU support: check Vosk examples to find out how to enable GPU ... :-P
        # Example code:
        # from vosk import GpuInit, GpuInstantiate
        # GpuInit()
        # def thread_init():
        #     GpuInstantiate()
        # pool = concurrent.futures.ThreadPoolExecutor(initializer=thread_init)

    async def process(self, chunk: bytes):
        """Feed audio chunks to recognizer.

        Updates the internal state and forwards partial/final results to
        the client as they become available.
        """
        result = None
        if self._state == 3:
            # Closing: drop any further audio.
            pass
        elif self._recognizer.AcceptWaveform(chunk):
            # Silence detected - the recognizer produced a final result.
            result = self._recognizer.Result()
            self._state = 2
            await self._handle_final_result(result)
        else:
            # Partial results possible
            result = self._recognizer.PartialResult()
            self._state = 1
            await self._handle_partial_result(result)
        # End?
        #if not self.accept_chunks:
        #    await self._finish()

    async def finish_processing(self):
        """Wait for last process and end"""
        # End?
        await self._finish()

    async def close(self):
        """Reset recognizer and remove.

        Currently a no-op: resetting the recognizer here raised an error
        (maybe because it is closed already?), so the cleanup is disabled.
        """
        #if self._recognizer:
        #    self._recognizer.Reset()
        #    self._recognizer = None

    def get_options(self):
        """Get Vosk options for active setup"""
        active_options = {
            "language": self._language,
            "model": self._asr_model_path,
            "samplerate": self._sample_rate,
            "optimizeFinalResult": self._optimize_final_result,
            "alternatives": self._alternatives,
            "continuous": self._continuous_mode,
            "words": self._return_words,
            "speaker": self._speaker_detection
        }
        if self._phrase_list and len(self._phrase_list) > 0:
            # NOTE: this can be very large, for now we use a placeholder
            active_options["phrases"] = []
            #active_options["phrases"] = self._phrase_list
        else:
            active_options["phrases"] = []
        return active_options

    async def _handle_partial_result(self, result):
        """Handle a partial result.

        Only forwards the result if it differs from the last partial string,
        to avoid flooding the client with duplicates.
        """
        if result and self._last_partial_str != result:
            self._last_partial_str = result
            norm_result = VoskProcessor.normalize_result_format(
                result, self._alternatives, self._return_words)
            self._partial_result = norm_result
            #print("PARTIAL: ", self._partial_result)
            await self._send(self._partial_result, False)

    async def _handle_final_result(self, result, skip_send=False):
        """Handle a final result.

        In continuous mode each final result is sent right away (unless
        'skip_send' is set); otherwise results are merged into one growing
        final result that is sent later by '_finish'.
        """
        if result:
            #print("FINAL: ", result)
            norm_result = VoskProcessor.normalize_result_format(
                result, self._alternatives, self._return_words)
            if self._continuous_mode:
                # In continous mode we send "intermediate" final results
                self._final_result = norm_result
                if not skip_send:
                    await self._send(self._final_result, True)
            else:
                # In non-continous mode we remember one big result
                self._final_result = VoskProcessor.append_to_result(
                    self._final_result, norm_result)
                #print("FINAL (auto): ", self._final_result)

    async def _finish(self):
        """Tell recognizer to stop and handle last result"""
        last_result_was_final = (self._state == 2)
        self._state = 3
        if last_result_was_final and not self._continuous_mode:
            # Send final result (because we haven't done it yet)
            await self._send(self._final_result, True)
            # self._recognizer.Reset()  # TODO: we skip this to prevent ERROR if already reset
        elif last_result_was_final:
            # We don't need to do anything but reset ... right?
            # self._recognizer.Reset()  # TODO: we skip this to prevent ERROR if already reset
            pass
        else:
            # Request final
            result = self._recognizer.FinalResult()
            await self._handle_final_result(result, skip_send=True)
            await self._send(self._final_result, True)

    async def _send(self, json_result, is_final=False):
        """Send result.

        Extracts transcript, confidence, alternatives and optional features
        (words, speaker vector) from a normalized result and hands them to
        the inherited 'send_transcript'. Final transcripts are optionally
        post-processed (text-to-number, date/time optimization).
        """
        features = {}
        alternatives = []
        if self._return_words:
            features["words"] = json_result.get("words", [])
        if self._speaker_detection:
            features["speaker_vector"] = json_result.get("spk", [])
        if self._alternatives > 0:
            alternatives = json_result.get("alternatives", [])
        transcript = json_result.get("text", "")
        # Post-processing?
        if is_final and transcript and self._optimize_final_result:
            # Optimize final transcription
            text2num_proc = TextToNumberProcessor(self._language)
            dt_optimizer = DateAndTimeOptimizer(self._language)
            transcript = text2num_proc.process(transcript)
            transcript = dt_optimizer.process(transcript)
        await self.send_transcript(transcript=transcript,
                                   is_final=is_final,
                                   confidence=json_result.get(
                                       "confidence", -1),
                                   features=features,
                                   alternatives=alternatives)

    # ---- Helper functions ----

    @staticmethod
    def normalize_result_format(result: str, alternatives=0, has_words=False):
        """Vosk has many different formats depending on settings.
        Convert result into a fixed format so we can handle it better.

        When alternatives are enabled, Vosk returns an "alternatives" array:
        the first entry becomes the main result and the rest become the
        alternatives list. Otherwise the result is a plain object.
        """
        json_result = json.loads(result)
        words = None
        if alternatives > 0 and "alternatives" in json_result:
            json_result = json_result.get("alternatives", [])
            # handle array
            alternatives = None
            if len(json_result) > 1:
                alternatives = json_result[1:]
            if has_words:
                words = json_result[0].get("result")
            return VoskProcessor.build_normalized_result(
                json_result[0], alternatives, words)
        else:
            # handle object
            if has_words:
                words = json_result.get("result")
            return VoskProcessor.build_normalized_result(
                json_result, None, words)

    @staticmethod
    def build_normalized_result(json_result, alternatives=None, words=None):
        """Build a result object that always looks the same.

        Always contains "text", "confidence" (-1 if unknown) and
        "alternatives"; "words" and "spk" are only included when available.
        """
        # text or partial or empty:
        text = json_result.get(
            "text", json_result.get("partial", json_result.get("final", "")))
        confidence = json_result.get("confidence", -1)
        speaker_vec = json_result.get("spk")
        result = {
            "text": text,
            "confidence": confidence,
            "alternatives": alternatives
        }
        if words is not None:
            result["words"] = words
        if speaker_vec is not None:
            result["spk"] = speaker_vec
        return result

    @staticmethod
    def append_to_result(given_result, new_result):
        """Append a new result to a previous one, typically used for
        'intermediate' final result text.

        Returns 'given_result' with the new text/words/confidence merged in,
        or 'new_result' itself when the previous result had no text yet.
        """
        text = new_result.get("text")
        if not text:
            # Nothing new to add - keep the old result untouched.
            return given_result
        #else: # we can do more post-processing here maybe
        if "text" in given_result:
            given_result["text"] += ", " + text
            if "confidence" in new_result:
                # sloppy confidence merge (take the worst)
                given_result["confidence"] = min(
                    given_result.get("confidence", -1),
                    new_result.get("confidence", -1))
            if "words" in new_result:
                # append words
                given_words = given_result.get("words", [])
                new_words = new_result.get("words", [])
                if given_words and len(given_words) and new_words and len(
                        new_words):
                    given_result["words"] = given_words + new_words
            if "spk" in new_result:
                # take new speaker data - NOTE: not optimal
                given_result["spk"] = new_result.get(
                    "spk", given_result.get("spk", []))
            return given_result
        else:
            new_result["text"] = text
            return new_result