Example #1
import json
import logging

from vosk import KaldiRecognizer

async def recognize(websocket, path):
    global model
    global spk_model
    global args
    global loop
    global pool

    rec = None
    phrase_list = None
    sample_rate = args.sample_rate
    show_words = args.show_words
    max_alternatives = args.max_alternatives

    logging.info('Connection from %s', websocket.remote_address)

    while True:

        message = await websocket.recv()

        # Load configuration if provided
        if isinstance(message, str) and 'config' in message:
            jobj = json.loads(message)['config']
            logging.info("Config %s", jobj)
            if 'phrase_list' in jobj:
                phrase_list = jobj['phrase_list']
            if 'sample_rate' in jobj:
                sample_rate = float(jobj['sample_rate'])
            if 'words' in jobj:
                show_words = bool(jobj['words'])
            if 'max_alternatives' in jobj:
                max_alternatives = int(jobj['max_alternatives'])
            continue

        # Create the recognizer; pass the phrase list only when provided, since not every model supports it
        if not rec:
            if phrase_list:
                rec = KaldiRecognizer(
                    model, sample_rate,
                    json.dumps(phrase_list, ensure_ascii=False))
            else:
                rec = KaldiRecognizer(model, sample_rate)
            rec.SetWords(show_words)
            rec.SetMaxAlternatives(max_alternatives)
            if spk_model:
                rec.SetSpkModel(spk_model)

        response, stop = await loop.run_in_executor(pool, process_chunk, rec,
                                                    message)
        await websocket.send(response)
        if stop:
            break
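This handler leans on module-level state (model, spk_model, args, loop, pool) and a process_chunk helper defined elsewhere in the server script. Below is a minimal sketch of that wiring, assuming the websockets package and hard-coded defaults in place of the real argparse options; the names follow the snippet above, not a fixed API.

import argparse
import asyncio
import concurrent.futures
import os

import websockets
from vosk import Model

# Assumption: the real script fills these from argparse; hard-coded here.
args = argparse.Namespace(sample_rate=16000.0, show_words=True, max_alternatives=0)

def process_chunk(rec, message):
    """Feed one message to the recognizer; return (response, stop)."""
    if message == '{"eof" : 1}':        # client signals end of stream
        return rec.FinalResult(), True
    if rec.AcceptWaveform(message):     # utterance boundary reached
        return rec.Result(), False
    return rec.PartialResult(), False   # intermediate hypothesis

async def start():
    global model, spk_model, loop, pool
    model = Model(lang="en-us")         # assumption: default English model
    spk_model = None                    # set to SpkModel(...) to enable speaker output
    loop = asyncio.get_running_loop()
    pool = concurrent.futures.ThreadPoolExecutor((os.cpu_count() or 1))
    # The two-argument handler signature matches older websockets releases.
    async with websockets.serve(recognize, "0.0.0.0", 2700):
        await asyncio.Future()          # serve forever

# asyncio.run(start())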
Example #2
        "Please download the speaker model from https://alphacephei.com/vosk/models and unpack as {} in the current folder."
        .format(spk_model_path))
    exit(1)

wf = wave.open(sys.argv[1], "rb")
if (wf.getnchannels() != 1 or wf.getsampwidth() != 2
        or wf.getcomptype() != "NONE"):
    print("Audio file must be WAV format mono PCM.")
    exit(1)

# Large vocabulary free form recognition
model = Model(lang="en-us")
spk_model = SpkModel(spk_model_path)
#rec = KaldiRecognizer(model, wf.getframerate(), spk_model)
rec = KaldiRecognizer(model, wf.getframerate())
rec.SetSpkModel(spk_model)

# We compare speakers with cosine distance. We can keep one or several fingerprints
# per speaker in a database to distinguish among users (see the sketch after this snippet).
spk_sig = [
    -1.110417, 0.09703002, 1.35658, 0.7798632, -0.305457, -0.339204, 0.6186931,
    -0.4521213, 0.3982236, -0.004530723, 0.7651616, 0.6500852, -0.6664245,
    0.1361499, 0.1358056, -0.2887807, -0.1280468, -0.8208137, -1.620276,
    -0.4628615, 0.7870904, -0.105754, 0.9739769, -0.3258137, -0.7322628,
    -0.6212429, -0.5531687, -0.7796484, 0.7035915, 1.056094, -0.4941756,
    -0.6521456, -0.2238328, -0.003737517, 0.2165709, 1.200186, -0.7737719,
    0.492015, 1.16058, 0.6135428, -0.7183084, 0.3153541, 0.3458071, -1.418189,
    -0.9624157, 0.4168292, -1.627305, 0.2742135, -0.6166027, 0.1962581,
    -0.6406527, 0.4372789, -0.4296024, 0.4898657, -0.9531326, -0.2945702,
    0.7879696, -1.517101, -0.9344181, -0.5049928, -0.005040941, -0.4637912,
    0.8223695, -1.079849, 0.8871287, -0.9732434, -0.5548235, 1.879138,
    # ... remaining values truncated in this excerpt
]
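The excerpt breaks off inside the fingerprint vector and before the comparison loop. Below is a minimal sketch of the cosine-distance comparison described above, continuing the snippet's wf, rec, and spk_sig and assuming numpy is installed; final results carry an "spk" vector once a speaker model is set. (Note the fingerprint above is truncated; a real comparison needs the full vector.)

import json
import numpy as np

def cosine_dist(x, y):
    """1 - cosine similarity; smaller means more likely the same speaker."""
    nx, ny = np.array(x), np.array(y)
    return 1 - np.dot(nx, ny) / (np.linalg.norm(nx) * np.linalg.norm(ny))

while True:
    data = wf.readframes(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        res = json.loads(rec.Result())
        print("Text:", res.get("text", ""))
        if "spk" in res:
            print("X-vector distance:", cosine_dist(spk_sig, res["spk"]))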
Example #3
class VoskProcessor(EngineInterface):
    """Process chunks with Vosk"""
    def __init__(self, send_message, options: dict = None):
        """Create Vosk processor"""
        super().__init__(send_message)
        # Options
        if not options:
            options = {}
        # Common options - See 'EngineInterface'
        self._sample_rate = options.get("samplerate", 16000.0)
        self._language = options.get("language")
        if self._language:
            # make sure we have the xx-XX format
            self._language = self._language.replace("_", "-")
            self.language_code_short = self._language.split("-")[0].lower()
        else:
            self.language_code_short = None
        self._asr_model_path = options.get("model", None)
        self._continuous_mode = options.get("continuous", False)
        self._optimize_final_result = options.get("optimizeFinalResult", False)
        # Specific options
        self._alternatives = options.get("alternatives", 1)
        self._return_words = options.get("words", False)
        try_speaker_detection = options.get("speaker", False)
        self._phrase_list = options.get("phrases")
        # example: self._phrase_list = ["hallo", "kannst du mich hören", "[unk]"]
        # NOTE: speaker detection does not work in all configurations
        if try_speaker_detection:
            self._speaker_detection = (settings.has_speaker_detection_model
                                       and self._alternatives == 0)
        else:
            self._speaker_detection = False
        # Recognizer
        if self._asr_model_path:
            # Reset language because model has higher priority
            if self._asr_model_path in settings.asr_model_paths:
                model_index = settings.asr_model_paths.index(
                    self._asr_model_path)
                self._language = settings.asr_model_languages[model_index]
            else:
                self._language = ""
        elif not self._language or self._language not in settings.asr_model_languages:
            self._asr_model_path = settings.asr_model_paths[0]
            self._language = settings.asr_model_languages[0]
        else:
            model_index = settings.asr_model_languages.index(self._language)
            self._asr_model_path = settings.asr_model_paths[model_index]
        asr_model_path = settings.asr_models_folder + self._asr_model_path
        # Speaker model
        spk_model_path = (settings.speaker_models_folder
                          + settings.speaker_model_paths[0])
        # Make sure paths exist and load models
        if self._asr_model_path not in settings.asr_model_paths:
            raise RuntimeError(
                "ASR model path is not defined in available paths")
        if not os.path.exists(asr_model_path):
            raise RuntimeError("ASR model path seems to be wrong")
        if self._speaker_detection and not os.path.exists(spk_model_path):
            raise RuntimeError("Speaker model path seems to be wrong")
        self._model = Model(asr_model_path)
        if self._speaker_detection:
            self._spk_model = SpkModel(spk_model_path)
        # Use phrase list?
        if self._phrase_list and len(self._phrase_list) > 0:
            self._recognizer = KaldiRecognizer(
                self._model, self._sample_rate,
                json.dumps(self._phrase_list, ensure_ascii=False))
        else:
            self._recognizer = KaldiRecognizer(self._model, self._sample_rate)
        self._recognizer.SetMaxAlternatives(self._alternatives)
        if self._return_words:
            self._recognizer.SetWords(True)
        if self._speaker_detection:
            self._recognizer.SetSpkModel(self._spk_model)
        self._partial_result = {}
        self._last_partial_str = ""
        self._final_result = {}
        # states - 0: waiting for input, 1: got partial result, 2: got final result, 3: closing
        self._state = 0
        #
        # TODO: GPU support: check Vosk examples to find out how to enable GPU ... :-P
        # Example code:
        # from vosk import GpuInit, GpuInstantiate
        # GpuInit()
        # def thread_init():
        #     GpuInstantiate()
        # pool = concurrent.futures.ThreadPoolExecutor(initializer=thread_init)

    async def process(self, chunk: bytes):
        """Feed audio chunks to recognizer"""
        result = None
        if self._state == 3:
            # Already closing: ignore any further chunks
            pass
        elif self._recognizer.AcceptWaveform(chunk):
            # Silence detected
            result = self._recognizer.Result()
            self._state = 2
            await self._handle_final_result(result)
        else:
            # Partial results possible
            result = self._recognizer.PartialResult()
            self._state = 1
            await self._handle_partial_result(result)
        # End?
        #if not self.accept_chunks:
        #    await self._finish()

    async def finish_processing(self):
        """Wait for last process and end"""
        # End?
        await self._finish()

    async def close(self):
        """Reset recognizer and remove"""
        #if self._recognizer:
        #self._recognizer.Reset()   # this throws an error!? Maybe because it's closed already?
        #self._recognizer = None

    def get_options(self):
        """Get Vosk options for active setup"""
        active_options = {
            "language": self._language,
            "model": self._asr_model_path,
            "samplerate": self._sample_rate,
            "optimizeFinalResult": self._optimize_final_result,
            "alternatives": self._alternatives,
            "continuous": self._continuous_mode,
            "words": self._return_words,
            "speaker": self._speaker_detection
        }
        # NOTE: the phrase list can be very large, so for now we always
        # return a placeholder instead of the real list
        #active_options["phrases"] = self._phrase_list
        active_options["phrases"] = []
        return active_options

    async def _handle_partial_result(self, result):
        """Handle a partial result"""
        if result and self._last_partial_str != result:
            self._last_partial_str = result
            norm_result = VoskProcessor.normalize_result_format(
                result, self._alternatives, self._return_words)
            self._partial_result = norm_result
            #print("PARTIAL: ", self._partial_result)
            await self._send(self._partial_result, False)

    async def _handle_final_result(self, result, skip_send=False):
        """Handle a final result"""
        if result:
            #print("FINAL: ", result)
            norm_result = VoskProcessor.normalize_result_format(
                result, self._alternatives, self._return_words)
            if self._continuous_mode:
                # In continuous mode we send "intermediate" final results
                self._final_result = norm_result
                if not skip_send:
                    await self._send(self._final_result, True)
            else:
                # In non-continuous mode we remember one big result
                self._final_result = VoskProcessor.append_to_result(
                    self._final_result, norm_result)
            #print("FINAL (auto): ", self._final_result)

    async def _finish(self):
        """Tell recognizer to stop and handle last result"""
        last_result_was_final = (self._state == 2)
        self._state = 3
        if last_result_was_final and not self._continuous_mode:
            # Send final result (because we haven't done it yet)
            await self._send(self._final_result, True)
            # self._recognizer.Reset()  # TODO: we skip this to prevent ERROR if already reset
        elif last_result_was_final:
            # We don't need to do anything but reset ... right?
            # self._recognizer.Reset()  # TODO: we skip this to prevent ERROR if already reset
            pass
        else:
            # Request final
            result = self._recognizer.FinalResult()
            await self._handle_final_result(result, skip_send=True)
            await self._send(self._final_result, True)

    async def _send(self, json_result, is_final=False):
        """Send result"""
        features = {}
        alternatives = []
        if self._return_words:
            features["words"] = json_result.get("words", [])
        if self._speaker_detection:
            features["speaker_vector"] = json_result.get("spk", [])
        if self._alternatives > 0:
            alternatives = json_result.get("alternatives", [])
        transcript = json_result.get("text", "")
        # Post-processing?
        if is_final and transcript and self._optimize_final_result:
            # Optimize final transcription
            text2num_proc = TextToNumberProcessor(self._language)
            dt_optimizer = DateAndTimeOptimizer(self._language)
            transcript = text2num_proc.process(transcript)
            transcript = dt_optimizer.process(transcript)
        await self.send_transcript(
            transcript=transcript,
            is_final=is_final,
            confidence=json_result.get("confidence", -1),
            features=features,
            alternatives=alternatives)

    # ---- Helper functions ----

    @staticmethod
    def normalize_result_format(result: str, alternatives=0, has_words=False):
        """Vosk has many different formats depending on settings
        Convert result into a fixed format so we can handle it better"""
        json_result = json.loads(result)
        words = None
        if alternatives > 0 and "alternatives" in json_result:
            # handle array: first entry is the best hypothesis,
            # the rest are kept as alternatives
            json_result = json_result.get("alternatives", [])
            alternatives = json_result[1:] if len(json_result) > 1 else None
            if has_words:
                words = json_result[0].get("result")
            return VoskProcessor.build_normalized_result(
                json_result[0], alternatives, words)
        else:
            # handle object
            if has_words:
                words = json_result.get("result")
            return VoskProcessor.build_normalized_result(
                json_result, None, words)

    @staticmethod
    def build_normalized_result(json_result, alternatives=None, words=None):
        """Build a result object that always looks the same"""
        # text or partial or empty:
        text = json_result.get(
            "text", json_result.get("partial", json_result.get("final", "")))
        confidence = json_result.get("confidence", -1)
        speaker_vec = json_result.get("spk")
        result = {
            "text": text,
            "confidence": confidence,
            "alternatives": alternatives
        }
        if words is not None:
            result["words"] = words
        if speaker_vec is not None:
            result["spk"] = speaker_vec
        return result

    @staticmethod
    def append_to_result(given_result, new_result):
        """Append a new result to a previous one, typically used for
        'intermediate' final result text"""
        text = new_result.get("text")
        if not text:
            return given_result
        # NOTE: more post-processing could happen here
        if "text" in given_result:
            given_result["text"] += ", " + text
            if "confidence" in new_result:
                # sloppy confidence merge (take the worst)
                given_result["confidence"] = min(
                    given_result.get("confidence", -1),
                    new_result.get("confidence", -1))
            if "words" in new_result:
                # append words
                given_words = given_result.get("words", [])
                new_words = new_result.get("words", [])
                if given_words and new_words:
                    given_result["words"] = given_words + new_words
            if "spk" in new_result:
                # take new speaker data - NOTE: not optimal
                given_result["spk"] = new_result.get(
                    "spk", given_result.get("spk", []))
            return given_result
        else:
            new_result["text"] = text
            return new_result
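Since normalize_result_format and append_to_result are pure helpers, their behavior can be checked in isolation. Below is a small, hypothetical demonstration (the input strings are made up for illustration) of the two JSON shapes Vosk can return and how they normalize:

# Hypothetical inputs: the two result shapes Vosk produces.
plain = '{"text": "hello world", "result": [{"word": "hello"}, {"word": "world"}]}'
print(VoskProcessor.normalize_result_format(plain, alternatives=0, has_words=True))
# -> {'text': 'hello world', 'confidence': -1, 'alternatives': None,
#     'words': [{'word': 'hello'}, {'word': 'world'}]}

with_alts = ('{"alternatives": ['
             '{"text": "hello world", "confidence": 250.0},'
             '{"text": "hello word", "confidence": 240.0}]}')
print(VoskProcessor.normalize_result_format(with_alts, alternatives=2, has_words=False))
# -> {'text': 'hello world', 'confidence': 250.0,
#     'alternatives': [{'text': 'hello word', 'confidence': 240.0}]}

# append_to_result merges "intermediate" finals, keeping the worst confidence:
merged = VoskProcessor.append_to_result(
    {"text": "hello world", "confidence": 250.0},
    {"text": "how are you", "confidence": 240.0})
print(merged)  # -> {'text': 'hello world, how are you', 'confidence': 240.0}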