Example #1
import json
import logging

from vosk import KaldiRecognizer

async def recognize(websocket, path):
    global model
    global spk_model
    global args
    global loop
    global pool

    rec = None
    phrase_list = None
    sample_rate = args.sample_rate
    show_words = args.show_words
    max_alternatives = args.max_alternatives

    logging.info('Connection from %s', websocket.remote_address)

    while True:

        message = await websocket.recv()

        # Load configuration if provided
        if isinstance(message, str) and 'config' in message:
            jobj = json.loads(message)['config']
            logging.info("Config %s", jobj)
            if 'phrase_list' in jobj:
                phrase_list = jobj['phrase_list']
            if 'sample_rate' in jobj:
                sample_rate = float(jobj['sample_rate'])
            if 'words' in jobj:
                show_words = bool(jobj['words'])
            if 'max_alternatives' in jobj:
                max_alternatives = int(jobj['max_alternatives'])
            continue

        # Create the recognizer; pass the phrase list only when provided, since not every model supports it
        if not rec:
            if phrase_list:
                rec = KaldiRecognizer(
                    model, sample_rate,
                    json.dumps(phrase_list, ensure_ascii=False))
            else:
                rec = KaldiRecognizer(model, sample_rate)
            rec.SetWords(show_words)
            rec.SetMaxAlternatives(max_alternatives)
            if spk_model:
                rec.SetSpkModel(spk_model)

        response, stop = await loop.run_in_executor(pool, process_chunk, rec,
                                                    message)
        await websocket.send(response)
        if stop:
            break
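This handler leans on module-level state (model, spk_model, args, loop, pool) and a process_chunk helper defined elsewhere in the server script. Below is a minimal sketch of that wiring, assuming the websockets package and hard-coded defaults in place of the real argparse options; the names follow the snippet above, not a fixed API.

import argparse
import asyncio
import concurrent.futures
import os

import websockets
from vosk import Model

# Assumption: the real script fills these from argparse; hard-coded here.
args = argparse.Namespace(sample_rate=16000.0, show_words=True, max_alternatives=0)

def process_chunk(rec, message):
    """Feed one message to the recognizer; return (response, stop)."""
    if message == '{"eof" : 1}':        # client signals end of stream
        return rec.FinalResult(), True
    if rec.AcceptWaveform(message):     # utterance boundary reached
        return rec.Result(), False
    return rec.PartialResult(), False   # intermediate hypothesis

async def start():
    global model, spk_model, loop, pool
    model = Model(lang="en-us")         # assumption: default English model
    spk_model = None                    # set to SpkModel(...) to enable speaker output
    loop = asyncio.get_running_loop()
    pool = concurrent.futures.ThreadPoolExecutor((os.cpu_count() or 1))
    # The two-argument handler signature matches older websockets releases.
    async with websockets.serve(recognize, "0.0.0.0", 2700):
        await asyncio.Future()          # serve forever

# asyncio.run(start())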
Example #2
        "Please download the speaker model from https://alphacephei.com/vosk/models and unpack as {} in the current folder."
        .format(spk_model_path))
    exit(1)

wf = wave.open(sys.argv[1], "rb")
if (wf.getnchannels() != 1 or wf.getsampwidth() != 2
        or wf.getcomptype() != "NONE"):
    print("Audio file must be WAV format mono PCM.")
    exit(1)

# Large vocabulary free form recognition
model = Model(lang="en-us")
spk_model = SpkModel(spk_model_path)
#rec = KaldiRecognizer(model, wf.getframerate(), spk_model)
rec = KaldiRecognizer(model, wf.getframerate())
rec.SetSpkModel(spk_model)

# We compare speakers with cosine distance. We can keep one or several fingerprints
# per speaker in a database to distinguish among users (see the sketch after this snippet).
spk_sig = [
    -1.110417, 0.09703002, 1.35658, 0.7798632, -0.305457, -0.339204, 0.6186931,
    -0.4521213, 0.3982236, -0.004530723, 0.7651616, 0.6500852, -0.6664245,
    0.1361499, 0.1358056, -0.2887807, -0.1280468, -0.8208137, -1.620276,
    -0.4628615, 0.7870904, -0.105754, 0.9739769, -0.3258137, -0.7322628,
    -0.6212429, -0.5531687, -0.7796484, 0.7035915, 1.056094, -0.4941756,
    -0.6521456, -0.2238328, -0.003737517, 0.2165709, 1.200186, -0.7737719,
    0.492015, 1.16058, 0.6135428, -0.7183084, 0.3153541, 0.3458071, -1.418189,
    -0.9624157, 0.4168292, -1.627305, 0.2742135, -0.6166027, 0.1962581,
    -0.6406527, 0.4372789, -0.4296024, 0.4898657, -0.9531326, -0.2945702,
    0.7879696, -1.517101, -0.9344181, -0.5049928, -0.005040941, -0.4637912,
    0.8223695, -1.079849, 0.8871287, -0.9732434, -0.5548235, 1.879138,
    # ... remaining values truncated in this excerpt
]
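The excerpt breaks off inside the fingerprint vector and before the comparison loop. Below is a minimal sketch of the cosine-distance comparison described above, continuing the snippet's wf, rec, and spk_sig and assuming numpy is installed; final results carry an "spk" vector once a speaker model is set. (Note the fingerprint above is truncated; a real comparison needs the full vector.)

import json
import numpy as np

def cosine_dist(x, y):
    """1 - cosine similarity; smaller means more likely the same speaker."""
    nx, ny = np.array(x), np.array(y)
    return 1 - np.dot(nx, ny) / (np.linalg.norm(nx) * np.linalg.norm(ny))

while True:
    data = wf.readframes(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        res = json.loads(rec.Result())
        print("Text:", res.get("text", ""))
        if "spk" in res:
            print("X-vector distance:", cosine_dist(spk_sig, res["spk"]))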
Example #3
class VoskProcessor(EngineInterface):
    """Process chunks with Vosk"""
    def __init__(self, send_message, options: dict = None):
        """Create Vosk processor"""
        super().__init__(send_message)
        # Options
        if not options:
            options = {}
        # Common options - See 'EngineInterface'
        self._sample_rate = options.get("samplerate", 16000.0)
        self._language = options.get("language")
        if self._language:
            # make sure we have the xx-XX format
            self._language = self._language.replace("_", "-")
            self.language_code_short = self._language.split("-")[0].lower()
        else:
            self.language_code_short = None
        self._asr_model_path = options.get("model", None)
        self._continuous_mode = options.get("continuous", False)
        self._optimize_final_result = options.get("optimizeFinalResult", False)
        # Specific options
        self._alternatives = options.get("alternatives", 1)
        self._return_words = options.get("words", False)
        try_speaker_detection = options.get("speaker", False)
        self._phrase_list = options.get("phrases")
        # example: self._phrase_list = ["hallo", "kannst du mich hören", "[unk]"]
        # NOTE: speaker detection does not work in all configurations
        if try_speaker_detection:
            self._speaker_detection = (settings.has_speaker_detection_model
                                       and self._alternatives == 0)
        else:
            self._speaker_detection = False
        # Recognizer
        if self._asr_model_path:
            # Reset language because model has higher priority
            if self._asr_model_path in settings.asr_model_paths:
                model_index = settings.asr_model_paths.index(
                    self._asr_model_path)
                self._language = settings.asr_model_languages[model_index]
            else:
                self._language = ""
        elif not self._language or self._language not in settings.asr_model_languages:
            self._asr_model_path = settings.asr_model_paths[0]
            self._language = settings.asr_model_languages[0]
        else:
            model_index = settings.asr_model_languages.index(self._language)
            self._asr_model_path = settings.asr_model_paths[model_index]
        asr_model_path = settings.asr_models_folder + self._asr_model_path
        # Speaker model
        spk_model_path = (settings.speaker_models_folder
                          + settings.speaker_model_paths[0])
        # Make sure paths exist and load models
        if self._asr_model_path not in settings.asr_model_paths:
            raise RuntimeError(
                "ASR model path is not defined in available paths")
        if not os.path.exists(asr_model_path):
            raise RuntimeError("ASR model path seems to be wrong")
        if self._speaker_detection and not os.path.exists(spk_model_path):
            raise RuntimeError("Speaker model path seems to be wrong")
        self._model = Model(asr_model_path)
        if self._speaker_detection:
            self._spk_model = SpkModel(spk_model_path)
        # Use phrase list?
        if self._phrase_list and len(self._phrase_list) > 0:
            self._recognizer = KaldiRecognizer(
                self._model, self._sample_rate,
                json.dumps(self._phrase_list, ensure_ascii=False))
        else:
            self._recognizer = KaldiRecognizer(self._model, self._sample_rate)
        self._recognizer.SetMaxAlternatives(self._alternatives)
        if self._return_words:
            self._recognizer.SetWords(True)
        if self._speaker_detection:
            self._recognizer.SetSpkModel(self._spk_model)
        self._partial_result = {}
        self._last_partial_str = ""
        self._final_result = {}
        # states - 0: waiting for input, 1: got partial result, 2: got final result, 3: closing
        self._state = 0
        #
        # TODO: GPU support: check Vosk examples to find out how to enable GPU ... :-P
        # Example code:
        # from vosk import GpuInit, GpuInstantiate
        # GpuInit()
        # def thread_init():
        #     GpuInstantiate()
        # pool = concurrent.futures.ThreadPoolExecutor(initializer=thread_init)

    async def process(self, chunk: bytes):
        """Feed audio chunks to recognizer"""
        result = None
        if self._state == 3:
            # Already closing: ignore any further chunks
            pass
        elif self._recognizer.AcceptWaveform(chunk):
            # Silence detected
            result = self._recognizer.Result()
            self._state = 2
            await self._handle_final_result(result)
        else:
            # Partial results possible
            result = self._recognizer.PartialResult()
            self._state = 1
            await self._handle_partial_result(result)
        # End?
        #if not self.accept_chunks:
        #    await self._finish()

    async def finish_processing(self):
        """Wait for last process and end"""
        # End?
        await self._finish()

    async def close(self):
        """Reset recognizer and remove"""
        #if self._recognizer:
        #self._recognizer.Reset()   # this throws an error!? Maybe because it's closed already?
        #self._recognizer = None

    def get_options(self):
        """Get Vosk options for active setup"""
        active_options = {
            "language": self._language,
            "model": self._asr_model_path,
            "samplerate": self._sample_rate,
            "optimizeFinalResult": self._optimize_final_result,
            "alternatives": self._alternatives,
            "continuous": self._continuous_mode,
            "words": self._return_words,
            "speaker": self._speaker_detection
        }
        # NOTE: the phrase list can be very large, so for now we always
        # return a placeholder instead of the real list
        #active_options["phrases"] = self._phrase_list
        active_options["phrases"] = []
        return active_options

    async def _handle_partial_result(self, result):
        """Handle a partial result"""
        if result and self._last_partial_str != result:
            self._last_partial_str = result
            norm_result = VoskProcessor.normalize_result_format(
                result, self._alternatives, self._return_words)
            self._partial_result = norm_result
            #print("PARTIAL: ", self._partial_result)
            await self._send(self._partial_result, False)

    async def _handle_final_result(self, result, skip_send=False):
        """Handle a final result"""
        if result:
            #print("FINAL: ", result)
            norm_result = VoskProcessor.normalize_result_format(
                result, self._alternatives, self._return_words)
            if self._continuous_mode:
                # In continuous mode we send "intermediate" final results
                self._final_result = norm_result
                if not skip_send:
                    await self._send(self._final_result, True)
            else:
                # In non-continuous mode we remember one big result
                self._final_result = VoskProcessor.append_to_result(
                    self._final_result, norm_result)
            #print("FINAL (auto): ", self._final_result)

    async def _finish(self):
        """Tell recognizer to stop and handle last result"""
        last_result_was_final = (self._state == 2)
        self._state = 3
        if last_result_was_final and not self._continuous_mode:
            # Send final result (because we haven't done it yet)
            await self._send(self._final_result, True)
            # self._recognizer.Reset()  # TODO: we skip this to prevent ERROR if already reset
        elif last_result_was_final:
            # We don't need to do anything but reset ... right?
            # self._recognizer.Reset()  # TODO: we skip this to prevent ERROR if already reset
            pass
        else:
            # Request final
            result = self._recognizer.FinalResult()
            await self._handle_final_result(result, skip_send=True)
            await self._send(self._final_result, True)

    async def _send(self, json_result, is_final=False):
        """Send result"""
        features = {}
        alternatives = []
        if self._return_words:
            features["words"] = json_result.get("words", [])
        if self._speaker_detection:
            features["speaker_vector"] = json_result.get("spk", [])
        if self._alternatives > 0:
            alternatives = json_result.get("alternatives", [])
        transcript = json_result.get("text", "")
        # Post-processing?
        if is_final and transcript and self._optimize_final_result:
            # Optimize final transcription
            text2num_proc = TextToNumberProcessor(self._language)
            dt_optimizer = DateAndTimeOptimizer(self._language)
            transcript = text2num_proc.process(transcript)
            transcript = dt_optimizer.process(transcript)
        await self.send_transcript(
            transcript=transcript,
            is_final=is_final,
            confidence=json_result.get("confidence", -1),
            features=features,
            alternatives=alternatives)

    # ---- Helper functions ----

    @staticmethod
    def normalize_result_format(result: str, alternatives=0, has_words=False):
        """Vosk has many different formats depending on settings
        Convert result into a fixed format so we can handle it better"""
        json_result = json.loads(result)
        words = None
        if alternatives > 0 and "alternatives" in json_result:
            # handle array: first entry is the best hypothesis,
            # the rest are kept as alternatives
            json_result = json_result.get("alternatives", [])
            alternatives = json_result[1:] if len(json_result) > 1 else None
            if has_words:
                words = json_result[0].get("result")
            return VoskProcessor.build_normalized_result(
                json_result[0], alternatives, words)
        else:
            # handle object
            if has_words:
                words = json_result.get("result")
            return VoskProcessor.build_normalized_result(
                json_result, None, words)

    @staticmethod
    def build_normalized_result(json_result, alternatives=None, words=None):
        """Build a result object that always looks the same"""
        # text or partial or empty:
        text = json_result.get(
            "text", json_result.get("partial", json_result.get("final", "")))
        confidence = json_result.get("confidence", -1)
        speaker_vec = json_result.get("spk")
        result = {
            "text": text,
            "confidence": confidence,
            "alternatives": alternatives
        }
        if words is not None:
            result["words"] = words
        if speaker_vec is not None:
            result["spk"] = speaker_vec
        return result

    @staticmethod
    def append_to_result(given_result, new_result):
        """Append a new result to a previous one, typically used for
        'intermediate' final result text"""
        text = new_result.get("text")
        if not text:
            return given_result
        # NOTE: more post-processing could happen here
        if "text" in given_result:
            given_result["text"] += ", " + text
            if "confidence" in new_result:
                # sloppy confidence merge (take the worst)
                given_result["confidence"] = min(
                    given_result.get("confidence", -1),
                    new_result.get("confidence", -1))
            if "words" in new_result:
                # append words
                given_words = given_result.get("words", [])
                new_words = new_result.get("words", [])
                if given_words and new_words:
                    given_result["words"] = given_words + new_words
            if "spk" in new_result:
                # take new speaker data - NOTE: not optimal
                given_result["spk"] = new_result.get(
                    "spk", given_result.get("spk", []))
            return given_result
        else:
            new_result["text"] = text
            return new_result
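Since normalize_result_format and append_to_result are pure helpers, their behavior can be checked in isolation. Below is a small, hypothetical demonstration (the input strings are made up for illustration) of the two JSON shapes Vosk can return and how they normalize:

# Hypothetical inputs: the two result shapes Vosk produces.
plain = '{"text": "hello world", "result": [{"word": "hello"}, {"word": "world"}]}'
print(VoskProcessor.normalize_result_format(plain, alternatives=0, has_words=True))
# -> {'text': 'hello world', 'confidence': -1, 'alternatives': None,
#     'words': [{'word': 'hello'}, {'word': 'world'}]}

with_alts = ('{"alternatives": ['
             '{"text": "hello world", "confidence": 250.0},'
             '{"text": "hello word", "confidence": 240.0}]}')
print(VoskProcessor.normalize_result_format(with_alts, alternatives=2, has_words=False))
# -> {'text': 'hello world', 'confidence': 250.0,
#     'alternatives': [{'text': 'hello word', 'confidence': 240.0}]}

# append_to_result merges "intermediate" finals, keeping the worst confidence:
merged = VoskProcessor.append_to_result(
    {"text": "hello world", "confidence": 250.0},
    {"text": "how are you", "confidence": 240.0})
print(merged)  # -> {'text': 'hello world, how are you', 'confidence': 240.0}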