Example #1
def get_transcript(wav_filename):
    """
    From a WAV filename, use vosk to generate a Transcript object
    See example code https://github.com/alphacep/vosk-api/blob/master/python/example/test_simple.py
    """
    transcript_text = []
    transcript = Transcript()
    wav_file = wave.open(wav_filename, "rb")
    if (wav_file.getnchannels() != 1 or wav_file.getsampwidth() != 2
            or wav_file.getcomptype() != "NONE"):
        raise ValueError("Audio file must be WAV format mono PCM.")
    model = Model(MODEL_DIR)
    rec = KaldiRecognizer(model, wav_file.getframerate())
    rec.SetWords(True)
    while True:
        data = wav_file.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            result = rec.Result()
            json_result = json.loads(result)
            if exists(json_result, 'result'):
                for word in json_result['result']:
                    item = Item()
                    item.start_time = word['start']
                    item.end_time = word['end']
                    item.confidence = word['conf']
                    item.content = word['word']
                    transcript.items.append(item)
            if exists(json_result, 'text'):
                transcript_text.append(json_result['text'])
    transcript.text = ' '.join(transcript_text)
    return transcript
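get_transcript relies on a few names defined elsewhere in the project: MODEL_DIR, the Transcript and Item containers, and an exists() helper. A minimal sketch of plausible definitions so the snippet runs on its own (the original project's versions may differ):

import json
import wave

from vosk import Model, KaldiRecognizer

MODEL_DIR = "model"  # assumed path to an unpacked Vosk model

class Item:
    """One recognized word with timing and confidence."""
    def __init__(self):
        self.start_time = 0.0
        self.end_time = 0.0
        self.confidence = 0.0
        self.content = ""

class Transcript:
    """Recognized text plus per-word items."""
    def __init__(self):
        self.text = ""
        self.items = []

def exists(json_obj, key):
    """True if the decoded Vosk result contains the given key."""
    return key in json_obj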
Example #2
def recognize(model_path, process):
    """Stream raw mono PCM from process stdout into Vosk, yielding formatted results."""
    vosk_model = Model(model_path)
    recognizer = KaldiRecognizer(vosk_model, sample_rate)
    recognizer.SetWords(True)

    while True:
        data = process.stdout.read(8000)
        if len(data) == 0:
            break
        if recognizer.AcceptWaveform(data):
            yield format_result(recognizer.Result())

    yield format_result(recognizer.FinalResult())
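recognize expects a module-level sample_rate and format_result, plus a process whose stdout delivers raw 16-bit mono PCM. A hedged usage sketch, with a trivial stand-in formatter and ffmpeg doing the decoding (the input file name is illustrative):

import json
import subprocess

sample_rate = 16000

def format_result(result_json):
    """Stand-in formatter: extract the text field from a Vosk JSON result."""
    return json.loads(result_json).get("text", "")

# Decode any input file to raw mono 16 kHz PCM on stdout.
process = subprocess.Popen(
    ["ffmpeg", "-loglevel", "quiet", "-i", "input.mp3",
     "-ar", str(sample_rate), "-ac", "1", "-f", "s16le", "-"],
    stdout=subprocess.PIPE)

for text in recognize("model", process):
    print(text)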
Example #3
async def recognize(websocket, path):
    global model
    global spk_model
    global args
    global loop
    global pool

    rec = None
    phrase_list = None
    sample_rate = args.sample_rate
    show_words = args.show_words
    max_alternatives = args.max_alternatives

    logging.info('Connection from %s', websocket.remote_address)

    while True:

        message = await websocket.recv()

        # Load configuration if provided
        if isinstance(message, str) and 'config' in message:
            jobj = json.loads(message)['config']
            logging.info("Config %s", jobj)
            if 'phrase_list' in jobj:
                phrase_list = jobj['phrase_list']
            if 'sample_rate' in jobj:
                sample_rate = float(jobj['sample_rate'])
            if 'words' in jobj:
                show_words = bool(jobj['words'])
            if 'max_alternatives' in jobj:
                max_alternatives = int(jobj['max_alternatives'])
            continue

        # Create the recognizer; the word list is temporarily disabled since not every model supports it
        if not rec:
            if phrase_list:
                rec = KaldiRecognizer(
                    model, sample_rate,
                    json.dumps(phrase_list, ensure_ascii=False))
            else:
                rec = KaldiRecognizer(model, sample_rate)
            rec.SetWords(show_words)
            rec.SetMaxAlternatives(max_alternatives)
            if spk_model:
                rec.SetSpkModel(spk_model)

        response, stop = await loop.run_in_executor(
            pool, process_chunk, rec, message)
        await websocket.send(response)
        if stop: break
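The handler offloads decoding to a thread pool through a process_chunk helper that is not shown here. A plausible sketch, consistent with how it is called (it returns the result string plus a stop flag once an assumed end-of-stream message arrives):

def process_chunk(rec, message):
    """Feed one websocket message to the recognizer; signal stop on EOF."""
    if message == '{"eof" : 1}':  # assumed end-of-stream marker
        return rec.FinalResult(), True
    elif rec.AcceptWaveform(message):
        return rec.Result(), False
    else:
        return rec.PartialResult(), False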
Example #4
    def process_entry(self, inputdata):
        logging.info(f'Recognizing {inputdata[0]}')

        rec = KaldiRecognizer(self.model, 16000)
        rec.SetWords(True)

        stream = self.resample_ffmpeg(inputdata[0])
        result, tot_samples = self.recognize_stream(rec, stream)
        final_result = self.format_result(result)

        if inputdata[1] != '':
            with open(inputdata[1], 'w', encoding='utf-8') as fh:
                fh.write(final_result)
        else:
            print(final_result)
        return final_result, tot_samples
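resample_ffmpeg and recognize_stream live elsewhere in the class. A hedged sketch of the resampling helper, assuming it shells out to ffmpeg for 16 kHz mono PCM like the other examples on this page (the real method may differ):

import subprocess

def resample_ffmpeg(self, infile):
    """Decode any input file to raw 16 kHz mono PCM; callers read .stdout."""
    cmd = ["ffmpeg", "-nostdin", "-loglevel", "quiet", "-i", str(infile),
           "-ar", "16000", "-ac", "1", "-f", "s16le", "-"]
    return subprocess.Popen(cmd, stdout=subprocess.PIPE)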
Example #5
    def StreamingRecognize(self, request_iterator, context):
        request = next(request_iterator)
        partial = request.config.specification.partial_results
        recognizer = KaldiRecognizer(
            self.model, request.config.specification.sample_rate_hertz)
        recognizer.SetMaxAlternatives(
            request.config.specification.max_alternatives)
        recognizer.SetWords(
            request.config.specification.enable_word_time_offsets)

        for request in request_iterator:
            res = recognizer.AcceptWaveform(request.audio_content)
            if res:
                yield self.get_response(recognizer.Result())
            elif partial:
                yield self.get_response(recognizer.PartialResult())
        yield self.get_response(recognizer.FinalResult())
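get_response wraps Vosk's JSON output in the service's generated protobuf response type, which depends on the project's .proto definitions. A minimal stand-in that only decodes the JSON, to show the data flow:

import json

def get_response(self, result_json):
    """Stand-in: the real service maps this dict onto its generated
    protobuf response message instead of returning it raw."""
    return json.loads(result_json)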
Example #6
    def text_from_sound_file(self, file):
        wf = wave.open(file, 'rb')
        if (wf.getnchannels() != 1 or wf.getsampwidth() != 2
                or wf.getcomptype() != 'NONE'):
            print(f'{self.log_prefix}audio file must be WAV format mono PCM.')
            exit(1)

        rec = KaldiRecognizer(self._model, wf.getframerate())
        rec.SetWords(True)

        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            # Intermediate results are discarded; only the final result is used.
            rec.AcceptWaveform(data)

        return json.loads(rec.FinalResult())['text']
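A hedged usage sketch for text_from_sound_file, assuming the enclosing class only needs a loaded vosk.Model in self._model and a log_prefix string (class and file names are illustrative):

from vosk import Model

class Transcriber:
    log_prefix = "[stt] "

    def __init__(self, model_dir="model"):
        self._model = Model(model_dir)

    # text_from_sound_file as defined above

print(Transcriber().text_from_sound_file("speech.wav"))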
Example #7
    def _recognize_vosk(self):
        SetLogLevel(0)
        if not os.path.exists("vosk-model-small-es-0.3"):
            raise Exception("Please download the model from https://alphacephei.com/vosk/models and unpack as 'vosk-model-small-es-0.3' in the current folder.")
        wf = wave.open(self.file_name, "rb")
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            raise Exception("Audio file must be WAV format mono PCM.")
        model = Model("vosk-model-small-es-0.3")
        rec = KaldiRecognizer(model, wf.getframerate())
        rec.SetWords(True)

        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            rec.AcceptWaveform(data)

        try:
            res = json.loads(rec.FinalResult())
            return res['text']
        except Exception as e:
            print(e)
            return ""
Example #8
            if 'model' in file and '.zip' not in file:
                if platform.system() == "Windows":
                    status = subprocess.call('xcopy %s model /E /I' % ('./VTT/' + file), shell=True)
                else:
                    status = subprocess.call('cp -r %s %s' % (os.getcwd() + '/VTT/' + file, os.getcwd() + '/VTT/model'), shell=True)

        print("Finished")
    except Exception as e:
        print(e)
        print("Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder.")
        exit(1)

sample_rate = 16000
model = Model("./VTT/model")
rec = KaldiRecognizer(model, sample_rate)
rec.SetWords(True)


process = subprocess.Popen(
    ['ffmpeg', '-loglevel', 'quiet', '-i', sys.argv[1],
     '-ar', str(sample_rate), '-ac', '1', '-f', 's16le', '-'],
    stdout=subprocess.PIPE)


WORDS_PER_LINE = 7

def transcribe():
    results = []
    subs = []
    while True:
        data = process.stdout.read(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            results.append(rec.Result())
    results.append(rec.FinalResult())
    # The full example goes on to group the recognized words into SRT
    # subtitle cues, WORDS_PER_LINE words per cue.
Example #9
def process_files_list(files_list, search_words):
    global count_words
    global count_raw_words
    global count_pure_words
    global testing_texts_to_write
    global validation_texts_to_write
    global amount_all_audio_files
    global count_all_audio_files
    global last_count_all_audio_files
    global process_audio_files_threshold
    global inappropriate_words
    global debug_raw_words
    for filename in files_list:
        count_all_audio_files += 1
        if count_all_audio_files - last_count_all_audio_files >= process_audio_files_threshold:
            last_count_all_audio_files = count_all_audio_files
            print('%d audio files processed, %d files left...' %
                  (last_count_all_audio_files,
                   amount_all_audio_files - last_count_all_audio_files))
        wf = wave.open(filename, "rb")
        if (wf.getnchannels() != 1 or wf.getsampwidth() != 2
                or wf.getcomptype() != "NONE"):
            print("Audio file must be WAV format mono PCM.")
            exit(1)

        rec = KaldiRecognizer(model, wf.getframerate())
        rec.SetWords(True)

        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            rec.AcceptWaveform(data)

        try:
            json_obj = json.loads(rec.FinalResult())
        except ValueError:
            continue

        result_field_name = 'result'
        if result_field_name in json_obj:
            for result in json_obj[result_field_name]:
                word = result['word']
                for search_word in search_words:
                    if search_word in word:
                        start = result['start']
                        end = result['end']
                        saved_start = start
                        saved_end = end
                        symbol_duration = (end - start) / len(word)
                        inappropriate = symbol_duration < search_words[search_word]
                        if inappropriate:
                            inappropriate_words[search_word] += 1
                        is_pure_word = word == search_word
                        if not is_pure_word:
                            found = str(word).find(search_word)
                            start += found * symbol_duration
                            end -= (len(word) - (found + len(search_word))) * symbol_duration
                        segment = AudioSegment.from_wav(filename)
                        segment = segment[start * 1000:end * 1000]
                        out_category_path = os.path.join(
                            out_folder, search_word)
                        Path(out_category_path).mkdir(parents=True,
                                                      exist_ok=True)
                        out_path = os.path.join(out_category_path,
                                                ntpath.basename(filename))
                        wav_ext = '.wav'
                        offset = len(out_path) - len(wav_ext)
                        out_path = out_path[:offset] + '_0' + out_path[offset:]
                        while os.path.isfile(out_path):
                            right = out_path.rfind(wav_ext)
                            left = out_path[:right].rfind('_')
                            new_num = int(out_path[(left + 1):right]) + 1
                            out_path = out_path[:left + 1] + str(new_num) + out_path[right:]
                        segment.export(out_path, format="wav")
                        if os.path.isfile(out_path):
                            text_to_write = search_word + '/' + ntpath.basename(
                                out_path) + '\n'
                            count_words[search_word] += 1
                            if is_pure_word:
                                count_pure_words[search_word] += 1
                                if not inappropriate:
                                    validation_texts_to_write[
                                        search_word].append(text_to_write)
                            else:
                                count_raw_words[search_word] += 1
                                if not inappropriate:
                                    testing_texts_to_write[search_word].append(
                                        text_to_write)
                                if debug_raw_words:
                                    segment = AudioSegment.from_wav(filename)
                                    segment = segment[saved_start *
                                                      1000:saved_end * 1000]
                                    out_category_path = os.path.join(
                                        out_folder,
                                        os.path.join(search_word, 'raw'))
                                    Path(out_category_path).mkdir(
                                        parents=True, exist_ok=True)
                                    segment.export(os.path.join(
                                        out_category_path,
                                        ntpath.basename(out_path)),
                                                   format="wav")
                        else:
                            print(
                                'Failed to cut segment for word "%s" from file %s'
                                % (search_word, filename))
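process_files_list reads a long list of module-level globals that the surrounding script sets up elsewhere. A hedged sketch of the minimal setup it assumes (names taken from the function, values illustrative):

import json
import ntpath
import os
import wave
from collections import defaultdict
from pathlib import Path

from pydub import AudioSegment
from vosk import Model, KaldiRecognizer

model = Model("model")           # assumed Vosk model directory
out_folder = "out"               # root folder for the cut word segments
search_words = {"hello": 0.05}   # word -> minimum seconds per character

count_words = defaultdict(int)
count_raw_words = defaultdict(int)
count_pure_words = defaultdict(int)
inappropriate_words = defaultdict(int)
testing_texts_to_write = defaultdict(list)
validation_texts_to_write = defaultdict(list)
amount_all_audio_files = 0
count_all_audio_files = 0
last_count_all_audio_files = 0
process_audio_files_threshold = 100
debug_raw_words = False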
Example #10
class VoskProcessor(EngineInterface):
    """Process chunks with Vosk"""
    def __init__(self, send_message, options: dict = None):
        """Create Vosk processor"""
        super().__init__(send_message)
        # Options
        if not options:
            options = {}
        # Common options - See 'EngineInterface'
        self._sample_rate = options.get("samplerate", float(16000))
        self._language = options.get("language")
        if self._language:
            self._language = self._language.replace("_", "-")  # make sure we have xx-XX format
            self.language_code_short = re.split("[-]", self._language)[0].lower()
        else:
            self.language_code_short = None
        self._asr_model_path = options.get("model", None)
        self._continuous_mode = options.get("continuous", False)
        self._optimize_final_result = options.get("optimizeFinalResult", False)
        # Specific options
        self._alternatives = options.get("alternatives", int(1))
        self._return_words = options.get("words", False)
        try_speaker_detection = options.get("speaker", False)
        self._phrase_list = options.get("phrases")
        # example: self._phrase_list = ["hallo", "kannst du mich hören", "[unk]"]
        # NOTE: speaker detection does not work in all configurations
        if try_speaker_detection:
            self._speaker_detection = (settings.has_speaker_detection_model
                                       and self._alternatives == 0)
        else:
            self._speaker_detection = False
        # Recognizer
        if self._asr_model_path:
            # Reset language because model has higher priority
            if self._asr_model_path in settings.asr_model_paths:
                model_index = settings.asr_model_paths.index(
                    self._asr_model_path)
                self._language = settings.asr_model_languages[model_index]
            else:
                self._language = ""
        elif not self._language or self._language not in settings.asr_model_languages:
            self._asr_model_path = settings.asr_model_paths[0]
            self._language = settings.asr_model_languages[0]
        else:
            model_index = settings.asr_model_languages.index(self._language)
            self._asr_model_path = settings.asr_model_paths[model_index]
        asr_model_path = settings.asr_models_folder + self._asr_model_path
        # Speaker model
        spk_model_path = settings.speaker_models_folder + settings.speaker_model_paths[0]
        # Make sure paths exist and load models
        if self._asr_model_path not in settings.asr_model_paths:
            raise RuntimeError(
                "ASR model path is not defined in available paths")
        if not os.path.exists(asr_model_path):
            raise RuntimeError("ASR model path seems to be wrong")
        if self._speaker_detection and not os.path.exists(spk_model_path):
            raise RuntimeError("Speaker model path seems to be wrong")
        self._model = Model(asr_model_path)
        if self._speaker_detection:
            self._spk_model = SpkModel(spk_model_path)
        # Use phrase list?
        if self._phrase_list and len(self._phrase_list) > 0:
            self._recognizer = KaldiRecognizer(
                self._model, self._sample_rate,
                json.dumps(self._phrase_list, ensure_ascii=False))
        else:
            self._recognizer = KaldiRecognizer(self._model, self._sample_rate)
        self._recognizer.SetMaxAlternatives(self._alternatives)
        if self._return_words:
            self._recognizer.SetWords(True)
        if self._speaker_detection:
            self._recognizer.SetSpkModel(self._spk_model)
        self._partial_result = {}
        self._last_partial_str = ""
        self._final_result = {}
        # states - 0: waiting for input, 1: got partial result, 2: got final result, 3: closing
        self._state = 0
        #
        # TODO: GPU support: check Vosk examples to find out how to enable GPU ... :-P
        # Example code:
        # from vosk import GpuInit, GpuInstantiate
        # GpuInit()
        # def thread_init():
        #     GpuInstantiate()
        # pool = concurrent.futures.ThreadPoolExecutor(initializer=thread_init)

    async def process(self, chunk: bytes):
        """Feed audio chunks to recognizer"""
        result = None
        if self._state == 3:
            pass
        elif self._recognizer.AcceptWaveform(chunk):
            # Silence detected
            result = self._recognizer.Result()
            self._state = 2
            await self._handle_final_result(result)
        else:
            # Partial results possible
            result = self._recognizer.PartialResult()
            self._state = 1
            await self._handle_partial_result(result)
        # End?
        #if not self.accept_chunks:
        #    await self._finish()

    async def finish_processing(self):
        """Wait for last process and end"""
        # End?
        await self._finish()

    async def close(self):
        """Reset recognizer and remove"""
        #if self._recognizer:
        #self._recognizer.Reset()   # this throws an error!? Maybe because it's closed already?
        #self._recognizer = None

    def get_options(self):
        """Get Vosk options for active setup"""
        active_options = {
            "language": self._language,
            "model": self._asr_model_path,
            "samplerate": self._sample_rate,
            "optimizeFinalResult": self._optimize_final_result,
            "alternatives": self._alternatives,
            "continuous": self._continuous_mode,
            "words": self._return_words,
            "speaker": self._speaker_detection
        }
        if self._phrase_list and len(self._phrase_list) > 0:
            # NOTE: this can be very large, for now we use a placeholder
            active_options["phrases"] = []
            #active_options["phrases"] = self._phrase_list
        else:
            active_options["phrases"] = []
        return active_options

    async def _handle_partial_result(self, result):
        """Handle a partial result"""
        if result and self._last_partial_str != result:
            self._last_partial_str = result
            norm_result = VoskProcessor.normalize_result_format(
                result, self._alternatives, self._return_words)
            self._partial_result = norm_result
            #print("PARTIAL: ", self._partial_result)
            await self._send(self._partial_result, False)

    async def _handle_final_result(self, result, skip_send=False):
        """Handle a final result"""
        if result:
            #print("FINAL: ", result)
            norm_result = VoskProcessor.normalize_result_format(
                result, self._alternatives, self._return_words)
            if self._continuous_mode:
                # In continuous mode we send "intermediate" final results
                self._final_result = norm_result
                if not skip_send:
                    await self._send(self._final_result, True)
            else:
                # In non-continuous mode we accumulate one big result
                self._final_result = VoskProcessor.append_to_result(
                    self._final_result, norm_result)
            #print("FINAL (auto): ", self._final_result)

    async def _finish(self):
        """Tell recognizer to stop and handle last result"""
        last_result_was_final = (self._state == 2)
        self._state = 3
        if last_result_was_final and not self._continuous_mode:
            # Send final result (because we haven't done it yet)
            await self._send(self._final_result, True)
            # self._recognizer.Reset()  # TODO: we skip this to prevent ERROR if already reset
        elif last_result_was_final:
            # We don't need to do anything but reset ... right?
            # self._recognizer.Reset()  # TODO: we skip this to prevent ERROR if already reset
            pass
        else:
            # Request final
            result = self._recognizer.FinalResult()
            await self._handle_final_result(result, skip_send=True)
            await self._send(self._final_result, True)

    async def _send(self, json_result, is_final=False):
        """Send result"""
        features = {}
        alternatives = []
        if self._return_words:
            features["words"] = json_result.get("words", [])
        if self._speaker_detection:
            features["speaker_vector"] = json_result.get("spk", [])
        if self._alternatives > 0:
            alternatives = json_result.get("alternatives", [])
        transcript = json_result.get("text", "")
        # Post-processing?
        if is_final and transcript and self._optimize_final_result:
            # Optimize final transcription
            text2num_proc = TextToNumberProcessor(self._language)
            dt_optimizer = DateAndTimeOptimizer(self._language)
            transcript = text2num_proc.process(transcript)
            transcript = dt_optimizer.process(transcript)
        await self.send_transcript(transcript=transcript,
                                   is_final=is_final,
                                   confidence=json_result.get(
                                       "confidence", -1),
                                   features=features,
                                   alternatives=alternatives)

    # ---- Helper functions ----

    @staticmethod
    def normalize_result_format(result: str, alternatives=0, has_words=False):
        """Vosk has many different formats depending on settings
        Convert result into a fixed format so we can handle it better"""
        json_result = json.loads(result)
        words = None
        if alternatives > 0 and "alternatives" in json_result:
            json_result = json_result.get("alternatives", [])
            # handle array
            alternatives = None
            if len(json_result) > 1:
                alternatives = json_result[1:]
            if has_words:
                words = json_result[0].get("result")
            return VoskProcessor.build_normalized_result(
                json_result[0], alternatives, words)
        else:
            # handle object
            if has_words:
                words = json_result.get("result")
            return VoskProcessor.build_normalized_result(
                json_result, None, words)

    @staticmethod
    def build_normalized_result(json_result, alternatives=None, words=None):
        """Build a result object that always looks the same"""
        # text or partial or empty:
        text = json_result.get(
            "text", json_result.get("partial", json_result.get("final", "")))
        confidence = json_result.get("confidence", -1)
        speaker_vec = json_result.get("spk")
        result = {
            "text": text,
            "confidence": confidence,
            "alternatives": alternatives
        }
        if words is not None:
            result["words"] = words
        if speaker_vec is not None:
            result["spk"] = speaker_vec
        return result

    @staticmethod
    def append_to_result(given_result, new_result):
        """Append a new result to a previous one, typically used for
        'intermediate' final result text"""
        text = new_result.get("text")
        if not text:
            return given_result
        #else:            # we can do more post-processing here maybe
        if "text" in given_result:
            given_result["text"] += ", " + text
            if "confidence" in new_result:
                # sloppy confidence merge (take the worst)
                given_result["confidence"] = min(
                    given_result.get("confidence", -1),
                    new_result.get("confidence", -1))
            if "words" in new_result:
                # append words
                given_words = given_result.get("words", [])
                new_words = new_result.get("words", [])
                if given_words and new_words:
                    given_result["words"] = given_words + new_words
            if "spk" in new_result:
                # take new speaker data - NOTE: not optimal
                given_result["spk"] = new_result.get(
                    "spk", given_result.get("spk", []))
            return given_result
        else:
            return new_result
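A hedged end-to-end sketch for the processor, assuming the surrounding settings module is configured with model paths and that EngineInterface expects an async send_message callback (callback and file names are illustrative):

import asyncio

async def send_message(msg):
    # Stand-in for the server's outgoing-message callback.
    print(msg)

async def main():
    proc = VoskProcessor(send_message, options={
        "samplerate": 16000.0,
        "language": "en-US",
        "words": True,
    })
    # Feed raw 16 kHz mono PCM in small chunks, then flush the final result.
    with open("speech.raw", "rb") as f:
        while chunk := f.read(4000):
            await proc.process(chunk)
    await proc.finish_processing()

asyncio.run(main())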