def get_transcript(wav_filename):
    """Transcribe a mono 16-bit PCM WAV file into a Transcript using vosk.

    See example code
    https://github.com/alphacep/vosk-api/blob/master/python/example/test_simple.py

    Args:
        wav_filename: Path of the WAV file to transcribe.

    Returns:
        A Transcript whose ``items`` holds one Item per recognized word
        (``start_time``, ``end_time``, ``confidence``, ``content``) and whose
        ``text`` is the space-joined text of every recognized utterance.

    Raises:
        Exception: If the file is not mono, 16-bit, uncompressed PCM.
    """
    transcript_text = []
    transcript = Transcript()

    def collect(raw_result, skip_empty_text=False):
        # Fold one vosk JSON result string into the transcript being built.
        json_result = json.loads(raw_result)
        if exists(json_result, 'result'):
            for word in json_result['result']:
                item = Item()
                item.start_time = word['start']
                item.end_time = word['end']
                item.confidence = word['conf']
                item.content = word['word']
                transcript.items.append(item)
        if exists(json_result, 'text'):
            text = json_result['text']
            if text or not skip_empty_text:
                transcript_text.append(text)

    # The context manager closes the file on every exit path, including the
    # format error below.
    with wave.open(wav_filename, "rb") as wav_file:
        if (wav_file.getnchannels() != 1 or wav_file.getsampwidth() != 2
                or wav_file.getcomptype() != "NONE"):
            print("Audio file must be WAV format mono PCM.")
            raise Exception("Audio file must be WAV format mono PCM.")

        model = Model(MODEL_DIR)
        rec = KaldiRecognizer(model, wav_file.getframerate())
        rec.SetWords(True)

        while True:
            data = wav_file.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                collect(rec.Result())

    # AcceptWaveform() only reports an utterance once it detects an endpoint,
    # so audio after the last endpoint is still buffered in the recognizer.
    # FinalResult() flushes it; without this the tail of the file is lost.
    # An empty final text is skipped so the joined text gets no trailing space.
    collect(rec.FinalResult(), skip_empty_text=True)

    transcript.text = ' '.join(transcript_text)
    return transcript
def reconize(model_path, process):
    """Yield formatted recognition results for audio read from a subprocess.

    Loads the model at ``model_path``, builds a recognizer at the module-level
    ``sample_rate`` with word output enabled, and feeds it 8000-byte reads of
    ``process.stdout`` until a read comes back empty.

    Yields:
        ``format_result(...)`` of every completed utterance, followed by
        ``format_result(...)`` of the recognizer's final result.
    """
    recognizer = KaldiRecognizer(Model(model_path), sample_rate)
    recognizer.SetWords(True)

    chunk = process.stdout.read(8000)
    while len(chunk) != 0:
        if recognizer.AcceptWaveform(chunk):
            yield format_result(recognizer.Result())
        chunk = process.stdout.read(8000)

    # Flush whatever is still buffered once the stream is exhausted.
    yield format_result(recognizer.FinalResult())
async def recognize(websocket, path):
    """Serve one websocket connection: apply config messages, decode the rest.

    Text messages containing ``config`` update the per-connection settings
    (phrase list, sample rate, word output, number of alternatives), which
    start from the module-level ``args`` defaults. Every other message is
    passed to ``process_chunk`` on the executor ``pool`` and the response is
    sent back. The loop ends when ``process_chunk`` reports ``stop``.
    """
    global model, spk_model, args, loop, pool

    recognizer = None
    phrases = None
    rate = args.sample_rate
    want_words = args.show_words
    n_alternatives = args.max_alternatives

    logging.info('Connection from %s', websocket.remote_address)

    finished = False
    while not finished:
        message = await websocket.recv()

        # Load configuration if provided
        if isinstance(message, str) and 'config' in message:
            config = json.loads(message)['config']
            logging.info("Config %s", config)
            if 'phrase_list' in config:
                phrases = config['phrase_list']
            if 'sample_rate' in config:
                rate = float(config['sample_rate'])
            if 'words' in config:
                want_words = bool(config['words'])
            if 'max_alternatives' in config:
                n_alternatives = int(config['max_alternatives'])
            continue

        # The recognizer is created lazily on the first non-config message so
        # that any configuration received before it is taken into account.
        if not recognizer:
            ctor_args = [model, rate]
            if phrases:
                ctor_args.append(json.dumps(phrases, ensure_ascii=False))
            recognizer = KaldiRecognizer(*ctor_args)
            recognizer.SetWords(want_words)
            recognizer.SetMaxAlternatives(n_alternatives)
            if spk_model:
                recognizer.SetSpkModel(spk_model)

        response, finished = await loop.run_in_executor(
            pool, process_chunk, recognizer, message)
        await websocket.send(response)
def process_entry(self, inputdata):
    """Recognize one input and write or print the formatted result.

    Args:
        inputdata: Indexable pair; element 0 is the input handed to
            ``self.resample_ffmpeg``, element 1 is the output file path, or
            ``''`` to print the result instead.

    Returns:
        Tuple ``(final_result, tot_samples)``: the formatted result and the
        second value returned by ``self.recognize_stream``.
    """
    source = inputdata[0]
    logging.info(f'Recognizing {source}')

    recognizer = KaldiRecognizer(self.model, 16000)
    recognizer.SetWords(True)

    audio_stream = self.resample_ffmpeg(source)
    raw_result, tot_samples = self.recognize_stream(recognizer, audio_stream)
    final_result = self.format_result(raw_result)

    destination = inputdata[1]
    if destination == '':
        print(final_result)
    else:
        with open(destination, 'w', encoding='utf-8') as out_file:
            out_file.write(final_result)

    return final_result, tot_samples
def StreamingRecognize(self, request_iterator, context):
    """Stream recognition responses for a stream of requests.

    The first request supplies ``config.specification`` (sample rate, maximum
    alternatives, word time offsets, partial results). Each later request
    supplies ``audio_content``.

    Yields:
        ``self.get_response(...)`` for each completed utterance, for each
        partial result when partial results are enabled, and for the final
        result after the request stream ends.
    """
    spec = next(request_iterator).config.specification
    want_partials = spec.partial_results

    recognizer = KaldiRecognizer(self.model, spec.sample_rate_hertz)
    recognizer.SetMaxAlternatives(spec.max_alternatives)
    recognizer.SetWords(spec.enable_word_time_offsets)

    for audio_request in request_iterator:
        if recognizer.AcceptWaveform(audio_request.audio_content):
            payload = recognizer.Result()
        elif want_partials:
            payload = recognizer.PartialResult()
        else:
            # Nothing to report for this chunk.
            continue
        yield self.get_response(payload)

    yield self.get_response(recognizer.FinalResult())
def text_from_sound_file(self, file):
    """Return the recognized text of a mono 16-bit PCM WAV file.

    Args:
        file: Path (or file object) accepted by ``wave.open``.

    Returns:
        The ``'text'`` field of the recognizer's final result.

    Raises:
        SystemExit: With code 1, after printing a message, if the file is not
            mono, 16-bit, uncompressed PCM.
    """
    # The context manager closes the wave reader on every exit path,
    # including the early exit below.
    with wave.open(file, 'rb') as wf:
        if (wf.getnchannels() != 1 or wf.getsampwidth() != 2
                or wf.getcomptype() != 'NONE'):
            print(f'{self.log_prefix}audio file must be WAV format mono PCM.')
            # Same effect as exit(1), without depending on the helper that
            # the `site` module injects into builtins.
            raise SystemExit(1)

        rec = KaldiRecognizer(self._model, wf.getframerate())
        rec.SetWords(True)

        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            # Intermediate results are deliberately ignored; only the final
            # result is read after all audio has been fed.
            rec.AcceptWaveform(data)

    return json.loads(rec.FinalResult())['text']
def _recognize_vosk(self):
    """Transcribe ``self.file_name`` with the small Spanish Vosk model.

    Returns:
        The ``'text'`` field of the recognizer's final result, or ``""`` if
        that result cannot be decoded or has no ``'text'`` field.

    Raises:
        Exception: If the model directory is missing from the current
            folder, or the file is not mono, 16-bit, uncompressed PCM.
    """
    model_dir = "vosk-model-small-es-0.3"

    SetLogLevel(0)
    if not os.path.exists(model_dir):
        # The message names the directory that is actually checked.
        raise Exception(
            "Please download the model from https://alphacephei.com/vosk/models "
            f"and unpack as '{model_dir}' in the current folder.")

    # The context manager closes the wave reader on every exit path.
    with wave.open(self.file_name, "rb") as wf:
        if (wf.getnchannels() != 1 or wf.getsampwidth() != 2
                or wf.getcomptype() != "NONE"):
            raise Exception("Audio file must be WAV format mono PCM.")

        model = Model(model_dir)
        rec = KaldiRecognizer(model, wf.getframerate())
        rec.SetWords(True)

        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            rec.AcceptWaveform(data)

    try:
        res = json.loads(rec.FinalResult())
        return res['text']
    except Exception as e:
        # Deliberate best-effort: report the problem and return no text.
        print(e)
        return ""
if 'model' in file and '.zip' not in file: if platform.system() == "Windows": status = subprocess.call('copy %s model /e'%('./VTT/'+file), shell=True) else: status = subprocess.call('cp -r %s %s'%(os.getcwd()+'/VTT/'+file+"", os.getcwd()+'/VTT/model'), shell=True) print("Finished") except Exception: print(Exception) print ("Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder.") exit (1) sample_rate=16000 model = Model("./VTT/model") rec = KaldiRecognizer(model, sample_rate) rec.SetWords(True) process = subprocess.Popen(['ffmpeg', '-loglevel', 'quiet', '-i', sys.argv[0], '-ar', str(sample_rate) , '-ac', '1', '-f', 's16le', '-'], stdout=subprocess.PIPE) WORDS_PER_LINE = 7 def transcribe(): results = [] subs = [] while True: data = process.stdout.read(4000)
def process_files_list(files_list, search_words):
    """Cut every occurrence of the search words out of the given WAV files.

    Each file is transcribed with word timings. For every recognized word that
    contains a search word, the matching part of the audio is exported to
    ``<out_folder>/<search_word>/<basename>_<n>.wav`` and the module-level
    counters and text lists are updated.

    Args:
        files_list: Iterable of WAV file paths (mono, 16-bit, uncompressed).
        search_words: Mapping from search word to the minimum per-character
            duration; a match whose average per-character duration is below
            it is counted as inappropriate and not added to the text lists.

    Raises:
        SystemExit: With code 1 if a file is not mono 16-bit PCM.
    """
    global count_words
    global count_raw_words
    global count_pure_words
    global testing_texts_to_write
    global validation_texts_to_write
    global amount_all_audio_files
    global count_all_audio_files
    global last_count_all_audio_files
    global process_audio_files_threshold
    global inappropriate_words
    global debug_raw_words

    for filename in files_list:
        count_all_audio_files += 1
        # Progress report every `process_audio_files_threshold` files.
        if (count_all_audio_files - last_count_all_audio_files) > (
                process_audio_files_threshold - 1):
            last_count_all_audio_files = count_all_audio_files
            print('%d audio files processed, %d files left...' %
                  (last_count_all_audio_files,
                   amount_all_audio_files - last_count_all_audio_files))

        # Close the reader as soon as the audio has been fed, so a large
        # corpus does not leave one open file handle per processed file.
        with wave.open(filename, "rb") as wf:
            if (wf.getnchannels() != 1 or wf.getsampwidth() != 2
                    or wf.getcomptype() != "NONE"):
                print("Audio file must be wav format mono PCM.")
                exit(1)

            rec = KaldiRecognizer(model, wf.getframerate())
            rec.SetWords(True)

            while True:
                data = wf.readframes(4000)
                if len(data) == 0:
                    break
                rec.AcceptWaveform(data)

        try:
            json_obj = json.loads(rec.FinalResult())
        except ValueError:
            # Undecodable recognizer output: skip this file.
            continue

        result_field_name = 'result'
        if result_field_name in json_obj:
            for result in json_obj[result_field_name]:
                word = result['word']
                for search_word in search_words:
                    if search_word in word:
                        start = result['start']
                        end = result['end']
                        saved_start = start
                        saved_end = end

                        # Average duration of one character of the word.
                        symbol_duration = (end - start) / len(word)
                        inappropriate = symbol_duration < search_words[
                            search_word]
                        if inappropriate:
                            inappropriate_words[search_word] += 1

                        is_pure_word = word == search_word
                        if not is_pure_word:
                            # The search word is only part of the recognized
                            # word: shrink the interval proportionally to the
                            # characters before and after the match.
                            found = str(word).find(search_word)
                            start += found * symbol_duration
                            end -= (
                                len(word) -
                                (found + len(search_word))) * symbol_duration

                        segment = AudioSegment.from_wav(filename)
                        # presumably start/end are seconds and the slice is
                        # in milliseconds - TODO confirm
                        segment = segment[start * 1000:end * 1000]

                        out_category_path = os.path.join(
                            out_folder, search_word)
                        Path(out_category_path).mkdir(parents=True,
                                                      exist_ok=True)
                        out_path = os.path.join(out_category_path,
                                                ntpath.basename(filename))

                        # Insert "_0" before the extension, then bump the
                        # number until the name is not taken yet.
                        wav_ext = '.wav'
                        offset = len(out_path) - len(wav_ext)
                        out_path = out_path[:offset] + '_0' + out_path[offset:]
                        while os.path.isfile(out_path):
                            right = out_path.rfind(wav_ext)
                            left = out_path[:right].rfind('_')
                            new_num = int(out_path[(left + 1):right]) + 1
                            out_path = out_path[:(
                                left + 1)] + str(new_num) + out_path[right:]

                        segment.export(out_path, format="wav")

                        if os.path.isfile(out_path):
                            text_to_write = search_word + '/' + ntpath.basename(
                                out_path) + '\n'
                            count_words[search_word] += 1
                            if is_pure_word:
                                count_pure_words[search_word] += 1
                                if not inappropriate:
                                    validation_texts_to_write[
                                        search_word].append(text_to_write)
                            else:
                                count_raw_words[search_word] += 1
                                if not inappropriate:
                                    testing_texts_to_write[search_word].append(
                                        text_to_write)
                                # NOTE(review): the untrimmed word is exported
                                # for partial matches only - confirm intended.
                                if debug_raw_words:
                                    segment = AudioSegment.from_wav(filename)
                                    segment = segment[saved_start *
                                                      1000:saved_end * 1000]
                                    out_category_path = os.path.join(
                                        out_folder,
                                        os.path.join(search_word, 'raw'))
                                    Path(out_category_path).mkdir(
                                        parents=True, exist_ok=True)
                                    segment.export(os.path.join(
                                        out_category_path,
                                        ntpath.basename(out_path)),
                                                   format="wav")
                        else:
                            print(
                                'Failed to cut segment for word "%s" from file %s'
                                % (search_word, filename))
class VoskProcessor(EngineInterface):
    """Process chunks with Vosk"""

    def __init__(self, send_message, options: dict = None):
        """Create Vosk processor

        Args:
            send_message: Passed through to ``EngineInterface.__init__``.
            options: Optional settings. Keys read here: ``samplerate``,
                ``language``, ``model``, ``continuous``,
                ``optimizeFinalResult``, ``alternatives``, ``words``,
                ``speaker``, ``phrases``.

        Raises:
            RuntimeError: If the selected ASR model is not one of the
                configured paths, or a required model path does not exist.
        """
        super().__init__(send_message)
        # Options
        if not options:
            options = {}
        # Common options - See 'EngineInterface'
        self._sample_rate = options.get("samplerate", float(16000))
        self._language = options.get("language")
        if self._language:
            # make sure we have xx-XX format
            self._language = self._language.replace("_", "-")
            self.language_code_short = re.split("[-]",
                                                self._language)[0].lower()
        else:
            self.language_code_short = None
        self._asr_model_path = options.get("model", None)
        self._continuous_mode = options.get("continuous", False)
        self._optimize_final_result = options.get("optimizeFinalResult", False)
        # Specific options
        self._alternatives = options.get("alternatives", int(1))
        self._return_words = options.get("words", False)
        try_speaker_detection = options.get("speaker", False)
        self._phrase_list = options.get("phrases")
        # example: self._phrase_list = ["hallo", "kannst du mich hören", "[unk]"]
        # NOTE: speaker detection does not work in all configurations
        if try_speaker_detection:
            self._speaker_detection = (settings.has_speaker_detection_model
                                       and self._alternatives == 0)
        else:
            self._speaker_detection = False
        # Recognizer
        if self._asr_model_path:
            # Reset language because model has higher priority
            if self._asr_model_path in settings.asr_model_paths:
                model_index = settings.asr_model_paths.index(
                    self._asr_model_path)
                self._language = settings.asr_model_languages[model_index]
            else:
                self._language = ""
        elif (not self._language
              or self._language not in settings.asr_model_languages):
            # Unknown or missing language: fall back to the first model
            self._asr_model_path = settings.asr_model_paths[0]
            self._language = settings.asr_model_languages[0]
        else:
            model_index = settings.asr_model_languages.index(self._language)
            self._asr_model_path = settings.asr_model_paths[model_index]
        asr_model_path = settings.asr_models_folder + self._asr_model_path
        # Make sure paths exist and load models
        if self._asr_model_path not in settings.asr_model_paths:
            raise RuntimeError(
                "ASR model path is not defined in available paths")
        if not os.path.exists(asr_model_path):
            raise RuntimeError("ASR model path seems to be wrong")
        # Speaker model - the path is only resolved when speaker detection is
        # active, so a setup without speaker models does not fail here.
        spk_model_path = None
        if self._speaker_detection:
            spk_model_path = (settings.speaker_models_folder +
                              settings.speaker_model_paths[0])
            if not os.path.exists(spk_model_path):
                raise RuntimeError("Speaker model path seems to be wrong")
        self._model = Model(asr_model_path)
        if self._speaker_detection:
            self._spk_model = SpkModel(spk_model_path)
        # Use phrase list?
        if self._phrase_list and len(self._phrase_list) > 0:
            self._recognizer = KaldiRecognizer(
                self._model, self._sample_rate,
                json.dumps(self._phrase_list, ensure_ascii=False))
        else:
            self._recognizer = KaldiRecognizer(self._model, self._sample_rate)
        self._recognizer.SetMaxAlternatives(self._alternatives)
        if self._return_words:
            self._recognizer.SetWords(True)
        if self._speaker_detection:
            self._recognizer.SetSpkModel(self._spk_model)
        self._partial_result = {}
        self._last_partial_str = ""
        self._final_result = {}
        # states - 0: waiting for input, 1: got partial result,
        # 2: got final result, 3: closing
        self._state = 0
        #
        # TODO: GPU support: check Vosk examples to find out how to enable GPU ... :-P
        # Example code:
        # from vosk import GpuInit, GpuInstantiate
        # GpuInit()
        # def thread_init():
        #     GpuInstantiate()
        # pool = concurrent.futures.ThreadPoolExecutor(initializer=thread_init)

    async def process(self, chunk: bytes):
        """Feed audio chunks to recognizer"""
        if self._state == 3:
            # Closing: ignore further audio
            return
        if self._recognizer.AcceptWaveform(chunk):
            # Silence detected
            result = self._recognizer.Result()
            self._state = 2
            await self._handle_final_result(result)
        else:
            # Partial results possible
            result = self._recognizer.PartialResult()
            self._state = 1
            await self._handle_partial_result(result)

    async def finish_processing(self):
        """Wait for last process and end"""
        await self._finish()

    async def close(self):
        """Reset recognizer and remove"""
        # NOTE: intentionally a no-op. Calling self._recognizer.Reset() here
        # threw an error (maybe because it is closed already?).

    def get_options(self):
        """Get Vosk options for active setup"""
        active_options = {
            "language": self._language,
            "model": self._asr_model_path,
            "samplerate": self._sample_rate,
            "optimizeFinalResult": self._optimize_final_result,
            "alternatives": self._alternatives,
            "continuous": self._continuous_mode,
            "words": self._return_words,
            "speaker": self._speaker_detection
        }
        # NOTE: the phrase list can be very large, so an empty placeholder is
        # reported whether or not a phrase list is active
        active_options["phrases"] = []
        return active_options

    async def _handle_partial_result(self, result):
        """Handle a partial result (sent only when it differs from the last)"""
        if result and self._last_partial_str != result:
            self._last_partial_str = result
            norm_result = VoskProcessor.normalize_result_format(
                result, self._alternatives, self._return_words)
            self._partial_result = norm_result
            await self._send(self._partial_result, False)

    async def _handle_final_result(self, result, skip_send=False):
        """Handle a final result"""
        if result:
            norm_result = VoskProcessor.normalize_result_format(
                result, self._alternatives, self._return_words)
            if self._continuous_mode:
                # In continuous mode we send "intermediate" final results
                self._final_result = norm_result
                if not skip_send:
                    await self._send(self._final_result, True)
            else:
                # In non-continuous mode we remember one big result
                self._final_result = VoskProcessor.append_to_result(
                    self._final_result, norm_result)

    async def _finish(self):
        """Tell recognizer to stop and handle last result"""
        last_result_was_final = (self._state == 2)
        self._state = 3
        if last_result_was_final and not self._continuous_mode:
            # Send final result (because we haven't done it yet)
            await self._send(self._final_result, True)
            # NOTE: Reset() is skipped to prevent an ERROR if already reset
        elif last_result_was_final:
            # Continuous mode already sent it; nothing left to do.
            # NOTE: Reset() is skipped to prevent an ERROR if already reset
            pass
        else:
            # Request final
            result = self._recognizer.FinalResult()
            await self._handle_final_result(result, skip_send=True)
            await self._send(self._final_result, True)

    async def _send(self, json_result, is_final=False):
        """Send result"""
        features = {}
        alternatives = []
        if self._return_words:
            features["words"] = json_result.get("words", [])
        if self._speaker_detection:
            features["speaker_vector"] = json_result.get("spk", [])
        if self._alternatives > 0:
            alternatives = json_result.get("alternatives", [])
        transcript = json_result.get("text", "")
        # Post-processing?
        if is_final and transcript and self._optimize_final_result:
            # Optimize final transcription
            text2num_proc = TextToNumberProcessor(self._language)
            dt_optimizer = DateAndTimeOptimizer(self._language)
            transcript = text2num_proc.process(transcript)
            transcript = dt_optimizer.process(transcript)
        await self.send_transcript(transcript=transcript,
                                   is_final=is_final,
                                   confidence=json_result.get(
                                       "confidence", -1),
                                   features=features,
                                   alternatives=alternatives)

    # ---- Helper functions ----

    @staticmethod
    def normalize_result_format(result: str, alternatives=0, has_words=False):
        """Vosk has many different formats depending on settings
        Convert result into a fixed format so we can handle it better

        Args:
            result: JSON string returned by the recognizer.
            alternatives: Configured number of alternatives; when > 0 and the
                JSON has an "alternatives" array, the first entry becomes the
                main result and the rest are kept as alternatives.
            has_words: Whether to copy the "result" word list into "words".

        Returns:
            Dict built by ``build_normalized_result``.
        """
        json_result = json.loads(result)
        words = None
        if alternatives > 0 and "alternatives" in json_result:
            # handle array
            candidates = json_result.get("alternatives") or []
            if not candidates:
                # Empty array: report an empty result instead of failing on
                # candidates[0]
                return VoskProcessor.build_normalized_result({}, None, None)
            best = candidates[0]
            others = candidates[1:] if len(candidates) > 1 else None
            if has_words:
                words = best.get("result")
            return VoskProcessor.build_normalized_result(best, others, words)
        # handle object
        if has_words:
            words = json_result.get("result")
        return VoskProcessor.build_normalized_result(json_result, None, words)

    @staticmethod
    def build_normalized_result(json_result, alternatives=None, words=None):
        """Build a result object that always looks the same

        Returns a dict with "text", "confidence" (-1 if missing) and
        "alternatives", plus "words" and "spk" when they are available.
        """
        # text or partial or empty:
        text = json_result.get(
            "text", json_result.get("partial", json_result.get("final", "")))
        confidence = json_result.get("confidence", -1)
        speaker_vec = json_result.get("spk")
        result = {
            "text": text,
            "confidence": confidence,
            "alternatives": alternatives
        }
        if words is not None:
            result["words"] = words
        if speaker_vec is not None:
            result["spk"] = speaker_vec
        return result

    @staticmethod
    def append_to_result(given_result, new_result):
        """Append a new result to a previous one, typically used for
        'intermediate' final result text

        Returns ``given_result`` (updated in place) when it already has text,
        ``new_result`` when it does not, and ``given_result`` unchanged when
        ``new_result`` has no text.
        """
        text = new_result.get("text")
        if not text:
            return given_result
        if "text" not in given_result:
            # Nothing accumulated yet: the new result becomes the result
            return new_result
        given_result["text"] += ", " + text
        if "confidence" in new_result:
            # sloppy confidence merge (take the worst)
            given_result["confidence"] = min(
                given_result.get("confidence", -1),
                new_result.get("confidence", -1))
        new_words = new_result.get("words")
        if new_words:
            # append words - also when the accumulated result has none yet,
            # so the word list stays in step with the appended text
            given_result["words"] = (given_result.get("words")
                                     or []) + new_words
        if "spk" in new_result:
            # take new speaker data - NOTE: not optimal
            given_result["spk"] = new_result.get(
                "spk", given_result.get("spk", []))
        return given_result