def speech_to_text(palabras=None, *args):
    """Listen on the microphone until one of *palabras* is recognized.

    Args:
        palabras: iterable of accepted words; the loop stops when the
            recognized text equals one of them. Defaults to no words.
        *args: unused, kept for backward compatibility.

    Returns:
        str: the recognized word, or "Palabra no encontrada" when the
        stream ends without a match.
    """
    import os
    import pyaudio
    import json
    # Fix the mutable-default-argument pitfall of the original signature.
    if palabras is None:
        palabras = []
    if not os.path.exists("model"):
        print(
            "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder."
        )
        exit(1)
    model = Model("model")
    rec = KaldiRecognizer(model, 16000)
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000,
                    input=True, frames_per_buffer=8000)
    stream.start_stream()
    sw = 0
    word = "Palabra no encontrada"
    cont = 1
    while sw == 0:
        cont = cont + 1
        print(cont)
        data = stream.read(10000, exception_on_overflow=False)
        if len(data) == 0:
            print("breaking process")
            break
        if rec.AcceptWaveform(data):
            # Capture Result() once: each call consumes/resets vosk's
            # buffer, so the original's repeated calls returned stale data.
            raw = rec.Result()
            parsed = json.loads(raw)
            print("result ->" + parsed['text'])
            print("result json ->" + raw)
            word = parsed['text']
        else:
            raw = rec.PartialResult()
            parsed = json.loads(raw)
            print("PartialResult ->" + parsed['partial'])
            print("PartialResult json ->" + raw)
            # Track the partial hypothesis; the original mistakenly queried
            # Result() here, which is empty until a phrase is finalized.
            word = parsed['partial']
        for init in palabras:
            if word == init:
                sw = 1
                print("palabra aceptada")
    stream.stop_stream()
    # stream.close()
    p.terminate()
    return word
def audio_to_txt(file_name):
    """Transcribe *file_name* by converting it to WAV via ffmpeg and
    running vosk over the result.

    Args:
        file_name: path to the input audio/video file.

    Returns:
        str: the recognized text, segments separated by single spaces.
    """
    import subprocess
    # argv-list form avoids shell injection and breakage on paths with
    # spaces (the original used os.system with an f-string).
    subprocess.run(['ffmpeg', '-i', file_name, 'out.wav'], check=True)
    model = Model("model")  # Large vocabulary free form recognition
    wf = wave.open('out.wav', "rb")
    if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
        print("Audio file must be WAV format mono PCM.")
        exit(1)
    # Build the recognizer once, at the file's real sample rate (the
    # original constructed a throwaway 16 kHz recognizer first).
    rec = KaldiRecognizer(model, wf.getframerate())
    transcript = ''
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            json_dict = json.loads(rec.Result())
            # Separate segments with a space so words don't run together.
            if transcript and json_dict['text']:
                transcript += ' '
            transcript += json_dict['text']
        else:
            rec.PartialResult()
    os.remove('out.wav')  # direct unlink instead of shelling out to `rm`
    return transcript
def recognition():
    """Transcribe the WAV file named on the command line against a small
    digit vocabulary, printing every intermediate and final result."""
    if not os.path.exists("model"):
        print(
            "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder."
        )
        exit(1)
    wav = wave.open(sys.argv[1], "rb")
    mono_pcm = (wav.getnchannels() == 1 and wav.getsampwidth() == 2
                and wav.getcomptype() == "NONE")
    if not mono_pcm:
        print("Audio file must be WAV format mono PCM.")
        exit(1)
    model = Model("model")
    # You can also specify the possible word or phrase list as JSON list,
    # the order doesn't have to be strict.
    recognizer = KaldiRecognizer(
        model, wav.getframerate(),
        '["oh one two three four five six seven eight nine zero", "[unk]"]')
    while True:
        chunk = wav.readframes(4000)
        if not chunk:
            break
        if recognizer.AcceptWaveform(chunk):
            print(recognizer.Result())
        else:
            print(recognizer.PartialResult())
    print(recognizer.FinalResult())
def translate_file(filename="last5.wav"):
    """Run vosk recognition over *filename* (mono PCM WAV in the current
    directory) and return the final transcript text."""
    SetLogLevel(0)
    if not os.path.exists("model"):
        print(
            "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder."
        )
        exit(1)
    wav = wave.open("./" + filename, "rb")
    if (wav.getnchannels(), wav.getsampwidth(), wav.getcomptype()) != (1, 2, "NONE"):
        print("Audio file must be WAV format mono PCM.")
        exit(1)
    recognizer = KaldiRecognizer(Model("./model"), wav.getframerate())
    while True:
        chunk = wav.readframes(4000)
        if not chunk:
            break
        if recognizer.AcceptWaveform(chunk):
            print(recognizer.Result())
        else:
            print(recognizer.PartialResult())
    # ["results"] would expose per-word confidence as well.
    return json.loads(recognizer.FinalResult())["text"]
class VoskVoiceToTextCalculator(Calculator):
    """Calculator node that feeds incoming AudioData into a vosk recognizer
    and emits VoiceTextData: finalized phrases on output 0, partial
    hypotheses on output 1."""

    def __init__(self, name, s, options=None):
        from vosk import Model, KaldiRecognizer
        super().__init__(name, s, options)
        self.model = Model("model")
        self.rec = KaldiRecognizer(self.model, 16000)
        self.output_data = [None, None]

    def process(self):
        audio = self.get(0)
        # Guard clause: anything that is not audio is ignored.
        if not isinstance(audio, AudioData):
            return False
        if self.rec.AcceptWaveform(audio.audio):
            raw = self.rec.Result()
            try:
                parsed = json.loads(raw)
            except json.decoder.JSONDecodeError as e:
                print("Voice2Text: Failed to parse voice json:", e)
                print(raw)
            else:
                if 'text' in parsed and parsed['text']:
                    text = parsed['text']
                    print("Voice2Text:", repr(text), parsed)
                    self.set_output(0, VoiceTextData(text, audio.timestamp, info=parsed))
        else:
            partial = json.loads(self.rec.PartialResult())
            if 'partial' in partial and partial['partial']:
                text = partial['partial']
                print("Voice2Text (partial): ", repr(text))
                self.set_output(1, VoiceTextData(text, audio.timestamp, info=partial))
        return True
def speech_to_text(args):
    """Transcribe every .wav file in args.data with a vosk model and write
    each final result to a .txt file under args.hypothesis.

    Args:
        args: namespace with .model (model dir name under 'models'),
            .data (directory containing .wav files) and .hypothesis
            (output directory for transcripts).
    """
    model_path = os.path.join('models', args.model)
    if not os.path.exists(model_path):
        print(
            "Please download the model from https://alphacephei.com/vosk/models and unpack to 'models' folder.")
        exit(1)
    # Load the model once (and from the same path the existence check
    # verified — the original inconsistently loaded Model(args.model)).
    model = Model(model_path)
    for filepath in glob.iglob(os.path.join(os.getcwd(), args.data, '*.wav')):
        print(filepath)
        # BUG FIX: the original opened args.data (the directory argument)
        # instead of the wav file produced by the glob.
        wf = wave.open(filepath, "rb")
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print("Audio file must be WAV format mono PCM.")
            exit(1)
        rec = KaldiRecognizer(model, wf.getframerate())
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                print(rec.Result())
            else:
                print(rec.PartialResult())
        # Capture FinalResult once; it flushes the recognizer and cannot be
        # meaningfully queried twice as the original did.
        final = rec.FinalResult()
        print(final)
        # Derive the output name from the wav basename; splitting the whole
        # path on '.' broke for paths containing dots.
        base = os.path.splitext(os.path.basename(filepath))[0]
        hypothesis_path = os.path.join(args.hypothesis, base + '.txt')
        with open(hypothesis_path, 'w') as hypothesis:
            hypothesis.write(final)
def listen(model: Model, spk_model: SpkModel = None, speech_chunk_sec: float = 0.5, buffer_sec: float = 1):
    """Continuously read microphone audio and log vosk recognition output
    (full results when available, partial hypotheses otherwise)."""
    with ExitStack() as stack:
        sample_rate = model.SampleFrequency()
        # Include the speaker model in the recognizer only when provided.
        recognizer = (KaldiRecognizer(model, spk_model, sample_rate)
                      if spk_model else KaldiRecognizer(model, sample_rate))
        audio = stack.enter_context(_pyaudio())
        stream = stack.enter_context(
            _pyaudio_open_stream(audio,
                                 format=paInt16,
                                 channels=1,
                                 rate=sample_rate,
                                 input=True,
                                 frames_per_buffer=int(sample_rate * buffer_sec)))
        while True:
            chunk = stream.read(int(sample_rate * speech_chunk_sec))
            if recognizer.AcceptWaveform(chunk):
                logging.info(json.loads(recognizer.Result()))
            else:
                logging.info(json.loads(recognizer.PartialResult()))
async def processVoice(waveChunk, recognizer: KaldiRecognizer):
    """ Recognize audio chunk and process with terminal.onText() """
    signature = None
    text = ''
    final = False
    try:
        final = recognizer.AcceptWaveform(waveChunk)
        if final:
            # The whole phrase has been recognized; fetch the final text.
            payload = json.loads(recognizer.FinalResult())
            text = str(payload['text']).strip() if 'text' in payload else ''
        else:
            # Only a partial hypothesis is available so far.
            payload = json.loads(recognizer.PartialResult())
            text = str(payload['partial']).strip() if 'partial' in payload else ''
        # Try to extract the speaker's voice signature, if present.
        signature = payload["spk"] if 'spk' in payload else []
    except KeyboardInterrupt as e:
        onCtrlC()
        raise e
    except Exception as e:
        logError(f'Exception processing phrase chunk : {e}')
    return (final, text, signature)
def transcribe(path: str) -> str:
    """Transcribe a mono PCM WAV file and return the recognized text.

    Args:
        path: path to the WAV file.

    Returns:
        The full transcript; empty string for silent/empty audio.

    Raises:
        ValueError: if the model cannot be downloaded automatically.
    """
    # check if the models is already present
    if not _download_model():
        raise ValueError("Unable to automatically download the model.")
        # (dead `exit(1)` after the raise was removed)
    wf = wave.open(path, "rb")
    if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
        logger.info("Audio file must be WAV format mono PCM.")
        exit(1)
    model = Model(MODEL_PATH)
    rec = KaldiRecognizer(model, wf.getframerate())
    # Accumulate every finalized segment. The original read the entire file
    # in one call, kept only the last (possibly partial) result, and hit an
    # unbound `text` NameError on empty audio.
    pieces = []
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            segment = json.loads(rec.Result())["text"]
            if segment:
                pieces.append(segment)
    # FinalResult flushes whatever audio is still buffered.
    tail = json.loads(rec.FinalResult())["text"]
    if tail:
        pieces.append(tail)
    return " ".join(pieces)
class WakeWordDetector:
    """Wake-word detector: a thin wrapper around
    `vosk-api <https://github.com/alphacep/vosk-api>`_.
    The default wake words are `'阿Q'` and `'R-Cute'`.

    To customize wake words, see
    https://github.com/alphacep/vosk-api/blob/master/python/example/test_words.py
    """

    def __init__(
            self,
            sr=16000,
            lang='en',
            grammar='[ "a b c d e f g h i j k l m n o p q r s t u v w x y z key cute", "[unk]" ]'
    ):
        self.load(lang)
        self._det = KaldiRecognizer(util.cache[f'vosk.{lang}'], sr, grammar)

    def _detected(self, text):
        # Map a recognized phrase to its canonical wake-word name
        # (returns None when the text is not a wake word).
        if text == 'r q':
            return '阿Q'
        elif text == 'r cute':
            return 'R-Cute'

    def load(self, lang='en'):
        """load language model in advance"""
        key = f'vosk.{lang}'
        # BUG FIX: `util.cache.get(key, Model(...))` evaluated its default
        # eagerly, re-loading the model from disk on every call and
        # defeating the cache; only construct on a cache miss.
        if key not in util.cache:
            util.cache[key] = Model(util.data_file(f'vosk/{lang}'))

    def detect(self, source, timeout=None):
        """Start detection.

        :param source: audio source
        :param timeout: maximum detection time in seconds; `None` (default)
            means no timeout — block until a wake word is heard
        :type timeout: float, optional
        :return: the wake word that was detected, or `None` on timeout
        :rtype: str
        """
        self._cancel = False  # possible race condition?
        if timeout:
            count = 0.0
        self._det.FinalResult()  # clear buffer
        while True:
            segment = source.read()
            if self._det.AcceptWaveform(segment.raw_data):
                p = self._detected(json.loads(self._det.Result())['text'])
            else:
                p = self._detected(
                    json.loads(self._det.PartialResult())['partial'])
            if p:
                return p
            if self._cancel:
                return  # raise RuntimeError('Hotword detection cancelled by another thread')
            elif timeout:
                count += segment.duration_seconds
                if count > timeout:
                    return  # self._detected(self._det.FinalResult()['text'])

    def cancel(self):
        """Stop detection."""
        self._cancel = True
def mic_listen(self):
    """Listen on the microphone until a phrase starting with this bot's
    name and containing a known keyword is heard; return that phrase.

    Reads self.sr_model (vosk model), self.name (wake word) and
    self.keywords (accepted commands); publishes status/eye updates via
    self.publish(). Re-opens the stream after any audio error.
    """
    text = ""
    open_stream = True
    p = pyaudio.PyAudio()
    while True:
        if open_stream:
            # (Re)open the 16 kHz mono microphone stream.
            self.publish("gui/boomer/task", "opening stream...")
            stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000,
                            input=True, frames_per_buffer=8000)
            stream.start_stream()
            open_stream = False
        elif len(text) != 0:
            # A keyword phrase was captured on the previous pass: stop and return.
            try:
                stream.stop_stream()
            except Exception:
                pass
            return text
        # Fresh recognizer per listening round so partials don't accumulate.
        rec = KaldiRecognizer(self.sr_model, 16000)
        start_time = time.time()
        self.publish("gui/boomer/eye", "0")
        while True:
            try:
                data = stream.read(4000)
                if len(data) == 0:
                    break
                elif time.time() - start_time > 7:
                    # Give up on this round after ~7 seconds.
                    break
                else:
                    rec.AcceptWaveform(data)
                    output = json.loads(rec.PartialResult())
                    output = output["partial"]
                    self.debug_msg("Kaldi_r", "Output: " + output)
                    if self.name in output:
                        # Wake word heard: light the "eye" indicator.
                        self.publish("gui/boomer/eye", "1")
                    if self.name not in output and len(output) > 7:
                        # Long utterance without the wake word: restart round.
                        break
                    elif self.name in output and ' '.join(
                            str(output).split()[1:len(output) - 1]) in self.keywords:
                        # NOTE(review): the slice bound uses len(output)
                        # (characters), not the word count — confirm intent.
                        self.debug_msg("Kaldi_R", "got keyword")
                        text = output
                        break
            except Exception as e:
                # Audio failure: report it, tear the stream down and reopen.
                self.publish("gui/boomer/task", str(e))
                try:
                    stream.stop_stream()
                except Exception:
                    pass
                open_stream = True
                break
def StreamingRecognize(self, request_iterator, context):
    """gRPC streaming recognition: the first message carries the config,
    subsequent messages carry audio; results are yielded as they appear."""
    first = next(request_iterator)
    want_partials = first.config.specification.partial_results
    recognizer = KaldiRecognizer(self.model,
                                 first.config.specification.sample_rate_hertz)
    for message in request_iterator:
        if recognizer.AcceptWaveform(message.audio_content):
            yield self.get_response(recognizer.Result())
        elif want_partials:
            yield self.get_response(recognizer.PartialResult())
    yield self.get_response(recognizer.FinalResult())
class Recognizer:
    """Microphone speech recognizer built on vosk and pyaudio."""

    def __init__(self, pathToModel):
        self.answer = "None"        # last recognized phrase
        self.modelFlag = False      # model loaded?
        self.pyAudioFlag = False    # microphone stream running?
        self.pathToModel = pathToModel

    def setupModel(self):
        """Load the vosk model, then open the microphone stream."""
        if not os.path.exists(self.pathToModel):
            print(
                "Please download the model from https://github.com/alphacep/vosk-api/blob/master/doc/models.md and unpack as 'model' in the current folder."
            )
            exit(1)
        self.model = Model(self.pathToModel)
        self.rec = KaldiRecognizer(self.model, 16000)
        self.modelFlag = True
        self.startPyaudio()

    def startPyaudio(self):
        """Open and start a 16 kHz mono microphone stream."""
        self.p = pyaudio.PyAudio()
        self.stream = self.p.open(format=pyaudio.paInt16,
                                  channels=1,
                                  rate=16000,
                                  input=True,
                                  frames_per_buffer=8000)
        self.stream.start_stream()
        self.pyAudioFlag = True

    def stopPyaudio(self):
        """Pause the microphone stream."""
        self.stream.stop_stream()
        self.pyAudioFlag = False

    def runTimedRecognition(self):
        """Listen for up to 1000 chunks; return the first complete phrase."""
        remaining = 1000
        while True:
            chunk = self.stream.read(4000)
            if len(chunk) == 0:
                break
            if self.rec.AcceptWaveform(chunk):
                result = self.rec.Result()
                print(result)
                parsed = json.loads(str(result))
                phrase = parsed["text"]
                print(phrase)
                self.answer = phrase
                return phrase
            print(self.rec.PartialResult())
            if remaining == 0:
                break
            print(remaining)
            remaining -= 1
def creat_text_gpu(path):
    """Recognize the '_mono' companion of *path* with the module-level model
    and persist the parsed final transcript via write_file()."""
    wav = wave.open(path.replace('.wav', '_mono.wav'), "rb")
    recognizer = KaldiRecognizer(model, wav.getframerate())
    while True:
        frames = wav.readframes(4000)
        if not frames:
            break
        if not recognizer.AcceptWaveform(frames):
            recognizer.PartialResult()
    out_name = path.split('/')[-1].replace('.wav', '')
    write_file(parse_json(recognizer.FinalResult()), out_name)
def StreamingRecognize(self, request_iterator, context):
    """gRPC streaming recognition honoring the client's alternatives and
    word-time-offset options from the leading config message."""
    first = next(request_iterator)
    spec = first.config.specification
    want_partials = spec.partial_results
    recognizer = KaldiRecognizer(self.model, spec.sample_rate_hertz)
    recognizer.SetMaxAlternatives(spec.max_alternatives)
    recognizer.SetWords(spec.enable_word_time_offsets)
    for message in request_iterator:
        if recognizer.AcceptWaveform(message.audio_content):
            yield self.get_response(recognizer.Result())
        elif want_partials:
            yield self.get_response(recognizer.PartialResult())
    yield self.get_response(recognizer.FinalResult())
def offline_record_recognize_audio(model_path=r"D:\pythonProject1\models", rate=8000):
    """Record from the microphone and print vosk recognition results.

    Args:
        model_path: path to the vosk model directory (generalized from the
            original hard-coded Windows path; default keeps old behavior).
        rate: sample rate in Hz for both the recognizer and the stream.
    """
    model = Model(model_path)
    rec = KaldiRecognizer(model, rate)
    p = pyaudio.PyAudio()
    stream = p.open(
        format=pyaudio.paInt16,
        channels=1,
        rate=rate,
        input=True,
        frames_per_buffer=8000
    )
    stream.start_stream()
    while True:
        data = stream.read(4000)
        if len(data) == 0:
            break
        # Full result at phrase boundaries, partial hypothesis otherwise.
        print(rec.Result() if rec.AcceptWaveform(data) else rec.PartialResult())
    print(rec.FinalResult())
def Speech2Text():
    """Recognize "recording.wav" against a small fixed vocabulary and
    return the final transcript text."""
    wav = wave.open("recording.wav", "rb")
    mono_pcm = (wav.getnchannels() == 1 and wav.getsampwidth() == 2
                and wav.getcomptype() == "NONE")
    if not mono_pcm:
        print("Audio file must be WAV format mono PCM.")
        exit(1)
    model = Model("model1")
    # You can also specify the possible word list
    rec = KaldiRecognizer(model, wav.getframerate(), "money purse police shoot")
    while True:
        frames = wav.readframes(4000)
        if not frames:
            break
        if rec.AcceptWaveform(frames):
            print(rec.Result())
        else:
            print(rec.PartialResult())
    res = json.loads(rec.FinalResult())
    print("Speech2Text: " + res['text'])
    return res['text']
def vosk_model(address):
    """Print vosk recognition output for the mono PCM WAV at *address*."""
    SetLogLevel(2)
    wav = wave.open(address, "rb")
    if not (wav.getnchannels() == 1 and wav.getsampwidth() == 2
            and wav.getcomptype() == "NONE"):
        print("Audio file must be WAV format mono PCM.")
        exit(1)
    recognizer = KaldiRecognizer(Model("../audio_utils/tests/vosk_test/model"),
                                 wav.getframerate())
    while True:
        frames = wav.readframes(4000)
        if not frames:
            break
        if recognizer.AcceptWaveform(frames):
            print(recognizer.Result())
        else:
            print(recognizer.PartialResult())
    print(recognizer.FinalResult())
def recognize_file(filepath):
    """Transcribe *filepath* with the module-level MODEL and return the
    accumulated text (finalized segments plus the flushed final result)."""
    wav = wave.open(filepath, "rb")
    print("press_f")
    print(wav.getnchannels())
    print(wav.getsampwidth())
    print(wav.getcomptype())
    if not (wav.getnchannels() == 1 and wav.getsampwidth() == 2
            and wav.getcomptype() == "NONE"):
        # Warn only; recognition proceeds regardless of the format check.
        print("Audio file must be WAV format mono PCM.", file=sys.stderr)
    rec = KaldiRecognizer(MODEL, wav.getframerate())
    pieces = []
    while True:
        frames = wav.readframes(4000)
        if not frames:
            break
        if rec.AcceptWaveform(frames):
            pieces.append(json.loads(rec.Result())["text"] + " ")
        else:
            rec.PartialResult()
    pieces.append(json.loads(rec.FinalResult())["text"])
    return "".join(pieces)
from vosk import Model, KaldiRecognizer, SetLogLevel
import sys
import os
import wave
import json

# Transcribe the mono PCM WAV named on the command line, printing every
# partial, full, and final result as parsed JSON (with up to 10
# alternatives and per-word timings).
SetLogLevel(0)

wf = wave.open(sys.argv[1], "rb")
if not (wf.getnchannels() == 1 and wf.getsampwidth() == 2
        and wf.getcomptype() == "NONE"):
    print("Audio file must be WAV format mono PCM.")
    exit(1)

model = Model(lang="en-us")
rec = KaldiRecognizer(model, wf.getframerate())
rec.SetMaxAlternatives(10)
rec.SetWords(True)

while True:
    chunk = wf.readframes(4000)
    if not chunk:
        break
    if rec.AcceptWaveform(chunk):
        print(json.loads(rec.Result()))
    else:
        print(json.loads(rec.PartialResult()))
print(json.loads(rec.FinalResult()))
def trigger_microphone(n_clicks):
    """Record ~5 s from the Termux microphone, convert to WAV, and return
    the vosk speech-to-text transcription.

    Args:
        n_clicks: UI click counter; 0 means "not clicked yet".

    Returns:
        str: recognized text ('' when n_clicks == 0).
    """
    if n_clicks == 0:
        return ''
    print('trigger microphone %d' % n_clicks)
    import termux
    import time
    import subprocess
    import json
    termux.Microphone.stop()
    pwd = os.environ['PWD']
    aac_file = "%s/microphone.aac" % pwd
    wave_file = "%s/microphone.wave" % pwd
    if os.path.exists(aac_file):
        os.remove(aac_file)
    termux.Microphone.record(aac_file, encoder='aac', limit=5, count=2)
    time.sleep(6)  # wait for the 5 s recording (plus margin) to finish
    # argv-list subprocess calls avoid the shell injection / word-splitting
    # the original os.system('%s' % ...) commands were exposed to ($PWD may
    # contain spaces). The dead `if False:` pocketsphinx branch was removed.
    subprocess.run(['faad', '-o', wave_file, aac_file])
    from vosk import Model, KaldiRecognizer, SetLogLevel
    import wave
    import numpy as np
    model_name = 'vosk-model-small-en-us-0.15'
    if not os.path.exists(model_name):
        subprocess.run(['wget', 'http://alphacephei.com/vosk/models/%s.zip' % model_name])
        subprocess.run(['unzip', '%s.zip' % model_name])
    wf = wave.open(wave_file, "rb")
    model = Model(model_name)
    rec = KaldiRecognizer(model, wf.getframerate())
    nch = wf.getnchannels()
    depth = wf.getsampwidth()
    typ = {1: np.uint8, 2: np.uint16, 4: np.uint32}.get(depth)
    sdata = wf.readframes(64000)
    data = np.frombuffer(sdata, dtype=typ)
    ch_data = data[0::nch]  # keep only the first channel
    sdata = ch_data.tobytes()
    if True:
        # Debug artifact: dump the extracted mono channel for inspection.
        outwav = wave.open('good.wave', 'w')
        outwav.setparams(wf.getparams())
        outwav.setnchannels(1)
        outwav.writeframes(ch_data.tobytes())
        outwav.close()
    if rec.AcceptWaveform(sdata):
        result = json.loads(rec.Result())
        text = result['text']
    else:
        result = json.loads(rec.PartialResult())
        text = result['partial']
    # FinalResult flushes the recognizer's remaining buffered audio.
    result = json.loads(rec.FinalResult())
    text += result['text']
    print('finish microphone')
    print('text:%s' % text)
    return text
# Stream microphone audio into the recognizer and run the shell command
# mapped to each recognized phrase.
# NOTE(review): relies on `rec` (a vosk recognizer) and `dict` (a
# phrase -> command table that shadows the builtin) defined earlier in
# the file — confirm against the surrounding code.
p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True,
                frames_per_buffer=8000)
stream.start_stream()
while True:
    data = stream.read(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        text = json.loads(rec.Result())['text']
        print(text)
        # Look up the shell command for the recognized phrase (None if absent).
        tmp = dict.get(text)
        try:
            if tmp:
                os.system(tmp)
        except BaseException:
            print("хуй тебе")
            #os.system(dict[tmp])
    else:
        #print(rec.PartialResult().split('"partial" : "'))
        print(rec.PartialResult())
print(rec.FinalResult())
recognizedResults = voskSpeechRecognitionEngine.Result() print(recognizedResults) # Prepare recognized text to send parsedRecognizedResults = recognizedResults.split( '"text" : "')[1].split('"')[0] # Send results with yarp port outputBottle.clear() outputBottle.addString("Recognized: " + str(parsedRecognizedResults)) voskSpeechRecognition_outputPort.write(outputBottle) # If detect and recognize parcial results else: # Print partial results recognizedPartialResults = voskSpeechRecognitionEngine.PartialResult() print(recognizedPartialResults) # Close YARP ports print("[INFO] Closing YARP ports ...") voskSpeechRecognition_inputPort.close() voskSpeechRecognition_outputPort.close() print("") print("") print( "**************************************************************************" ) print("Program finished") print( "**************************************************************************"
class Decoder:
    """Receives audio from a front-end over TCP, buffers it into wave
    files, and decodes it with a vosk recognizer at 8 kHz."""

    def __init__(self, info):
        # info["front"] carries the (ip, port) of the front-end peer.
        model = Model(os.getcwd() + "/modules/model")
        self.rec = KaldiRecognizer(model, 8000)
        self.ip, self.port = info["front"]

    def decode_file(self, aud_file):
        """Run recognition over *aud_file*; return the lowercased transcript
        when average word confidence exceeds 0.8, else '' (or the raw
        FinalResult text when only the flushed tail produced output)."""
        SetLogLevel(0)
        sentence = ""
        results = ""
        confidence = 0
        tot = 0
        wf = wave.open(aud_file, "rb")
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype(
        ) != "NONE":  #checking certain file characteristics
            print("Audio aud_file must be WAV format mono PCM.")
            exit(1)
        while True:  #loop for doing voice recognition
            data = wf.readframes(4000)
            if len(data) == 0:  #done reading audio file
                break
            if self.rec.AcceptWaveform(
                    data):  #finished recognition on segment of audio file
                items = self.rec.Result()
                results = json.loads(items)
                if len(results.items(
                )) > 1:  #false recognition, sometimes nothing is detected
                    # Accumulate per-word confidences for the average below.
                    for i in results["result"]:
                        confidence += i["conf"]
                        tot += 1
                    sentence = sentence + " " + results["text"]
            else:
                print(self.rec.PartialResult())
        f_res = json.loads(self.rec.FinalResult())
        if len(f_res.items()) > 1:
            # NOTE(review): returning here skips wf.close() and the
            # confidence check below — confirm this is intended.
            return f_res["text"]
        wf.close()
        if tot > 0 and confidence / tot > .8:  #checking confidence of recognition
            return sentence.lower().strip()
        elif tot > 0:
            print("confidence too low: " + str(confidence / tot))
            return ""

    def listen_stream(self):
        """Handshake with the front-end, stream audio chunks into temp wave
        files until silence or disconnect, then decode and return the text."""
        HOST = self.ip
        PORT = self.port
        CHUNK = 32768
        TIMEOUT = 10
        while True:
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                totData = 0
                connDied = False
                ret = self.try_connection(HOST, PORT, s, "send CNRDY")
                if ret == False:
                    s.close()
                    continue
                print("connected")
                s.sendall(b"CNRDY\0")  #sending connection ready
                data = b""
                s.settimeout(2)
                # Drain stray bytes until the front-end's YEETO marker arrives.
                while b"YEETO" not in data:  #getting rid of bad data
                    try:
                        data = s.recv(CHUNK)
                        print("bad data : {}".format(len(data)))
                        if len(data) == 0:
                            print("conn died during handshake")
                            time.sleep(2)
                            connDied = True
                            break
                    except:
                        print("timed out from connection and didn't get YEETO")
                        connDied = True
                        break
                if connDied:
                    continue
                s.settimeout(None)
                s.sendall(b"FLUSH\0")  #letting front know bad data has been flushed
                FTOT, FTEMP = self.init_temp_tot_wave()  #init FTOT and FTEMP files
                while True:
                    temp = self.open_temp_wave(FTEMP)  #get temorary wave file
                    try:
                        data = s.recv(CHUNK)
                    except:
                        print("connection with {} {} died".format(HOST, PORT))
                        connDied = True
                        break
                    size = len(data)
                    totData += size
                    if data == None or size == 0:  #check for when we receive packets of zero size
                        print("connection from front-end closed")
                        print(f"FRONT CLOSE tot data received : {totData}")
                        break
                    print(f"got data: {len(data)}")
                    temp.writeframesraw(data)
                    temp.close()
                    self.combine_files([FTOT, FTEMP])  #combining wave file data
                    if (self.detect_silence(FTOT)):  #2 seconds of silence detected
                        break
                if connDied:
                    break
                try:
                    s.close()
                    print(f"BACK CLOSE tot data received : {totData}")
                    if totData != 0:  #we got zero data from the connection
                        self.send_gdata()
                        break
                except BrokenPipeError:
                    print(f"connection died with {HOST} port {PORT}")
        results = self.decode_file(FTOT)  #get results from file
        print("FINAL RESULT from stream: " + results)
        return results

    def clear_socket(self):  #prototype for clearing socket data
        # NOTE(review): prototype only — TIMEOUT is undefined here and
        # `size` is never updated, so the loop would not terminate.
        HOST = self.ip
        PORT = self.port
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            self.try_connection(HOST, PORT, sock, "CLEAR SOCKET")
            sock.settimeout(TIMEOUT)  # 10 second timeout
            size = 1
            while size > 0:
                sock.recv(1024)  #just receive data and throw it away
            sock.close()

    def send_cnerr(self):
        """Notify the front-end of a connection error (CNERR)."""
        HOST = self.ip
        PORT = self.port
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            print("sending connection error")
            self.try_connection(HOST, PORT, sock, "SEND CNERR")
            sock.sendall(b"CNERR\0")
            sock.close()

    def send_gdata(self):
        """Notify the front-end that good data was received (GDATA)."""
        HOST = self.ip
        PORT = self.port
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            print("sending good data")
            self.try_connection(HOST, PORT, sock, "SEND GDATA")
            sock.sendall(b"GDATA\0")
            sock.close()

    def init_temp_tot_wave(self):
        """Create empty total/temp wave files (mono, 16-bit, 8 kHz) and
        return their paths."""
        FTOT = "./temp/recv.wav"
        FTEMP = "./temp/temp.wav"
        tot = wave.open(FTOT, 'wb')
        tot.setnchannels(1)  #mono
        tot.setsampwidth(2)
        tot.setframerate(8000)
        tot.close()
        temp = wave.open(FTEMP, 'wb')
        temp.setnchannels(1)  #mono
        temp.setsampwidth(2)
        temp.setframerate(8000)
        temp.close()
        return FTOT, FTEMP

    def open_temp_wave(self, FTEMP):
        """Open FTEMP for writing with the standard mono/16-bit/8 kHz params."""
        temp = wave.open(FTEMP, 'wb')
        temp.setnchannels(1)  #mono
        temp.setsampwidth(2)
        temp.setframerate(8000)
        return temp

    def try_connection(self, HOST, PORT, s, funcName):
        """Attempt to connect socket *s*; return True on success, False
        after printing the failure reason and sleeping 5 s."""
        print("trying to connect " + HOST + " " + str(PORT))
        print(f"{funcName} connecting to front-end")
        time.sleep(2)
        s.settimeout(5)
        try:
            s.connect((HOST, PORT))
            s.settimeout(None)
            return True
        except ConnectionRefusedError:
            print("connection to {} on port {} refused.".format(HOST, PORT))
            print("will try again in 5 seconds\n")
            time.sleep(5)
            return False
        except OSError:
            print("couldn't find {} on port {}".format(HOST, PORT))
            print("wil try again in 5 seconds")
            time.sleep(5)
            return False
        except TimeoutError:
            print("connection timed out for {} port {}".format(HOST, PORT))
            print("will try again in 5 seconds\n")
            time.sleep(5)
            return False

    def send_mstop(self):
        """Send MSTOP to the front-end, retrying the connect until it works."""
        HOST = self.ip
        PORT = self.port
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            print("sending MSTOP")
            while True:
                try:
                    sock.connect((HOST, PORT))
                    break
                except ConnectionRefusedError:
                    print("connection to {} on port {} refused.".format(
                        HOST, PORT))
                    print("will try again in 5 seconds\n")
                    time.sleep(5)
                except OSError:
                    print("couldn't find {} on port {}".format(HOST, PORT))
                    print("wil try again in 5 seconds")
                    time.sleep(5)
            sock.sendall(b"MSTOP\0")
            sock.close()

    def combine_files(self, files):
        """Append the frames of files[1] onto files[0] (both 8 kHz mono)."""
        data = []
        for infile in files:
            w = wave.open(infile, "rb")
            data.append([w.readframes(w.getnframes())])
            w.close()
        output = wave.open(files[0], "wb")
        output.setnchannels(1)  #mono
        output.setsampwidth(2)
        output.setframerate(8000)
        output.writeframes(data[0][0])
        output.writeframes(data[1][0])
        output.close()

    def detect_silence(self, fileName):
        """Return True when *fileName* contains a silent stretch longer
        than 3 seconds (per pydub's silence detection at the file's dBFS)."""
        myaudio = intro = AudioSegment.from_wav(fileName)
        dBFS = myaudio.dBFS
        print(dBFS)
        pieces = silence.detect_silence(myaudio, 1000, dBFS - 0)
        pieces = [((start / 1000), (stop / 1000))
                  for start, stop in pieces]  #convert to sec
        for i in pieces:
            if i[1] - i[0] > 3:
                print("big silence: " + str(i[0]) + " " + str(i[1]))
                return True
        return False
class Tester:
    """Run vosk recognition either over a file (decoded with ffmpeg) or
    over live microphone input when no file path is given."""

    def __init__(
        self, filepath: Optional[str], model_path: str, sample_rate: int, use_gpu: bool = False
    ):
        if use_gpu:
            # Gpu part, uncomment if vosk-api has gpu support
            from vosk import GpuInit, GpuInstantiate
            GpuInit()
            GpuInstantiate()
        self.sample_rate = sample_rate
        self.model = Model(model_path)
        self.rec = KaldiRecognizer(self.model, sample_rate)
        self.filepath = filepath

    def _read(self, out):
        """Drain *out* in 8000-byte chunks, printing recognition output."""
        while True:
            chunk = out.read(8000)
            if not chunk:
                break
            print(self.rec.Result() if self.rec.AcceptWaveform(chunk)
                  else self.rec.PartialResult())
        print(self.rec.FinalResult())

    def _test_microphone(self):
        """Feed live microphone audio into the recognizer."""
        stream = PyAudio().open(
            format=paInt16,
            channels=1,
            rate=self.sample_rate,
            input=True,
            frames_per_buffer=8000
        )
        stream.start_stream()
        self._read(stream)

    def _test_file(self, filepath):
        """Decode *filepath* to 16-bit mono PCM with ffmpeg and recognize it."""
        proc = subprocess.Popen(
            ['ffmpeg', '-loglevel', 'quiet', '-i', filepath,
             '-ar', str(self.sample_rate), '-ac', '1', '-f', 's16le', '-'],
            stdout=subprocess.PIPE)
        self._read(proc.stdout)

    def test(self):
        """Dispatch to file or microphone mode based on self.filepath."""
        if self.filepath is None:
            self._test_microphone()
        else:
            self._test_file(self.filepath)
# Benchmark one audio file: count recognized words and log timing.
# NOTE(review): fragment — `wf`, `rec`, `recognition_report`, `audio_report`,
# `start_time`, `total_time`, `total_recognized_words`, `number_audio` and
# `REPORT_PATH` are defined earlier in the file; confirm against the
# surrounding code.
recognized_words = 0
recognition_report.length = 0
audio_report.write("\nTranscription:\n")
while True:
    data = wf.readframes(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        json_dict = json.loads(rec.Result())
        transcript = json_dict['text']
        # recognition_report() presumably records/count words for this
        # segment — TODO confirm its contract.
        count_word = recognition_report()
        recognized_words += count_word
    else:
        rec.PartialResult()
total_recognized_words += recognized_words
current_audio_time = time.time() - start_time
total_time += current_audio_time
audio_report.write("\n\nThe processing time of the audio file: ")
audio_report.write("{}\n".format(
    time.strftime("%M:%S", time.gmtime(current_audio_time))))
audio_report.write("Number of recognized words: {}".format(recognized_words))
os.remove('audio.wav')
average_audio = total_time // number_audio
# Creating the final report
main_report = open(REPORT_PATH + '.txt', 'w')
from vosk import Model, KaldiRecognizer
import sys
import json
import os

# Recognize a 16 kHz PCM file given on the command line: the 44-byte WAV
# header is skipped and raw samples are streamed to the recognizer.
if not os.path.exists("model"):
    print(
        "Please download the model from https://github.com/alphacep/kaldi-android-demo/releases and unpack as 'model' in the current folder."
    )
    exit(1)

model = Model("model")
rec = KaldiRecognizer(model, 16000)

wf = open(sys.argv[1], "rb")
wf.read(44)  # skip header

while True:
    chunk = wf.read(2000)
    if not chunk:
        break
    if rec.AcceptWaveform(chunk):
        print(json.loads(rec.Result()))
    else:
        print(json.loads(rec.PartialResult()))
print(json.loads(rec.FinalResult()))
# Recognize colors in the WAV given as sys.argv[2], logging full results to
# full_result.json and partials to partial_result.json.
# NOTE(review): fragment — `model_path` and `check_color` are defined
# earlier in the file.
print("model at " + model_path + " found succesfully")
wf = wave.open(sys.argv[2], "rb")
if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype(
) != "NONE":
    print("Audio file must be WAV format mono PCM.")
    exit(1)
model = Model(model_path)
# Constrain recognition to the color vocabulary plus an unknown-word token.
rec = KaldiRecognizer(model, wf.getframerate(),
                      '["red", "green", "blue", "yellow", "white", "[unk]"]')
with open('full_result.json', 'w') as full_result_file, open(
        'partial_result.json', 'w') as partial_result_file:
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            result = rec.Result()
            full_result_file.write(result)
            check_color(json.loads(result))
        else:
            partial_result_file.write(rec.PartialResult())
    # Flush the recognizer's remaining buffered audio and check it too.
    result = rec.FinalResult()
    full_result_file.write(result)
    check_color(json.loads(result))
def gen_subparts(input_file, model_dir, verbose=False, partlen=4, progress=False):
    """Yield SubPart segments (start, end, text) for *input_file*.

    Decodes the media file to 16 kHz mono PCM with ffmpeg, recognizes it
    with vosk, and groups recognized words into subtitle-sized parts of
    roughly *partlen* seconds. A trailing SubPart covers any remaining
    partial hypothesis up to the file's full duration (from ffprobe).

    Args:
        input_file: path of the media file to transcribe.
        model_dir: vosk model directory.
        verbose: enable vosk logging.
        partlen: target segment length in seconds.
        progress: show a tqdm progress bar in seconds.
    """
    SetLogLevel(0 if verbose else -1)
    model = Model(model_dir)
    rec = KaldiRecognizer(model, 16000)
    # ffmpeg streams raw 16-bit little-endian mono samples to stdout.
    process = subprocess.Popen([
        'ffmpeg', '-loglevel', 'quiet', '-i', input_file, '-ar',
        str(16000), '-ac', '1', '-f', 's16le', '-'
    ], stdout=subprocess.PIPE)
    # ffprobe reports the total duration (used for the final segment and bar).
    r = subprocess.run(
        "ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1"
        .split() + [input_file],
        stdout=subprocess.PIPE)
    duration = float(r.stdout.decode('utf-8').strip())
    if progress:
        pbar = tqdm(total=duration, unit="s")
    prev_end = 0
    while True:
        data = process.stdout.read(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            r = json.loads(rec.Result())
            if 'result' in r:
                resultpart = []
                # TODO: use this across AccesptForm
                for result in r['result']:
                    # Flush the accumulated words once they span >= partlen seconds.
                    if len(resultpart) > 0 and float(result['end']) - float(
                            resultpart[0]['start']) >= partlen:
                        yield SubPart(start=resultpart[0]['start'],
                                      end=float(resultpart[-1]['end']),
                                      text=" ".join(r['word'] for r in resultpart))
                        prev_end = float(resultpart[-1]['end'])
                        resultpart = []
                    # A single word longer than partlen becomes its own part.
                    if float(result['end'] - result['start']) >= partlen:
                        yield SubPart(start=float(result['start']),
                                      end=float(result['end']),
                                      text=result['word'])
                        prev_end = float(result['end'])
                        resultpart = []
                    else:
                        resultpart.append(result)
                    if progress:
                        pbar.update(float(result['end'] - pbar.n))
                # Emit whatever words remain from this recognition result.
                if len(resultpart) > 0:
                    yield SubPart(start=float(resultpart[0]['start']),
                                  end=float(resultpart[-1]['end']),
                                  text=" ".join(r['word'] for r in resultpart))
                    prev_end = float(resultpart[-1]['end'])
                    resultpart = []
        else:
            pass
            #print(rec.PartialResult())
            #pprint(rec.PartialResult())
    if progress:
        pbar.close()
    # Cover the tail of the audio with the last partial hypothesis.
    r = json.loads(rec.PartialResult())
    text = r['partial']
    yield SubPart(start=prev_end, end=duration, text=text)
class VoskInput(BaseInput):
    """
    Uses the `vosk` package to do speech recognition.
    """

    def __init__(self):
        super(VoskInput, self).__init__()
        self.current_utterance = ""
        self.realtime = True  # indicates that audio can be streamed in
        model_name = crystal.core.get_config(
            'vosk_model') or 'vosk-model-small-en-us-0.3'
        log.info(f"Using vosk model: {model_name}")
        self.model = Model(f"models/{model_name}")
        self.rec = None
        self.__final_result = None

    def process_audio(self, raw_audio: bytes, sample_rate: int, sample_width: int):
        """Feed one audio chunk to the recognizer; return the utterance so far."""
        if not self.rec:
            # Lazily build the recognizer once the sample rate is known.
            self.rec = KaldiRecognizer(self.model, sample_rate)
        full = self.rec.AcceptWaveform(raw_audio)
        if full:
            result = self.rec.Result()
        else:
            result = self.rec.PartialResult()
        log.debug(result)
        result = json.loads(result)
        if "result" in result:
            self.__final_result = result
        # BUG FIX: `text` was unbound (NameError) when the JSON carried
        # neither "text" nor "partial"; default to the empty string.
        text = ""
        if "text" in result:
            text = result["text"]
        elif "partial" in result:
            text = result["partial"]
        if text:
            self.current_utterance = text
        return self.current_utterance

    def get_full_result(self):
        """Return the auto-corrected final transcript and reset state."""
        if self.__final_result:
            result = self.__final_result
        else:
            result = self.rec.FinalResult()
            result = json.loads(result)
        log.debug(result)
        self.rec = None
        self.current_utterance = ""
        self.__final_result = None
        full_text = result["text"]
        # HACK: auto correct text to match domain vocabulary. Sorry.
        full_text = full_text.replace("palace music", "pause music")
        full_text = full_text.replace("applause music", "pause music")
        if any(x in full_text for x in ["turn on", "turn off", "turned on", "turned off"]):
            full_text = full_text.replace("the land", "the lamp").replace(
                "the lamb", "the lamp")
            if full_text.endswith("the lam"):
                full_text = full_text.replace("the lam", "the lamp")
        if any(x in full_text for x in ["timer", "alarm"]):
            full_text = full_text.replace("crystal said", "crystal set")
            if full_text.endswith("to pm"):
                full_text = full_text.replace("to pm", "2 pm")
            elif full_text.endswith(" a m"):
                full_text = full_text.replace(" a m", " am")
        if full_text.startswith("christo"):
            full_text = full_text.replace("christo", "crystal")
        elif full_text.startswith("crews to"):
            full_text = full_text.replace("crews to", "crystal")
        elif full_text.startswith("christian"):
            full_text = full_text.replace("christian", "crystal")
        return full_text