class Recognizer:
    def __init__(self, model: Model):
        self._rec = KaldiRecognizer(model, 16000)
        self._type_mapping = {"wav": "wav", "mpeg": "mp3"}

    def __del__(self):
        self._rec = None

    def recognize(self, contents: bytes) -> dict:
        self._rec.AcceptWaveform(contents)
        return json.loads(self._rec.Result())

    def format_normalize(self, file: File, type: str = "wav") -> bytes:
        # Re-encode the upload as 16 kHz audio so it matches the recognizer's rate.
        audio = AudioSegment.from_file(file, self._type_mapping[type])
        audio = audio.set_frame_rate(16000)
        buf = io.BytesIO()
        audio.export(buf, format="wav")
        return buf.getvalue()
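# A minimal, hypothetical usage sketch for the Recognizer class above; the
# model path and input filename are assumptions, not part of the original code.
import io
import json

from pydub import AudioSegment
from vosk import KaldiRecognizer, Model

if __name__ == "__main__":
    model = Model("models/vosk-model-small-en-us-0.15")  # assumed model path
    recognizer = Recognizer(model)
    # AudioSegment.from_file() also accepts an open binary file object, so a
    # plain file handle stands in for the framework's uploaded File here.
    with open("sample.wav", "rb") as f:
        wav_bytes = recognizer.format_normalize(f, type="wav")
    print(recognizer.recognize(wav_bytes).get("text", ""))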
def use_offline_recognition():
    recognized_data = ""
    if not os.path.exists("models/vosk-model-small-ru-0.4"):
        print("Please download the model from:\n"
              "https://alphacephei.com/vosk/models and unpack it as "
              "'models/vosk-model-small-ru-0.4' in the current folder.")
        exit(1)
    wave_audio_file = wave.open("microphone-results.wav", "rb")
    model = Model("models/vosk-model-small-ru-0.4")
    offline_recognizer = KaldiRecognizer(model, wave_audio_file.getframerate())
    data = wave_audio_file.readframes(wave_audio_file.getnframes())
    if len(data) > 0:
        if offline_recognizer.AcceptWaveform(data):
            recognized_data = offline_recognizer.Result()
            recognized_data = json.loads(recognized_data)
            recognized_data = recognized_data["text"]
            print(recognized_data)
    return recognized_data
def stt_ru_offline():
    model = Model("model")
    rec = KaldiRecognizer(model, 16000)
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=16000,
                    input=True,
                    frames_per_buffer=8000)
    stream.start_stream()
    while True:
        data = stream.read(4000, exception_on_overflow=False)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            x = json.loads(rec.Result())
            if len(x["text"]):
                return x["text"]
        else:
            # print(rec.PartialResult())
            pass
def recognize(model, wav_file_path):
    """Speech-to-text recognizer for Russian speech using Vosk models.

    The path to the Russian Vosk model should be configured in config.py.
    """
    with wave.open(wav_file_path, "rb") as wf:
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            raise TypeError("Audio file must be WAV format mono PCM.")
        rec = KaldiRecognizer(model, wf.getframerate())
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            rec.AcceptWaveform(data)
        json_ = json.loads(rec.FinalResult())
        return json_['text']
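# Hypothetical call of recognize() above; the model directory and the WAV
# filename are assumptions for illustration.
from vosk import Model

if __name__ == "__main__":
    model = Model("models/vosk-model-small-ru-0.4")  # assumed model path
    print(recognize(model, "recording.wav"))         # mono 16-bit PCM WAV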
def transcribe(audio_path, output_path, model_id="vosk-model-en-in-0.4"):
    """
    :param audio_path: path to the input audio file (any format ffmpeg can read)
    :param output_path: path of the markdown file to write the transcript to
    :param model_id: model directory name (get models from
        https://alphacephei.com/vosk/models and extract into the vosk_models folder)
    """
    model_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              "vosk_models", model_id)
    sample_rate = 16000
    model = Model(model_path)
    rec = KaldiRecognizer(model, sample_rate)
    # Decode to 16 kHz mono signed 16-bit PCM on stdout and stream it into Vosk.
    process = subprocess.Popen([
        'ffmpeg', '-loglevel', 'quiet', '-i', audio_path,
        '-ar', str(sample_rate), '-ac', '1', '-f', 's16le', '-'
    ], stdout=subprocess.PIPE)
    text = ""
    while True:
        data = process.stdout.read(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            result = json.loads(rec.Result())
            text = "%s %s" % (text, result["text"])
        else:
            # print(rec.PartialResult())
            pass
    print(rec.FinalResult())
    MdFile(file_path=output_path).dump_to_file(metadata={}, content=text, dry_run=False)
def driver():
    # Enqueues recognized utterances.
    if not os.path.exists("model"):
        print("Please download the model from https://alphacephei.com/vosk/models "
              "and unpack as 'model' in the current folder.")
        exit(1)
    model = Model("model")
    rec = KaldiRecognizer(model, 16000)
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=16000,
                    input=True,
                    frames_per_buffer=8000)
    stream.start_stream()
    while True:
        data = stream.read(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            inputQueue.put(rec.Result())
def transcribe_wav(filepath):
    wf = wave.open(filepath, 'rb')
    n_channels = wf.getnchannels()
    # If not mono, convert it to mono first.
    if n_channels != 1:
        mono_filename = f'{filepath}.monofile.wav'
        mono = wave.open(mono_filename, 'wb')
        mono.setparams(wf.getparams())
        mono.setnchannels(1)
        mono.writeframes(
            audioop.tomono(wf.readframes(wf.getnframes()), wf.getsampwidth(), 1, 1))
        mono.close()
        wf = wave.open(mono_filename, 'rb')
        os.remove(mono_filename)  # the open handle keeps the file readable on POSIX
    rec = KaldiRecognizer(model, wf.getframerate())
    transcription = ''
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            transcription += json.loads(rec.Result())['text'] + ' '
    transcription += json.loads(rec.FinalResult())['text']
    os.remove(filepath)
    return transcription
def run_asr(f):
    try:
        wf = f + str(uuid.uuid4()) + ".wav"
        if MAX_WAV_LEN:
            os.system(
                f"ffmpeg -hide_banner -loglevel panic -n -i {shlex.quote(f)} "
                f"-ss 0 -t {MAX_WAV_LEN} -ar {SAMPLE_RATE} -ac 1 {shlex.quote(wf)}"
            )
        else:
            os.system(
                f"ffmpeg -hide_banner -loglevel panic -n -i {shlex.quote(f)} "
                f"-ar {SAMPLE_RATE} -ac 1 {shlex.quote(wf)}"
            )
        o_wf = wave.open(wf, "rb")
        data = o_wf.readframes(o_wf.getnframes())
        o_wf.close()
        os.remove(wf)
        rec = KaldiRecognizer(model, SAMPLE_RATE)
        rec.AcceptWaveform(data)
        return json.loads(rec.FinalResult())
    except Exception as ex:
        return {"error": str(ex)}
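# Hypothetical setup for run_asr() above; the module-level names it relies on
# (model, SAMPLE_RATE, MAX_WAV_LEN) are given assumed values here.
import json
import os
import shlex
import uuid
import wave

from vosk import KaldiRecognizer, Model

SAMPLE_RATE = 16000
MAX_WAV_LEN = 60  # seconds; a falsy value transcribes the whole file
model = Model("models/vosk-model-small-en-us-0.15")  # assumed model path

if __name__ == "__main__":
    print(run_asr("input.mp3").get("text", ""))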
def transcribe():
    try:
        worker.log.info('[%s] New user entry on /transcribe'
                        % (strftime("%d/%b/%Y %H:%M:%S", gmtime())))
        is_metadata = False
        # Get the response content type.
        if request.headers.get('accept').lower() == 'application/json':
            is_metadata = True
        elif request.headers.get('accept').lower() == 'text/plain':
            is_metadata = False
        else:
            raise ValueError('Not accepted header')
        # Get the input file.
        if 'file' in request.files.keys():
            file = request.files['file']
            worker.getAudio(file)
            rec = KaldiRecognizer(model, spkModel, worker.rate, worker.ONLINE)
            rec.AcceptWaveform(worker.data)
            data_ = rec.FinalResult()
            confidence = rec.uttConfidence()
            if is_metadata:
                data_ = rec.GetMetadata()
            data = worker.get_response(data_, confidence, is_metadata)
            worker.clean()
        else:
            raise ValueError('No audio file was uploaded')
        return data, 200
    except ValueError as error:
        return str(error), 400
    except Exception as e:
        worker.log.error(e)
        return 'Server Error', 500
def myCommand():
    """Listens for commands (vosk was imported at the top of the file)."""
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=16000,
                    input=True,
                    frames_per_buffer=8000)
    stream.start_stream()
    model = Model("model-en")
    rec = KaldiRecognizer(model, 16000)
    while True:
        data = stream.read(2000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            # Parse the JSON result instead of splitting the raw string on "text".
            command = json.loads(rec.Result())["text"]
            stream.stop_stream()
            stream.close()
            p.terminate()
            return command
# END of the speech-to-text function that returns the variable: command
def transcribe(file_name):
    sound = AudioSegment.from_wav(file_name)
    sound = sound.set_channels(1)
    sound.export("generate/audio.wav", format="wav")
    SetLogLevel(-1)
    if not os.path.exists("model"):
        print("Please download the model from https://alphacephei.com/vosk/models "
              "and unpack as 'model' in the current folder.")
        exit(1)
    wf = wave.open('generate/audio.wav', "rb")
    if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
        print("Audio file must be WAV format mono PCM.")
        exit(1)
    model = Model("model")
    rec = KaldiRecognizer(model, wf.getframerate())
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            pass  # print(rec.Result())
        else:
            pass  # print(rec.PartialResult())
    r = json.loads(rec.FinalResult())
    return r['text']
def my_link():
    time.sleep(15)
    c = '0' + '.wav'
    counter = 0
    while c in os.listdir(audio_path):
        # for i in os.listdir(audio_path):
        #     if i.endswith('.wav'):
        #         sound = AudioSegment.from_wav('C:/Users/admin/Downloads/' + i)
        #         sound = sound.set_channels(1)        # make it mono
        #         sound = sound.set_frame_rate(44100)  # sample rate 44,100 Hz
        #         sound.export('C:/Users/admin/Downloads/' + i, format="wav")
        wf = wave.open(audio_path + '/' + c, 'rb')
        model = Model("vosk-model-small-en-in-0.4")
        rec = KaldiRecognizer(model, wf.getframerate())
        while True:
            data = wf.readframes(CHUNK)
            if len(data) == 0:
                break
            rec.AcceptWaveform(data)
        result = json.loads(rec.FinalResult())  # parse the JSON result string
        print(c)
        print(result["text"])
        l.append(result["text"])
        time.sleep(0.01)
        counter = counter + 1
        c = '0' + ' ' + '(' + str(counter) + ')' + '.wav'
    # for i in l:
    return render_template('index.html')
def _recognize_vosk(self):
    SetLogLevel(0)
    if not os.path.exists("vosk-model-small-es-0.3"):
        raise Exception("Please download the model from "
                        "https://alphacephei.com/vosk/models and unpack it as "
                        "'vosk-model-small-es-0.3' in the current folder.")
    text = []
    wf = wave.open(self.file_name, "rb")
    if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
        raise Exception("Audio file must be WAV format mono PCM.")
    model = Model("vosk-model-small-es-0.3")
    rec = KaldiRecognizer(model, wf.getframerate())
    rec.SetWords(True)
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        rec.AcceptWaveform(data)
    try:
        res = json.loads(rec.FinalResult())
        return res['text']
    except Exception as e:
        print(e)
        return ""
def upload_voicecar(request):
    if request.method == "POST":
        myFile = request.FILES.get("myfile", None)
        if not myFile:
            print("no files for upload!")
            return HttpResponse("no files for upload!")
        destination = open(os.path.join("media/voice", myFile.name), 'wb+')
        for chunk in myFile.chunks():
            destination.write(chunk)
        destination.close()
        rec = KaldiRecognizer(vosk_model, 16000)
        wf = wave.open(BASE_DIR + '/media/voice/voicecar.wav', "rb")
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                rec.Result()  # intermediate segment results are discarded
        data = json.loads(rec.FinalResult())
        voicetext = data['text']
        print(voicetext)
        # Command word lists from the config; each is a comma-separated list:
        # qian = forward, yizhiqian = keep going forward, hou = backward,
        # yizhihou = keep going backward, zuo = left, you = right, ting = stop.
        qian = dragon_cf['voicerec']['qian'].split(',')
        yizhiqian = dragon_cf['voicerec']['yizhiqian'].split(',')
        hou = dragon_cf['voicerec']['hou'].split(',')
        yizhihou = dragon_cf['voicerec']['yizhihou'].split(',')
        zuo = dragon_cf['voicerec']['zuo'].split(',')
        you = dragon_cf['voicerec']['you'].split(',')
        ting = dragon_cf['voicerec']['ting'].split(',')
        if voicetext in qian:
            print("qianqianqian")
            robot.forward(0.5)
            time.sleep(0.1)
            robot.stop()
        elif voicetext in yizhiqian:
            print("yizhiqian")
            robot.forward(0.5)
        elif voicetext in hou:
            print("hou")
            robot.backward(0.5)
            time.sleep(0.1)
            robot.stop()
        elif voicetext in yizhihou:
            print("yizhihou")
            robot.backward(0.5)
        elif voicetext in zuo:
            print("zuo")
            robot.left(0.5)
            time.sleep(0.1)
            robot.stop()
        elif voicetext in you:
            print("you")
            robot.right(0.5)
            time.sleep(0.1)
            robot.stop()
        elif voicetext in ting:
            print("tingtingting")
            robot.stop()
    return HttpResponse("upload over!")
class Decoder:
    def __init__(self, info):
        model = Model(os.getcwd() + "/modules/model")
        self.rec = KaldiRecognizer(model, 8000)
        self.ip, self.port = info["front"]

    def decode_file(self, aud_file):
        SetLogLevel(0)
        sentence = ""
        results = ""
        confidence = 0
        tot = 0
        wf = wave.open(aud_file, "rb")
        # Check certain file characteristics.
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print("Audio aud_file must be WAV format mono PCM.")
            exit(1)
        while True:  # loop for doing voice recognition
            data = wf.readframes(4000)
            if len(data) == 0:  # done reading the audio file
                break
            if self.rec.AcceptWaveform(data):  # finished a segment of the audio file
                items = self.rec.Result()
                results = json.loads(items)
                # On a false recognition, sometimes nothing is detected.
                if len(results.items()) > 1:
                    for i in results["result"]:
                        confidence += i["conf"]
                        tot += 1
                    sentence = sentence + " " + results["text"]
            else:
                print(self.rec.PartialResult())
        f_res = json.loads(self.rec.FinalResult())
        if len(f_res.items()) > 1:
            return f_res["text"]
        wf.close()
        if tot > 0 and confidence / tot > .8:  # check recognition confidence
            return sentence.lower().strip()
        elif tot > 0:
            print("confidence too low: " + str(confidence / tot))
        return ""

    def listen_stream(self):
        HOST = self.ip
        PORT = self.port
        CHUNK = 32768
        TIMEOUT = 10
        while True:
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                totData = 0
                connDied = False
                ret = self.try_connection(HOST, PORT, s, "send CNRDY")
                if ret == False:
                    s.close()
                    continue
                print("connected")
                s.sendall(b"CNRDY\0")  # sending connection ready
                data = b""
                s.settimeout(2)
                while b"YEETO" not in data:  # getting rid of bad data
                    try:
                        data = s.recv(CHUNK)
                        print("bad data : {}".format(len(data)))
                        if len(data) == 0:
                            print("conn died during handshake")
                            time.sleep(2)
                            connDied = True
                            break
                    except:
                        print("timed out from connection and didn't get YEETO")
                        connDied = True
                        break
                if connDied:
                    continue
                s.settimeout(None)
                s.sendall(b"FLUSH\0")  # let the front end know bad data has been flushed
                FTOT, FTEMP = self.init_temp_tot_wave()  # init FTOT and FTEMP files
                while True:
                    temp = self.open_temp_wave(FTEMP)  # get a temporary wave file
                    try:
                        data = s.recv(CHUNK)
                    except:
                        print("connection with {} {} died".format(HOST, PORT))
                        connDied = True
                        break
                    size = len(data)
                    totData += size
                    if data == None or size == 0:
                        # Check for when we receive packets of zero size.
                        print("connection from front-end closed")
                        print(f"FRONT CLOSE tot data received : {totData}")
                        break
                    print(f"got data: {len(data)}")
                    temp.writeframesraw(data)
                    temp.close()
                    self.combine_files([FTOT, FTEMP])  # combine wave file data
                    if self.detect_silence(FTOT):  # 2 seconds of silence detected
                        break
                if connDied:
                    break
                try:
                    s.close()
                    print(f"BACK CLOSE tot data received : {totData}")
                    if totData != 0:  # only report good data if we got any
                        self.send_gdata()
                        break
                except BrokenPipeError:
                    print(f"connection died with {HOST} port {PORT}")
        results = self.decode_file(FTOT)  # get results from the file
        print("FINAL RESULT from stream: " + results)
        return results

    def clear_socket(self):
        # Prototype for clearing socket data.
        HOST = self.ip
        PORT = self.port
        TIMEOUT = 10  # 10 second timeout
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            self.try_connection(HOST, PORT, sock, "CLEAR SOCKET")
            sock.settimeout(TIMEOUT)
            size = 1
            while size > 0:
                size = len(sock.recv(1024))  # receive data and throw it away
            sock.close()

    def send_cnerr(self):
        HOST = self.ip
        PORT = self.port
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            print("sending connection error")
            self.try_connection(HOST, PORT, sock, "SEND CNERR")
            sock.sendall(b"CNERR\0")
            sock.close()

    def send_gdata(self):
        HOST = self.ip
        PORT = self.port
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            print("sending good data")
            self.try_connection(HOST, PORT, sock, "SEND GDATA")
            sock.sendall(b"GDATA\0")
            sock.close()

    def init_temp_tot_wave(self):
        FTOT = "./temp/recv.wav"
        FTEMP = "./temp/temp.wav"
        tot = wave.open(FTOT, 'wb')
        tot.setnchannels(1)  # mono
        tot.setsampwidth(2)
        tot.setframerate(8000)
        tot.close()
        temp = wave.open(FTEMP, 'wb')
        temp.setnchannels(1)  # mono
        temp.setsampwidth(2)
        temp.setframerate(8000)
        temp.close()
        return FTOT, FTEMP

    def open_temp_wave(self, FTEMP):
        temp = wave.open(FTEMP, 'wb')
        temp.setnchannels(1)  # mono
        temp.setsampwidth(2)
        temp.setframerate(8000)
        return temp

    def try_connection(self, HOST, PORT, s, funcName):
        print("trying to connect " + HOST + " " + str(PORT))
        print(f"{funcName} connecting to front-end")
        time.sleep(2)
        s.settimeout(5)
        try:
            s.connect((HOST, PORT))
            s.settimeout(None)
            return True
        except ConnectionRefusedError:
            print("connection to {} on port {} refused.".format(HOST, PORT))
            print("will try again in 5 seconds\n")
            time.sleep(5)
            return False
        except OSError:
            print("couldn't find {} on port {}".format(HOST, PORT))
            print("will try again in 5 seconds")
            time.sleep(5)
            return False
        except TimeoutError:
            print("connection timed out for {} port {}".format(HOST, PORT))
            print("will try again in 5 seconds\n")
            time.sleep(5)
            return False

    def send_mstop(self):
        HOST = self.ip
        PORT = self.port
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            print("sending MSTOP")
            while True:
                try:
                    sock.connect((HOST, PORT))
                    break
                except ConnectionRefusedError:
                    print("connection to {} on port {} refused.".format(HOST, PORT))
                    print("will try again in 5 seconds\n")
                    time.sleep(5)
                except OSError:
                    print("couldn't find {} on port {}".format(HOST, PORT))
                    print("will try again in 5 seconds")
                    time.sleep(5)
            sock.sendall(b"MSTOP\0")
            sock.close()

    def combine_files(self, files):
        data = []
        for infile in files:
            w = wave.open(infile, "rb")
            data.append([w.readframes(w.getnframes())])
            w.close()
        output = wave.open(files[0], "wb")
        output.setnchannels(1)  # mono
        output.setsampwidth(2)
        output.setframerate(8000)
        output.writeframes(data[0][0])
        output.writeframes(data[1][0])
        output.close()

    def detect_silence(self, fileName):
        myaudio = AudioSegment.from_wav(fileName)
        dBFS = myaudio.dBFS
        print(dBFS)
        pieces = silence.detect_silence(myaudio, 1000, dBFS - 0)
        pieces = [((start / 1000), (stop / 1000))
                  for start, stop in pieces]  # convert to seconds
        for i in pieces:
            if i[1] - i[0] > 3:
                print("big silence: " + str(i[0]) + " " + str(i[1]))
                return True
        return False
def detectKeywords(libpath):
    audio_stream = AudiostreamSource()
    extractor = FeatureExtractor(libpath)
    detector = AudioRecognition(libpath)
    framerate = 16000
    model = Model("model")
    # Define a custom dictionary (restricted grammar) for the recognizer.
    rec = KaldiRecognizer(
        model, framerate,
        '["oh one two three four five six seven eight nine zero", "[unk]"]')
    extractor_gain = 1.0
    # Add one or more keyword models.
    keywordIdAlexa = detector.addModel(
        '../../models/Hotword/alexa_v3.0.35.premium', 0.85)
    bufsize = detector.getInputDataSize()
    print("Audio Recognition Version: " + detector.getVersionString())
    command_started = False
    audio_stream.start()
    try:
        while True:
            # Wakeword loop
            if not command_started:
                frame = audio_stream.read(bufsize * 2, bufsize * 2)
                if not frame:
                    time.sleep(0.01)
                    continue
                features = extractor.signalToMel(frame, extractor_gain)
                prediction = detector.runDetection(features)
                if prediction != 0:
                    now = datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")
                    if prediction == keywordIdAlexa:
                        print("Alexa detected:" + now)
                        os.system(play_command + " ../resources/ding.wav")
                        command_started = True
            # Vosk loop
            else:
                frame = audio_stream.read(4000, 4000)
                if not frame:
                    time.sleep(0.01)
                    continue
                if rec.AcceptWaveform(bytes(frame)):
                    print(rec.Result())
                    command_started = False
        print(rec.FinalResult())
    except KeyboardInterrupt:
        print("Terminating")
        audio_stream.stop()
        sys.exit(0)
    ofp.write(json.dumps(transcripts, indent=4))

with open(outCTMFile, 'w') as ofp:
    for transcript in transcripts:
        # print('\t%s (%s-%s-%s)\n' % (transcript['transcription'], sessionId,
        #       transcript['utterance_start'], transcript['utterance_duration']))
        for token in transcript["tokens"]:
            ofp.write("%s \t 1 \t %.2f \t %.2f \t %s\n" %
                      (sessionId, token["start"], token["duration"], token["baseform"]))
print(' ')

# Get the list of JSON dictionaries.
results = []
if useSegmentsInVosk:
    for segment in tqdm(segments):
        if len(segment.bytes) == 0:
            continue
        if rec.AcceptWaveform(segment.bytes):
            part_result = json.loads(rec.Result())
            results.append(part_result)
    part_result = json.loads(rec.FinalResult())
    if part_result:
        results.append(part_result)
else:
    # Recognize speech using the Vosk model in streaming mode.
    wf = wave.open(audioFile, "rb")
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            part_result = json.loads(rec.Result())
            results.append(part_result)
class AnkiSpeechToCommand():
    """Manages speech-to-text for Anki-related commands."""

    def __init__(self, command_config="commands.json", alert_sound_enabled=True):
        """Constructor for AnkiSpeechToCommand.

        Initialises the vosk speech-to-text module, the AnkiConnect API handler
        object, and derives word commands from a JSON file.

        Args:
            command_config (str, optional): Filename for the JSON command file.
                Defaults to "commands.json".
            alert_sound_enabled (bool, optional): Controls the confirmation sound
                for attach, pause, and unpause commands. Defaults to True.

        Raises:
            json.decoder.JSONDecodeError: Handles decode errors from the JSON
                command file, such as malformed syntax.
            AnkiVoiceError: Handles anki-voice errors, in particular missing
                command definitions.
        """
        # Verify that the speech-to-text engine (vosk) model exists.
        if not Path(Path(__file__).resolve().parent, "Model").is_dir():
            print("Please download the model from "
                  "https://github.com/alphacep/vosk-api/blob/master/doc/models.md "
                  "and unpack as 'model' (directory) in the current folder.")
            sys.exit(1)
        # Configure the speech-to-text engine.
        SetLogLevel(-10)
        self._model = Model("model")
        self._recogniser = KaldiRecognizer(self._model, 16000)
        self._stream = pyaudio.PyAudio().open(format=pyaudio.paInt16,
                                              channels=1,
                                              rate=16000,
                                              input=True,
                                              frames_per_buffer=2048)
        self._stream.start_stream()
        # Create the AnkiConnect API handler object.
        self._anki_action = AnkiActionHandler(
            alert_sound_enabled=alert_sound_enabled)
        # Behaviour configuration
        self._speech_to_text_paused = False
        self._alert_sound_enabled = alert_sound_enabled
        # Parse the command JSON configuration.
        self.command_config_load(command_config)
        # tba
        self.engine = pyttsx3.init()

    def command_config_load(self, command_config):
        try:
            with open(command_config) as command_config_raw:
                command_config_json = json.load(command_config_raw)
            for command in ["attach", "show", "again", "difficult", "good",
                            "easy", "pause", "unpause", "close", "quit"]:
                if command not in command_config_json:
                    raise Exception(
                        f"Malformed commands in {command_config}. "
                        f"Missing the command (key): {command}")
                # e.g. self._attach_commands = ["attach"] + its related words.
                setattr(self, f"_{command}_commands",
                        [command] + command_config_json[command]["related_words"])
        except json.decoder.JSONDecodeError as ex:
            logging.error(
                f"A JSON decoder error occurred when attempting to obtain Anki command words: {ex}")
            sys.exit(1)
        except AnkiVoiceError as ex:
            logging.error(f"An anki-voice error occurred: {ex}")
            sys.exit(1)
        except Exception as ex:
            logging.error(
                f"An unknown exception occurred when attempting to obtain Anki command words: {ex}")
            sys.exit(1)

    def run(self):
        """Starts a thread to handle speech-to-text module functionality."""
        self._command_detection = threading.Thread(
            target=self._cyclic_word_detection)
        self._command_detection.start()

    def pause(self):
        """Pauses speech-to-text monitoring (except for 'unpause' commands)."""
        self._speech_to_text_paused = True
        print("Executed: pause")
        if self._alert_sound_enabled:
            audio_feedback_queue.put_nowait("Success: Paused.")

    def unpause(self):
        """Unpauses speech-to-text monitoring (permitting any commands)."""
        self._speech_to_text_paused = False
        print("Executed: unpause")
        if self._alert_sound_enabled:
            audio_feedback_queue.put_nowait("Success: Unpaused.")

    def quit(self):
        """Triggers exit of anki-voice."""
        print("Executed: quit")
        sys.exit(0)

    def _cyclic_word_detection(self):
        """Loops through audio input and identifies speech to text for possible commands."""
        while True:
            data = self._stream.read(2048, exception_on_overflow=False)
            if len(data) == 0:
                break
            if self._recogniser.AcceptWaveform(data):
                res = json.loads(self._recogniser.Result())
                # Identify sentence blocks.
                if "text" in res:
                    detected_words = res["text"].lower()
                    if detected_words != "":
                        self._action_command(detected_words)

    def _action_command(self, detected_words):
        """Analyses speech-to-text strings for anki-voice commands.

        Args:
            detected_words (str): The words identified through speech-to-text
                analysis.
        """
        # If paused, only proceed when the command is to unpause.
        if self._speech_to_text_paused:
            if detected_words not in self._unpause_commands:
                return
        # Process commands.
        print("Detected:", detected_words)
        if detected_words in self._attach_commands:
            self._anki_action.get_current_card_information(
                called_through_attach_command=True)
        elif detected_words in self._show_commands:
            self._anki_action.show()
        elif detected_words in self._again_commands:
            self._anki_action.again()
        elif detected_words in self._difficult_commands:
            self._anki_action.difficult()
        elif detected_words in self._good_commands:
            self._anki_action.good()
        elif detected_words in self._easy_commands:
            self._anki_action.easy()
        elif detected_words in self._pause_commands:
            self.pause()
        elif detected_words in self._unpause_commands:
            self.unpause()
        elif detected_words in self._close_commands:
            self._anki_action.close()
        elif detected_words in self._quit_commands:
            self.quit()

    def __del__(self):
        """Destructor for AnkiSpeechToCommand.

        Stops the pyaudio stream used by the vosk speech-to-text module.

        Raises:
            AttributeError: Handles the situation where "Model" folder validation
                fails in the constructor. Not required to be logged.
        """
        try:
            self._stream.stop_stream()
        except AttributeError:
            pass
        except Exception as ex:
            logging.error(
                f"An unknown exception occurred when attempting to stop the pyaudio stream for the vosk module: {ex}")
def main():
    configuration = Configuration("config/config.yaml")
    if not os.path.exists("model/" + configuration.config_list["language"]):
        print("Please download the model from "
              "https://github.com/alphacep/vosk-api/blob/master/doc/models.md "
              "and unpack as 'model' in the current folder.")
        exit(1)
    configuration.generate_nlu_file()
    # Hotword
    hotword = Hotword(configuration.config_list["hotword"])
    # Text to speech
    tts = Tts()
    tts.setVoice(configuration.config_list["voice_id"])
    # PyAudio
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=16000,
                    input=True,
                    frames_per_buffer=8000)
    stream.start_stream()
    # Vosk
    model = Model("model/" + configuration.config_list["language"])
    rec = KaldiRecognizer(model, 16000)
    # Snips NLU
    nlu = Nlu("nlu/" + configuration.config_list["language"] + "/dataset.json")
    # Load plugins
    plugin_directories = [os.path.normpath('plugins')]
    plugins_list = PluginList(plugin_directories)
    plugins_list.find_plugins()
    while True:
        data = stream.read(8000, exception_on_overflow=False)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            rec_result = json.loads(rec.Result())
            if rec_result["text"].count(hotword.getWord()) > 0:
                tts.speak(configuration.config_list["sentence_welcome"])
                hotword.setState(True)
            if hotword.getState() == True:
                if rec_result["text"] != "":
                    parsing = nlu.parse(rec_result["text"])
                    if parsing["intent"]["probability"] >= configuration.config_list["min_probability"]:
                        for plugin in plugins_list._plugins:
                            plugin_object = plugins_list._plugins[plugin].plugin_class
                            if plugin_object.has_intent(parsing["intent"]["intentName"]) == True:
                                response = plugin_object.get_response(
                                    parsing["intent"]["intentName"], parsing["slots"])
                                tts.speak(response)
                                hotword.setState(False)
                    elif parsing["intent"]["intentName"] == None:
                        hotword.setState(True)
                    else:
                        # "I'm not sure I understood, can you repeat?"
                        tts.speak("je ne suis pas sur d'avoir compris, peux-tu répéter?")
class VoskInput(AudioInput):
    """
    Input from Vosk using the given language model.
    """

    def __init__(self,
                 notifier,
                 rate=16000,
                 wav_dir=None,
                 model=os.path.join(_MODEL_DIR, 'model')):
        """
        @see AudioInput.__init__()

        :type  rate: int
        :param rate: The override for the rate, if not the model's one.
        :type  wav_dir: str
        :param wav_dir: Where to save the wave files, if anywhere.
        :type  model: str
        :param model: The path to the Vosk model file.
        """
        # Load in and configure the model.
        if not os.path.exists(model):
            raise IOError("Not found: %s" % (model,))
        LOG.info("Loading model from %s, this could take a while", model)
        SetLogLevel(1 if LOG.getLogger().getEffectiveLevel() >= 20 else 2)
        self._model = Model(model)
        self._recognizer = KaldiRecognizer(self._model, rate)
        LOG.info("Model loaded")

        # We can now init the superclass.
        super(VoskInput, self).__init__(notifier,
                                        format=pyaudio.paInt16,
                                        channels=1,
                                        rate=rate,
                                        wav_dir=wav_dir)

        # Where we put the results
        self._results = []

    def _feed_raw(self, data):
        """
        @see AudioInput._feed_raw()
        """
        # Attempt to decode it.
        if self._recognizer.AcceptWaveform(data):
            self._add_result(self._recognizer.Result())

    def _decode(self):
        """
        @see AudioInput._decode()
        """
        # Collect anything remaining.
        self._add_result(self._recognizer.FinalResult())

        # Ensure it's clear for next time.
        self._recognizer.Reset()

        # Tokenize
        tokens = []
        LOG.debug("Decoding: %s" % self._results)
        for result in self._results:
            word = result.get('word', '').strip()
            conf = result.get('conf', 0.0)
            if word and conf:
                tokens.append(Token(word, conf, True))

        # Done
        self._results = []

        # And give them all back.
        LOG.debug("Got: %s" % ' '.join(str(i) for i in tokens))
        return tokens

    def _add_result(self, json_result):
        """
        Add in any result we have from the given JSON string.
        """
        result = json.loads(json_result)
        LOG.debug("Got %s" % json_result)

        # See what we got, if anything.
        if 'result' in result:
            # A full result, which is the best.
            self._results.extend(result['result'])
        elif 'text' in result:
            # A decoded text string.
            for word in result['text'].split():
                if word:
                    self._results.append({'word': word, 'conf': 1.0})
def transcribe_to_sql(self, duration, side, original_file_name, rec_date,
                      src, dst, linkedid):
    trans_start = time.time()
    if self.source_id == self.sources['master']:
        original_file_name = linkedid + ('-in.wav' if side == 0 else '-out.wav')
    transcribation_date = datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
    print('transcribing', self.temp_file_path + self.temp_file_name)
    # Read the file.
    wf = wave.open(self.temp_file_path + self.temp_file_name, "rb")
    # Load the model.
    model = Model(self.model_path)
    rec = KaldiRecognizer(model, wf.getframerate())
    # Recognize phrase by phrase.
    phrases_count = 0
    confidences = []
    while True:
        conf_score = []
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            accept = json.loads(rec.Result())
            if accept['text'] != '':
                accept_start = str(accept['result'][0]['start'])
                accept_end = accept['result'][-1:][0]['end']
                accept_text = str(accept['text'])
                for result_rec in accept['result']:
                    conf_score.append(float(result_rec['conf']))
                conf_mid = str(sum(conf_score) / len(conf_score))
                confidences.append(sum(conf_score) / len(conf_score))
                self.save_result(duration, accept_text, accept_start,
                                 accept_end, side, transcribation_date,
                                 conf_mid, original_file_name, rec_date,
                                 src, dst, linkedid)
                phrases_count += 1
    if len(confidences):
        self.confidence_of_file = sum(confidences) / len(confidences)
    else:
        self.confidence_of_file = 0
    trans_end = time.time()
    self.perf_log(2, trans_start, trans_end, duration, linkedid)
    if phrases_count == 0:
        self.save_result(duration, '', '0', '0', side, transcribation_date,
                         0, original_file_name, rec_date, src, dst, linkedid)
def trigger_microphone(n_clicks):
    if n_clicks == 0:
        return ''
    print('trigger microphone %d' % n_clicks)
    import termux
    termux.Microphone.stop()
    pwd = os.environ['PWD']
    aac_file = "%s/microphone.aac" % pwd
    wave_file = "%s/microphone.wave" % pwd
    if os.path.exists(aac_file):
        os.remove(aac_file)
    termux.Microphone.record(aac_file, encoder='aac', limit=5, count=2)
    import time
    time.sleep(6)
    os.system('faad -o %s %s' % (wave_file, aac_file))
    if False:
        # Alternative path using CMU Sphinx, kept disabled.
        import speech_recognition as sr
        r = sr.Recognizer()
        with sr.WavFile(wave_file) as source:
            audio = r.record(source)
        text = r.recognize_sphinx(audio)
    else:
        from vosk import Model, KaldiRecognizer, SetLogLevel
        import wave
        import numpy as np
        model_name = 'vosk-model-small-en-us-0.15'
        if not os.path.exists(model_name):
            os.system('wget http://alphacephei.com/vosk/models/%s.zip' % model_name)
            os.system('unzip %s.zip' % model_name)
        wf = wave.open(wave_file, "rb")
        model = Model(model_name)
        rec = KaldiRecognizer(model, wf.getframerate())
        nch = wf.getnchannels()
        depth = wf.getsampwidth()
        typ = {1: np.uint8, 2: np.uint16, 4: np.uint32}.get(depth)
        sdata = wf.readframes(64000)
        data = np.frombuffer(sdata, dtype=typ)
        ch_data = data[0::nch]  # keep only the first channel
        sdata = ch_data.tobytes()
        if True:
            outwav = wave.open('good.wave', 'w')
            outwav.setparams(wf.getparams())
            outwav.setnchannels(1)
            outwav.writeframes(ch_data.tobytes())
            outwav.close()
        if rec.AcceptWaveform(sdata):
            result = json.loads(rec.Result())
            text = result['text']
        else:
            result = json.loads(rec.PartialResult())
            text = result['partial']
        result = json.loads(rec.FinalResult())
        text += result['text']
    print('finish microphone')
    print('text:%s' % text)
    return text
def gen_subparts(input_file, model_dir, verbose=False, partlen=4, progress=False):
    SetLogLevel(0 if verbose else -1)
    model = Model(model_dir)
    rec = KaldiRecognizer(model, 16000)
    process = subprocess.Popen([
        'ffmpeg', '-loglevel', 'quiet', '-i', input_file,
        '-ar', str(16000), '-ac', '1', '-f', 's16le', '-'
    ], stdout=subprocess.PIPE)
    r = subprocess.run(
        "ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1"
        .split() + [input_file],
        stdout=subprocess.PIPE)
    duration = float(r.stdout.decode('utf-8').strip())
    if progress:
        pbar = tqdm(total=duration, unit="s")
    prev_end = 0
    while True:
        data = process.stdout.read(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            r = json.loads(rec.Result())
            if 'result' in r:
                resultpart = []  # TODO: carry this across AcceptWaveform calls
                for result in r['result']:
                    # Flush the accumulated part once it spans at least partlen seconds.
                    if len(resultpart) > 0 and float(result['end']) - float(
                            resultpart[0]['start']) >= partlen:
                        yield SubPart(start=resultpart[0]['start'],
                                      end=float(resultpart[-1]['end']),
                                      text=" ".join(r['word'] for r in resultpart))
                        prev_end = float(resultpart[-1]['end'])
                        resultpart = []
                    if float(result['end'] - result['start']) >= partlen:
                        yield SubPart(start=float(result['start']),
                                      end=float(result['end']),
                                      text=result['word'])
                        prev_end = float(result['end'])
                        resultpart = []
                    else:
                        resultpart.append(result)
                    if progress:
                        pbar.update(float(result['end'] - pbar.n))
                if len(resultpart) > 0:
                    yield SubPart(start=float(resultpart[0]['start']),
                                  end=float(resultpart[-1]['end']),
                                  text=" ".join(r['word'] for r in resultpart))
                    prev_end = float(resultpart[-1]['end'])
                    resultpart = []
        else:
            pass  # print(rec.PartialResult())
    if progress:
        pbar.close()
    r = json.loads(rec.PartialResult())
    text = r['partial']
    yield SubPart(start=prev_end, end=duration, text=text)
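# Hypothetical driver for gen_subparts() above; the media file and model
# directory are assumptions. SubPart is assumed to expose start, end, and
# text attributes, as the generator's yields imply.
if __name__ == "__main__":
    for part in gen_subparts("lecture.mp4", "vosk-model-small-en-us-0.15",
                             partlen=4, progress=True):
        print("%8.2f -> %8.2f  %s" % (part.start, part.end, part.text))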
class SpeechDetector:
    @classmethod
    def hotword_list(cl):
        return util.hotword_list_snowboy()

    @classmethod
    def hotword_model_list(cl):
        return util.hotword_model_list_snowboy()

    def __init__(self,
                 hotword_model=[util.resource(f'snowboy/hotword_models/{a}')
                                for a in ['阿Q.pmdl']],
                 sensitivity=.5,
                 lang='zh',
                 audio_gain=1,
                 silence_timeout=2,
                 recognition_timeout=10):
        if not isinstance(hotword_model, list):
            hotword_model = [hotword_model]
        if isinstance(sensitivity, list):
            assert len(hotword_model) == len(sensitivity), \
                'Number of hotwords does not match number of sensitivities'
        else:
            sensitivity = [sensitivity] * len(hotword_model)
        self._detect = snowboydetect.SnowboyDetect(
            resource_filename=util.resource('snowboy/common.res').encode(),
            model_str=",".join(hotword_model).encode())
        self._detect.SetAudioGain(audio_gain)
        self._detect.ApplyFrontend(False)
        self._detect.SetSensitivity(','.join([str(s) for s in sensitivity]).encode())
        assert lang.lower() in ['en', 'zh', 'cn'], 'Only English and Chinese are supported'
        self._rec = KaldiRecognizer(
            Model(util.resource('sphinx/vosk-model-en-us-daanzu-20200328-lgraph')
                  if lang == 'en' else
                  util.resource('sphinx/vosk-model-cn-0.1')),
            self._detect.SampleRate())
        self._hotwords = [w.split('/')[-1].split('.')[0] for w in hotword_model]
        self._recognition_timeout = int(recognition_timeout / self.required_buffer_size)
        self._silence_timeout = int(silence_timeout / self.required_buffer_size)

    @property
    def required_samplerate(self):
        return 16000

    @property
    def required_bit_depth(self):
        return 16

    @property
    def required_channels(self):
        return 1

    @property
    def required_buffer_size(self):
        # 0.1 second of audio
        return self.required_samplerate * self.required_bit_depth // 8 * self.required_channels // 10

    def stop(self):
        self._stop = True

    def detect(self, stream, *, hotword_callback=None, speech_callback=None):
        self._stop = False
        recognizing = False
        recognition_count = silence_count = 0
        for data in stream:
            if self._stop:
                return
            status = self._detect.RunDetection(data)
            if status == -1:
                logger.warning("Error initializing streams or reading audio data")
            if recognizing:
                if self._rec.AcceptWaveform(data):
                    speech_callback(json.loads(self._rec.Result())['text'].replace(' ', ''))
                    recognizing = False
                else:
                    recognition_count += 1
                    if status == -2:  # silence detected
                        silence_count += 1
                    else:
                        silence_count = 0
                    if recognition_count >= self._recognition_timeout or \
                            silence_count >= self._silence_timeout:
                        speech_callback(json.loads(self._rec.FinalResult())['text'].replace(' ', ''))
                        recognizing = False
            elif status > 0:
                hotword_callback and hotword_callback(self._hotwords[status])
                if speech_callback:
                    recognition_count = silence_count = 0
                    recognizing = True

    def detect_once(self, stream, *, hotword_callback=None, speech_callback=None):
        self._stop = False
        recognizing = False
        recognition_count = silence_count = 0
        for data in stream:
            if self._stop:
                return
            status = self._detect.RunDetection(data)
            if status == -1:
                logger.warning("Error initializing streams or reading audio data")
            if recognizing:
                if self._rec.AcceptWaveform(data):
                    speech_callback(json.loads(self._rec.Result())['text'])
                    return
                else:
                    recognition_count += 1
                    if status == -2:  # silence detected
                        silence_count += 1
                    else:
                        silence_count = 0
                    if recognition_count >= self._recognition_timeout or \
                            silence_count >= self._silence_timeout:
                        speech_callback(json.loads(self._rec.FinalResult())['text'])
                        return
            elif status > 0:
                hotword_callback and hotword_callback(self._hotwords[status])
                if speech_callback:
                    recognizing = True
                else:
                    return
class Tester:
    def __init__(self,
                 filepath: Optional[str],
                 model_path: str,
                 sample_rate: int,
                 use_gpu: bool = False):
        if use_gpu:
            # GPU part; only works when vosk-api is built with GPU support.
            from vosk import GpuInit, GpuInstantiate
            GpuInit()
            GpuInstantiate()
        self.sample_rate = sample_rate
        self.model = Model(model_path)
        self.rec = KaldiRecognizer(self.model, sample_rate)
        self.filepath = filepath

    def _read(self, out):
        while True:
            data = out.read(8000)
            if len(data) == 0:
                break
            if self.rec.AcceptWaveform(data):
                print(self.rec.Result())
            else:
                print(self.rec.PartialResult())
        print(self.rec.FinalResult())

    def _test_microphone(self):
        stream = PyAudio().open(format=paInt16,
                                channels=1,
                                rate=self.sample_rate,
                                input=True,
                                frames_per_buffer=8000)
        stream.start_stream()
        self._read(stream)

    def _test_file(self, filepath):
        process = subprocess.Popen(
            ['ffmpeg', '-loglevel', 'quiet', '-i', filepath,
             '-ar', str(self.sample_rate), '-ac', '1', '-f', 's16le', '-'],
            stdout=subprocess.PIPE)
        self._read(process.stdout)

    def test(self):
        if self.filepath is None:
            self._test_microphone()
        else:
            self._test_file(self.filepath)
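# Hypothetical usage of the Tester class above; the model path and audio file
# are assumptions, and ffmpeg must be on PATH for the file branch.
if __name__ == "__main__":
    tester = Tester(filepath="sample.mp3",
                    model_path="models/vosk-model-small-en-us-0.15",
                    sample_rate=16000)
    tester.test()  # pass filepath=None to test live microphone input instead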
class VoskInput(BaseInput):
    """
    Uses the `vosk` package to do speech recognition.
    """

    def __init__(self):
        super(VoskInput, self).__init__()
        self.current_utterance = ""
        self.realtime = True  # indicates that audio can be streamed in
        model_name = crystal.core.get_config('vosk_model') or 'vosk-model-small-en-us-0.3'
        log.info(f"Using vosk model: {model_name}")
        self.model = Model(f"models/{model_name}")
        self.rec = None
        self.__final_result = None

    def process_audio(self, raw_audio: bytes, sample_rate: int, sample_width: int):
        if not self.rec:
            self.rec = KaldiRecognizer(self.model, sample_rate)
        full = self.rec.AcceptWaveform(raw_audio)
        if full:
            result = self.rec.Result()
        else:
            result = self.rec.PartialResult()
        log.debug(result)
        result = json.loads(result)
        if "result" in result:
            self.__final_result = result
        if "text" in result:
            text = result["text"]
        elif "partial" in result:
            text = result["partial"]
        if text:
            self.current_utterance = text
        return self.current_utterance

    def get_full_result(self):
        if self.__final_result:
            # The stored final result is already a parsed dict.
            result = self.__final_result
        else:
            # FinalResult() returns a JSON string, so parse it here.
            result = json.loads(self.rec.FinalResult())
        log.debug(result)
        self.rec = None
        self.current_utterance = ""
        self.__final_result = None
        full_text = result["text"]
        # HACK: auto-correct text to match domain vocabulary. Sorry.
        full_text = full_text.replace("palace music", "pause music")
        full_text = full_text.replace("applause music", "pause music")
        if any(x in full_text for x in ["turn on", "turn off", "turned on", "turned off"]):
            full_text = full_text.replace("the land", "the lamp").replace(
                "the lamb", "the lamp")
            if full_text.endswith("the lam"):
                full_text = full_text.replace("the lam", "the lamp")
        if any(x in full_text for x in ["timer", "alarm"]):
            full_text = full_text.replace("crystal said", "crystal set")
            if full_text.endswith("to pm"):
                full_text = full_text.replace("to pm", "2 pm")
            elif full_text.endswith(" a m"):
                full_text = full_text.replace(" a m", " am")
        if full_text.startswith("christo"):
            full_text = full_text.replace("christo", "crystal")
        elif full_text.startswith("crews to"):
            full_text = full_text.replace("crews to", "crystal")
        elif full_text.startswith("christian"):
            full_text = full_text.replace("christian", "crystal")
        return full_text
class SpeechRecognizer:
    """Speech recognizer, a thin wrapper around |CMUSphinx vosk|.

    .. |CMUSphinx vosk| raw:: html

        <a href='https://github.com/alphacep/vosk-api' target='blank'>CMUSphinx vosk</a>

    :param lang: language, currently Chinese `'zh'` or English `'en'`; defaults to Chinese
    :type lang: str, optional
    """

    def __init__(self, lang='zh'):
        lang = lang.lower()
        self._lang = lang
        assert lang in ['en', 'zh', 'cn'], 'Only English and Chinese are supported'
        self._rec = KaldiRecognizer(
            Model(util.resource('sphinx/vosk-model-en-us-daanzu-20200328-lgraph')
                  if lang == 'en' else
                  util.resource('sphinx/vosk-model-cn-0.1')),
            16000)
        self._detect = snowboydetect.SnowboyDetect(
            resource_filename=util.resource('snowboy/common.res').encode(),
            model_str=util.resource('snowboy/hotword_models/阿Q.pmdl').encode())
        self._detect.SetAudioGain(2)
        self._detect.ApplyFrontend(False)
        self._detect.SetSensitivity('0.5'.encode())

    def recognize(self, stream, timeout=10, silence_timeout=2):
        """Start recognizing.

        :param stream: audio data stream
        :param timeout: the longest recognition time in seconds; defaults to
            `10`, and `None` means no timeout
        :type timeout: float, optional
        :param silence_timeout: silence timeout in seconds; if nothing is said
            for this long, the utterance is considered finished. Defaults to
            `2`, and `None` means no silence timeout
        :type silence_timeout: float, optional
        :return: the recognized phrase or sentence
        :rtype: str
        """
        self._cancel = False
        recognition_count = silence_count = 0.0
        for data in stream:
            if self._cancel:
                raise Exception('Speech recognition cancelled by another thread')
            if self._rec.AcceptWaveform(data):
                text = self._rec.Result()
                break
            ln = len(data) / 32000  # 1 second = 16000 (samplerate) * 2 bytes per sample
            recognition_count += ln
            if timeout and recognition_count > timeout:
                text = self._rec.FinalResult()
                break
            if self._detect.RunDetection(data) == -2:  # silence
                silence_count += ln
                if silence_timeout and silence_count > silence_timeout:
                    text = self._rec.FinalResult()
                    break
        text = json.loads(text)['text']
        if not self._lang == 'en':
            text = text.replace(' ', '')
        return text

    def cancel(self):
        """Stop recognizing."""
        self._cancel = True
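# Hypothetical usage of SpeechRecognizer above. The class expects an iterable
# of raw 16 kHz, 16-bit mono PCM chunks; this pyaudio-based generator is an
# assumption about how such a stream might be produced.
import pyaudio

def microphone_stream(frames_per_chunk=1600):  # 0.1 s at 16 kHz
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000,
                    input=True, frames_per_buffer=frames_per_chunk)
    while True:
        yield stream.read(frames_per_chunk, exception_on_overflow=False)

if __name__ == "__main__":
    recognizer = SpeechRecognizer(lang='en')
    print(recognizer.recognize(microphone_stream(), timeout=10, silence_timeout=2))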
def my_link():
    print("entered into function for processing")
    time.sleep(15)
    c = '0' + '.wav'
    counter = 0
    conn = connect()
    model = Model("vosk-model-small-en-in-0.4")
    pth = os.listdir(audio_path)
    # print(pth)
    print("entering into while loop")
    while c in pth:
        # sound = AudioSegment.from_wav('C:/Users/admin/Downloads/' + i)
        # sound = sound.set_channels(1)        # make it mono
        # sound = sound.set_frame_rate(44100)  # sample rate 44,100 Hz
        # sound.export('C:/Users/admin/Downloads/' + i, format="wav")
        wf = wave.open(audio_path + '/' + c, 'rb')
        rec = KaldiRecognizer(model, wf.getframerate())
        while True:
            data = wf.readframes(CHUNK)
            if len(data) == 0:
                break
            rec.AcceptWaveform(data)
        result = json.loads(rec.FinalResult())  # parse the JSON result string
        print(c)
        s = result["text"]
        print(s)
        # Run the text-classification pipeline.
        temp = remove_punct(s)
        temp = tknz_text(temp)
        temp = remove_stopwords(temp)
        temp = stmng(temp)
        # Remove punctuation tokens.
        puncs = set(['"', '(', ')', '.', ',', '-', '<', '>', '/', '\',%',
                     '\\x', '!', '?', "'", 's'])
        temp2 = []
        for i in temp:
            if i[0].isalpha() == True:
                temp2.append(i)
        # Remove empty/whitespace tokens.
        temp1 = []
        for i in temp2:
            if i not in ("", '', " ", ' '):
                temp1.append(i)
        fg, word = check_list(conn, temp1)
        if fg == 1:
            # flash("Abusive Detected")
            print("Abusive Detected")
        else:
            print("Normal Text")
        print()
        # close_the_connection(conn)
        time.sleep(0.01)
        counter = counter + 1
        c = '0' + ' ' + '(' + str(counter) + ')' + '.wav'
        pth = os.listdir(audio_path)
        time.sleep(5)
    print("exiting while loop")
    delete()
    return redirect('http://127.0.0.1:5000/')
model = Model("model") rec = KaldiRecognizer(model, 16000) p = pyaudio.PyAudio() stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=8000) stream.start_stream() result = "" init_time = time.time() while True: current_time = time.time() if current_time - init_time < 5: data = stream.read(4000) if len(data) == 0: break if rec.AcceptWaveform(data): #print(rec.Result()) result = result + " " + json.loads(rec.Result())['text'] print(result) else: pass else: break result = result + json.load(rec.FinalResult())['text'] print(result)
def my_link():
    time.sleep(15)
    c = '0' + '.wav'
    counter = 0
    conn = connect()
    model = Model("vosk-model-small-en-in-0.4")
    pth = os.listdir(audio_path)
    while c in pth:
        wf = wave.open(audio_path + '/' + c, 'rb')
        rec = KaldiRecognizer(model, wf.getframerate())
        while True:
            data = wf.readframes(CHUNK)
            if len(data) == 0:
                break
            rec.AcceptWaveform(data)
        result = json.loads(rec.FinalResult())  # parse the JSON result string
        print(c)
        s = result["text"]
        print(s)
        # Text classification starts.
        temp = remove_punct(s)
        temp = tknz_text(temp)
        temp = remove_stopwords(temp)
        temp = stmng(temp)
        # Remove punctuation tokens.
        puncs = set(['"', '(', ')', '.', ',', '-', '<', '>', '/', '\',%',
                     '\\x', '!', '?', "'", 's'])
        temp2 = []
        for i in temp:
            if i[0].isalpha() == True:
                temp2.append(i)
        # Remove empty/whitespace tokens.
        temp1 = []
        for i in temp2:
            if i not in ("", '', " ", ' '):
                temp1.append(i)
        print(temp1)
        fg, word = check_list(conn, temp1)
        if fg == 1:
            print("Abusive Detected")
        else:
            print("Normal Text")
        print()
        # close_the_connection(conn)
        time.sleep(0.01)
        counter = counter + 1
        c = '0' + ' ' + '(' + str(counter) + ')' + '.wav'
        pth = os.listdir(audio_path)
        # print(pth, "-->", c)
        time.sleep(5)
    try:
        delete()
    except:
        return render_template('index.html')
    return render_template('index.html')