def translate_file(filename="last5.wav"):
    """Transcribe a mono 16 kHz PCM WAV file with Vosk and return the text.

    Args:
        filename (str, optional): Name of a WAV file in the current
            directory. Defaults to "last5.wav".

    Returns:
        str: The recognised text for the whole file.
    """
    SetLogLevel(-1)
    if not os.path.exists("model"):
        print(
            "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder."
        )
        exit(1)
    filepath = "./" + filename
    wf = wave.open(filepath, "rb")
    if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype(
    ) != "NONE":
        print("Audio file must be WAV format mono PCM.")
        exit(1)
    model = Model("./model")
    rec = KaldiRecognizer(model, 16000)
    # Bug fix: the original kept only the last accepted segment (via an
    # UnboundLocalError try/except hack) and called FinalResult() inside the
    # loop. Collect every finalised segment's text instead.
    texts = []
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            texts.append(json.loads(rec.Result())["text"])
    # FinalResult() flushes whatever audio is still buffered in the recogniser.
    texts.append(json.loads(rec.FinalResult())["text"])
    return " ".join(t for t in texts if t)  # ["results"] for confidence of each word
def __init__(self, command_config="commands.json", alert_sound_enabled=True): """Constructor for AnkiSpeechToCommand. Initialises vosk speech-to-text module, AnkiConnect API handler object, and derives word commands from a JSON file. Args: command_config (str, optional): Filename for the JSON command file. Defaults to "commands.json". alert_sound_enabled (bool, optional): Controls confirmation sound for attach, pause, and unpause commands. Defaults to True. Raises: json.decoder.JSONDecodeError: Handles decode errors from the JSON command file, such as malformed syntax. AnkiVoiceError: Handles anki-voice errors, in particular here for missing command definitions. """ # Verify speech-to-text engine (vosk) model exists if not Path(Path(__file__).resolve().parent, "Model").is_dir(): print("Please download the model from https://github.com/alphacep/vosk-api/blob/master/doc/models.md and unpack as 'model' (directory) in the current folder.") sys.exit(1) # Configure speech-to-text engine SetLogLevel(-10) self._model = Model("model") self._recogniser = KaldiRecognizer(self._model, 16000) self._stream = pyaudio.PyAudio().open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=2048) self._stream.start_stream() # Create AnkiConnect API handler object self._anki_action = AnkiActionHandler( alert_sound_enabled=alert_sound_enabled) # Behaviour configuration self._speech_to_text_paused = False self._alert_sound_enabled = alert_sound_enabled # Parse command JSON configuation self.command_config_load(command_config) # tba self.engine = pyttsx3.init()
def translate_file(filename="last5.wav"):
    """Transcribe *filename* with Vosk, printing intermediate results.

    Returns:
        str: The "text" field of Vosk's final result.
    """
    SetLogLevel(0)
    if not os.path.exists("model"):
        print(
            "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder."
        )
        exit(1)
    wf = wave.open("./" + filename, "rb")
    # Vosk expects single-channel 16-bit PCM.
    mono_pcm = (wf.getnchannels() == 1 and wf.getsampwidth() == 2
                and wf.getcomptype() == "NONE")
    if not mono_pcm:
        print("Audio file must be WAV format mono PCM.")
        exit(1)
    rec = KaldiRecognizer(Model("./model"), wf.getframerate())
    while True:
        chunk = wf.readframes(4000)
        if not chunk:
            break
        if rec.AcceptWaveform(chunk):
            print(rec.Result())
        else:
            print(rec.PartialResult())
    return json.loads(rec.FinalResult())["text"]  # ["results"] for confidence of each word
def speech_to_text(file):
    """Transcribe a WAV file with Vosk.

    Args:
        file: Path to the WAV file to transcribe.

    Returns:
        tuple[list, list]: (timestamp, text) — the start time of the first
        word of each recognised segment, and each segment's transcript.
    """
    SetLogLevel(0)
    timestamp = []
    text = []

    def _collect(res):
        # Record segment start time and transcript when words were detected;
        # a bare {'text': ''} result carries no 'result' word list.
        if ('result' in res):
            timestamp.append(res['result'][0]['start'])
            text.append(res['text'])

    # Open the audio file.
    with wave.open(file, "rb") as wav_file:
        # Build the model and a recogniser at the file's sample rate.
        model = Model("model")
        rec = KaldiRecognizer(model, wav_file.getframerate())
        # Read the audio in blocks of 4000 frames.
        data = wav_file.readframes(4000)
        while len(data) != 0:
            if rec.AcceptWaveform(data):
                _collect(json.loads(rec.Result()))
            # Read the next block of frames.
            data = wav_file.readframes(4000)
        # Bug fix: flush the recogniser so trailing audio after the last
        # finalised segment is not silently dropped.
        _collect(json.loads(rec.FinalResult()))
    return timestamp, text
def __init__(self, fileName):
    """Initialise the recogniser.

    Args:
        fileName: Path of the media file to transcribe; handed to
            judgeCondition, which builds the recogniser state.
    """
    SetLogLevel(0)  # default Vosk log verbosity
    # judgeCondition returns (rec, process) — presumably a KaldiRecognizer
    # and an optional decoding subprocess; confirm in that method.
    self.rec, self.process = self.judgeCondition(fileName)
def main():
    """Transcribe an audio/video file given on the command line.

    Usage: prog -f <file> [-m <model_path>]

    Decodes the input with ffmpeg to 16 kHz mono s16le PCM, feeds it to
    Vosk, and prints the concatenated transcript.
    """
    argv = sys.argv[1:]
    model_path = "./model"
    filename = ""
    try:
        # Bug fix: the long-option names contained stray spaces
        # ("file_name =", "model_path =") and only matched by accident via
        # getopt's prefix matching; also catch the specific getopt error
        # instead of a bare except.
        opts, _ = getopt.getopt(argv, "f:m:", ["file_name=", "model_path="])
    except getopt.GetoptError:
        print("Error with arguments")
        return
    for opt, arg in opts:
        if opt in ['-f', '--file_name']:
            filename = arg
        elif opt in ['-m', '--model_path']:
            model_path = arg
    print("FILE: ", filename, " MODEL: ", model_path)
    if not os.path.exists(model_path):
        print(
            "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder."
        )
        return
    SetLogLevel(-1)
    sample_rate = 16000
    model = Model(model_path)
    rec = KaldiRecognizer(model, sample_rate)
    # ffmpeg converts any input container/codec to raw 16-bit mono PCM.
    process = subprocess.Popen([
        'ffmpeg', '-loglevel', 'quiet', '-i', filename, '-ar',
        str(sample_rate), '-ac', '1', '-f', 's16le', '-'
    ], stdout=subprocess.PIPE)
    result = ""
    while True:
        data = process.stdout.read(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            data = json.loads(rec.Result())
            result += data['text']
    # Flush the final buffered audio.
    data = json.loads(rec.FinalResult())
    result += data['text']
    print("\n")
    print(result)
def __init__(self, wav_audio, model="model-indian"):
    """Hold the audio path and transcription state for one WAV file.

    Args:
        wav_audio: Path to the WAV file to transcribe.
        model: Directory of the Vosk model to load.
    """
    self.model = Model(model)
    self.wav_audio = wav_audio
    # Accumulated plain-text transcript and per-word timing entries.
    self.transcript = ""
    self.timestamped_text = []
    # Lazily-opened wave handles (input and optional output).
    self._wf = None
    self._output_wav = None
    # Silence Vosk logging for subsequent operations.
    SetLogLevel(-1)
def set_up(self):
    """Prepare Vosk: verify the model directory exists, then set log level.

    Exits the process with status 1 when self.model_path is missing.
    """
    if os.path.exists(self.model_path):
        SetLogLevel(level=0)
        return
    print(
        "Please download the model from "
        "https://github.com/alphacep/vosk-api/blob/master/doc/models.md "
        "and unpack as 'model' in the current folder.")
    exit(1)
def init_app(self, config):
    """Configure this service from an application config object.

    Loads the Vosk model from config.MODEL_PATH, records the sample rate,
    and initialises the SFTP client. Exits when the model path is missing.
    """
    SetLogLevel(0)
    model_path = config.MODEL_PATH
    self.rate = int(config.RATE)
    if not os.path.exists(model_path):
        print("Error in model path. Such directory does not exist!")
        exit(1)
    self.model = Model(model_path)
    # File transfer client for fetching/pushing audio.
    self.sftp_client = SftpClient()
    self.sftp_client.init_app(config=config)
    self.config = config
def speech_recog(fileIn):
    """Run Vosk speech recognition over *fileIn* and dump results to JSON.

    The input is decoded to 16 kHz mono s16le PCM via ffmpeg. Every
    intermediate and final Vosk result is collected, and each recognised
    word is tagged with the source file name. The raw result list is written
    next to the input as <fileIn basename>.json.

    Returns:
        The value of words_from_list(datalist) — the flattened word list.
    """
    datalist = []
    SetLogLevel(0)
    if not os.path.exists("model"):
        print(
            "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder."
        )
        exit(1)
    sample_rate = 16000
    model = Model("model")
    rec = KaldiRecognizer(model, sample_rate)
    # ffmpeg converts arbitrary input to the raw PCM stream Vosk expects.
    # (Bug fix: the original wrapped this in `except IndexError: raise`;
    # Popen does not raise IndexError, so that handler was dead code.)
    process = subprocess.Popen([
        'ffmpeg', '-loglevel', 'quiet', '-i', fileIn, '-ar',
        str(sample_rate), '-ac', '1', '-f', 's16le', '-'
    ], stdout=subprocess.PIPE)
    while True:
        data = process.stdout.read(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            result = rec.Result()
            datalist.append(json.loads(result))
    finalResult = rec.FinalResult()
    datalist.append(json.loads(finalResult))
    print(fileIn)
    # Tag every recognised word with its source file.
    for entry in datalist:
        if "result" in entry:
            for word in entry["result"]:
                word.update({"file": fileIn})
    words = words_from_list(datalist)
    with open(os.path.splitext(fileIn)[0] + ".json", "w") as output_json:
        output_json.write(json.dumps(datalist))
    return words
def __init__(self, callback=None, **kwargs):
    """Start recording the microphone and analyse audio with Vosk.

    :param callback: Function called with the recognised text.
    :param kwargs: Optional settings: audio_file_path, language,
        log_level, grammar_file.
    """
    SpeechRecognition.__init__(self, kwargs.get('audio_file_path', None))
    self.main_controller_callback = callback
    # Pull optional configuration from kwargs, applying defaults.
    for name, default in (('language', "model-fr"),
                          ('log_level', -1),
                          ('grammar_file', None)):
        setattr(self, name, kwargs.get(name, default))
    SetLogLevel(self.log_level)
    # Route recognised audio through our callback and begin processing.
    self.set_callback(self.vosk_callback)
    self.start_processing()
def init():
    """Initialise logging config for the application.

    Reads a YAML logging config (name from CONFIG_FILE_NAME, defaulting to
    logging_config.yaml) located next to this module, then silences Vosk
    and TensorFlow log noise.
    """
    config_file_name = os.environ.get('CONFIG_FILE_NAME', 'logging_config.yaml')
    print(
        f'Configuring the logging system from config file: {config_file_name}',
        flush=True)
    config_path = os.path.join(os.path.dirname(__file__), config_file_name)
    try:
        with open(config_path, 'r') as fin:
            logging.config.dictConfig(yaml.load(fin, Loader=yaml.FullLoader))
    except (TypeError, FileNotFoundError, ValueError):
        print('Failed to initialise the logging framework', file=sys.stderr)
        traceback.print_exc(file=sys.stderr)
    # Set Vosk log level to silence output
    SetLogLevel(-1)
    # Set TensorFlow C++ logging to silence non-error output
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
def vosk_model(address):
    """Print Vosk recognition output for the WAV file at *address*.

    Streams the file in 4000-frame chunks, printing intermediate full or
    partial results, then the final result. Exits if the file is not
    mono 16-bit PCM.
    """
    SetLogLevel(2)
    wf = wave.open(address, "rb")
    is_mono_pcm = (wf.getnchannels() == 1 and wf.getsampwidth() == 2
                   and wf.getcomptype() == "NONE")
    if not is_mono_pcm:
        print("Audio file must be WAV format mono PCM.")
        exit(1)
    rec = KaldiRecognizer(Model("../audio_utils/tests/vosk_test/model"),
                          wf.getframerate())
    while True:
        chunk = wf.readframes(4000)
        if not chunk:
            break
        if rec.AcceptWaveform(chunk):
            print(rec.Result())
        else:
            print(rec.PartialResult())
    print(rec.FinalResult())
def decode_file(self, aud_file):
    """Decode a mono PCM WAV file with this instance's recogniser.

    Returns the final recognised text if Vosk's FinalResult contains words;
    otherwise returns the accumulated sentence (lower-cased, stripped) when
    the mean per-word confidence exceeds 0.8, else "".
    """
    SetLogLevel(0)
    sentence = ""
    confidence = 0
    tot = 0
    # Bug fix: open the file in a context manager so the handle is closed on
    # every exit path (the original leaked it when returning f_res["text"]).
    with wave.open(aud_file, "rb") as wf:
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype(
        ) != "NONE":  # checking certain file characteristics
            print("Audio aud_file must be WAV format mono PCM.")
            exit(1)
        while True:  # loop for doing voice recognition
            data = wf.readframes(4000)
            if len(data) == 0:  # done reading audio file
                break
            if self.rec.AcceptWaveform(
                    data):  # finished recognition on segment of audio file
                results = json.loads(self.rec.Result())
                # A bare {'text': ''} means nothing was detected this segment.
                if len(results) > 1:
                    for i in results["result"]:
                        confidence += i["conf"]
                        tot += 1
                    sentence = sentence + " " + results["text"]
            else:
                print(self.rec.PartialResult())
        f_res = json.loads(self.rec.FinalResult())
        # NOTE(review): this early return bypasses the confidence check below,
        # matching the original behaviour — confirm that is intended.
        if len(f_res.items()) > 1:
            return f_res["text"]
    if tot > 0 and confidence / tot > .8:  # checking confidence of recognition
        return sentence.lower().strip()
    elif tot > 0:
        print("confidence too low: " + str(confidence / tot))
    return ""
def decode_file(self, aud_file):
    """Decode a mono PCM WAV file, printing and returning raw Vosk results.

    Returns:
        list[str]: The JSON strings of every finalised recognition segment.
    """
    SetLogLevel(0)
    results = []
    # Bug fix: use a context manager so the WAV handle is always closed
    # (the original never closed it).
    with wave.open(aud_file, "rb") as wf:
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype(
        ) != "NONE":
            print("Audio aud_file must be WAV format mono PCM.")
            exit(1)
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if self.rec.AcceptWaveform(data):
                results.append(self.rec.Result())
    for i in results:
        y = json.loads(i)
        print("---VOSK TEXT---", y["text"])
    print("results:", results)
    return results
def _recognize_vosk(self):
    """Transcribe self.file_name with the small Spanish Vosk model.

    Raises:
        Exception: If the model directory is missing or the file is not
            mono 16-bit PCM WAV.

    Returns:
        str: The recognised text, or "" if decoding the result fails.
    """
    SetLogLevel(0)
    if not os.path.exists("vosk-model-small-es-0.3"):
        raise Exception("Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder.")
    # Bug fix: close the WAV file on all paths via a context manager
    # (the original left it open); also dropped the unused `text` list.
    with wave.open(self.file_name, "rb") as wf:
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            raise Exception("Audio file must be WAV format mono PCM.")
        model = Model("vosk-model-small-es-0.3")
        rec = KaldiRecognizer(model, wf.getframerate())
        rec.SetWords(True)
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            rec.AcceptWaveform(data)
    try:
        res = json.loads(rec.FinalResult())
        return res['text']
    except Exception as e:
        # Best-effort: report the error and return an empty transcript.
        print(e)
        return ""
def speech_to_text(model, audio_file, output_text_file, verbose):
    """Transcribe *audio_file* with *model*, writing/printing each result.

    ffmpeg decodes the input to raw s16le mono at the module-level
    sample_rate; `reconize` yields one text result per segment. Results are
    written to *output_text_file* when given; otherwise verbose printing is
    forced on.
    """
    SetLogLevel(-1)
    process = subprocess.Popen(
        [
            "ffmpeg",
            "-loglevel",
            "quiet",
            "-i",
            audio_file,
            "-ar",
            str(sample_rate),
            "-ac",
            "1",
            "-f",
            "s16le",
            "-",
        ],
        stdout=subprocess.PIPE,
    )
    output_f = None
    try:
        if output_text_file:
            output_f = open(output_text_file, mode="w")
        else:
            verbose = True
        for result in reconize(model, process):
            if result:
                if output_f:
                    output_f.write(f"{result}\n")
                if verbose:
                    print(result)
    finally:
        # Bug fix: close the output file even if recognition raises
        # (the original leaked the handle on error).
        if output_f:
            output_f.close()
def __init__(self, notifier, rate=16000, wav_dir=None, model=os.path.join(_MODEL_DIR, 'model')): """ @see AudioInput.__init__() :type rate: :param rate: The override for the rate, if not the model's one. :type wav_dir: :param wav_dir: Where to save the wave files, if anywhere. :type model: :param model: The path to the Vosk model file. """ # Load in and configure the model. if not os.path.exists(model): raise IOError("Not found: %s" % (model, )) LOG.info("Loading model from %s, this could take a while", model) SetLogLevel(1 if LOG.getLogger().getEffectiveLevel() >= 20 else 2) self._model = Model(model) self._recognizer = KaldiRecognizer(self._model, rate) LOG.info("Model loaded") # Wen can now init the superclass super(VoskInput, self).__init__(notifier, format=pyaudio.paInt16, channels=1, rate=rate, wav_dir=wav_dir) # Where we put the results self._results = []
def transcribe(file_name):
    """Transcribe a WAV file with Vosk and return the recognised text.

    The input is first down-mixed to mono (via pydub) and written to
    generate/audio.wav, which is then streamed through the recogniser.

    Returns:
        str: The "text" field of Vosk's final result.
    """
    # Force mono: Vosk requires single-channel PCM.
    sound = AudioSegment.from_wav(file_name)
    sound = sound.set_channels(1)
    sound.export("generate/audio.wav", format="wav")
    SetLogLevel(-1)
    if not os.path.exists("model"):
        print(
            "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder."
        )
        exit(1)
    # Bug fix: close the intermediate WAV via a context manager (the
    # original never closed it) and drop the dead if/else whose branches
    # were both `pass`.
    with wave.open('generate/audio.wav', "rb") as wf:
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype(
        ) != "NONE":
            print("Audio file must be WAV format mono PCM.")
            exit(1)
        model = Model("model")
        rec = KaldiRecognizer(model, wf.getframerate())
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            rec.AcceptWaveform(data)
    r = json.loads(rec.FinalResult())
    return r['text']
#!/usr/bin/env python3 from vosk import Model, KaldiRecognizer, SetLogLevel import sys import os import subprocess import srt import json import datetime SetLogLevel(-1) if not os.path.exists("./VTT/model"): print("Trying to download voice model, this is a one time thing and may take a while...") try: if not os.path.exists("./VTT/model.zip"): import urllib.request print("Downloading...") urllib.request.urlretrieve("https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip", "./VTT/model.zip") import zipfile import platform print("Extracting...") with zipfile.ZipFile("./VTT/model.zip", 'r') as zip_ref: zip_ref.extractall("./VTT/") ##Rename the folder: files = os.listdir('./VTT') for file in files: if 'model' in file and '.zip' not in file: if platform.system() == "Windows": status = subprocess.call('copy %s model /e'%('./VTT/'+file), shell=True) else:
def __init__(self, model_path, text_processor=None):
    """Load a Vosk model for 16 kHz audio.

    Args:
        model_path: Directory containing the Vosk model.
        text_processor: Optional post-processor applied to transcripts.
    """
    # Silence Vosk log output before the (noisy) model load.
    SetLogLevel(-1)
    self.vosk_model = Model(model_path)
    # All audio is fed to the recogniser at this sample rate.
    self.sample_rate = 16000
    self.text_processor = text_processor
#!/usr/bin/env python3 from vosk import Model, KaldiRecognizer, SetLogLevel import sys import os import glob import wave from pathlib import Path SetLogLevel(0) home = str(Path.home()) modelDir = os.path.join(home, "git", "callrail_voice_to_text", "Data", "SpeechModel", "model") if not os.path.exists(modelDir): print( "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the folder:" + modelDir) exit(1) model = Model(modelDir) wavFileDir = os.path.join(home, "git", "callrail_voice_to_text", "Data", "SoundEncodeToWav") # Establish destination directory, attmept to create the directory try: os.mkdir(wavFileDir) except OSError as error: print(error) destinationDir = os.path.join(home, "git", "callrail_voice_to_text", "Data", "WavToTextResults")
def __init__(self, modelpath, log_prefix='[vosk_stt]'):
    """Remember the model path and log prefix; use default Vosk logging.

    Args:
        modelpath: Path to the Vosk model directory.
        log_prefix: Prefix prepended to this object's log messages.
    """
    SetLogLevel(0)
    # Model loading is deferred; only the path is stored here.
    self.model = modelpath
    self.log_prefix = log_prefix
def get_recognizer(self, framerate):
    """Build a KaldiRecognizer for *framerate* using the bundled model.

    Args:
        framerate: Sample rate (Hz) of the audio to be recognised.

    Returns:
        KaldiRecognizer: A recogniser backed by the plugin's Vosk model.
    """
    # Suppress Vosk's log chatter while the model loads.
    SetLogLevel(-1)
    model_dir = os.path.join(c.PLUGIN_PATH, "vosk_alternatives", "model")
    return KaldiRecognizer(Model(model_dir), framerate)
def __init__(self):
    """Set the log level and load the Vosk model."""
    # Verbosity comes from the application config; applied before the
    # (potentially noisy) model load.
    SetLogLevel(config.vosk_log_level)
    self.model = Model(config.vosk_model_dir)
def gen_subparts(input_file, model_dir, verbose=False, partlen=4, progress=False):
    """Yield SubPart subtitle chunks for *input_file* using Vosk.

    Audio is decoded via ffmpeg to 16 kHz mono PCM; recognised words are
    buffered and flushed as SubParts of roughly *partlen* seconds. A final
    SubPart covers any remaining partial result up to the file's duration
    (measured with ffprobe).

    Args:
        input_file: Media file to transcribe.
        model_dir: Vosk model directory.
        verbose: When True, keep Vosk's default log output.
        partlen: Target maximum chunk length in seconds.
        progress: When True, show a tqdm progress bar in seconds.
    """
    SetLogLevel(0 if verbose else -1)
    model = Model(model_dir)
    rec = KaldiRecognizer(model, 16000)
    # ffmpeg converts any input to the raw PCM stream Vosk expects.
    process = subprocess.Popen([
        'ffmpeg', '-loglevel', 'quiet', '-i', input_file, '-ar',
        str(16000), '-ac', '1', '-f', 's16le', '-'
    ], stdout=subprocess.PIPE)
    # Total duration (seconds) via ffprobe, used for the progress bar and
    # as the end time of the trailing partial-result chunk.
    r = subprocess.run(
        "ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1"
        .split() + [input_file],
        stdout=subprocess.PIPE)
    duration = float(r.stdout.decode('utf-8').strip())
    if progress:
        pbar = tqdm(total=duration, unit="s")
    prev_end = 0
    while True:
        data = process.stdout.read(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            r = json.loads(rec.Result())
            if 'result' in r:
                # Buffer of words not yet emitted as a SubPart.
                resultpart = []  # TODO: use this across AcceptWaveform calls
                for result in r['result']:
                    # Flush the buffer once it spans >= partlen seconds.
                    if len(resultpart) > 0 and float(result['end']) - float(
                            resultpart[0]['start']) >= partlen:
                        # NOTE(review): the generator variable below shadows
                        # the outer `r` (the parsed result dict) — works, but
                        # fragile; confirm before refactoring.
                        yield SubPart(start=resultpart[0]['start'],
                                      end=float(resultpart[-1]['end']),
                                      text=" ".join(r['word'] for r in resultpart))
                        prev_end = float(resultpart[-1]['end'])
                        resultpart = []
                    # A single word longer than partlen becomes its own chunk.
                    if float(result['end'] - result['start']) >= partlen:
                        yield SubPart(start=float(result['start']),
                                      end=float(result['end']),
                                      text=result['word'])
                        prev_end = float(result['end'])
                        resultpart = []
                    else:
                        resultpart.append(result)
                    if progress:
                        pbar.update(float(result['end'] - pbar.n))
                # Flush whatever remains of this recognition segment.
                if len(resultpart) > 0:
                    yield SubPart(start=float(resultpart[0]['start']),
                                  end=float(resultpart[-1]['end']),
                                  text=" ".join(r['word'] for r in resultpart))
                    prev_end = float(resultpart[-1]['end'])
                    resultpart = []
        else:
            pass
            #print(rec.PartialResult())
            #pprint(rec.PartialResult())
    if progress:
        pbar.close()
    # Emit the trailing partial result as a final chunk up to end-of-file.
    r = json.loads(rec.PartialResult())
    text = r['partial']
    yield SubPart(start=prev_end, end=duration, text=text)
def _enable_logs(cls, vosk_logs): if vosk_logs: SetLogLevel(0) # Vosk logs logging.basicConfig(level=logging.DEBUG)