def getModel(ARGS):
    # Load DeepSpeech model
    if os.path.isdir(ARGS.model):
        model_dir = ARGS.model
        ARGS.model = os.path.join(model_dir, 'output_graph.pb')
        ARGS.scorer = os.path.join(model_dir, ARGS.scorer)

    print('Initializing model...')
    logging.info("ARGS.model: %s", ARGS.model)
    model = deepspeech.Model(ARGS.model)
    # model.addHotWord('Fire', 10)
    # model.addHotWord('Intruder', 10)
    # model.addHotWord('Help', 10)
    # model.addHotWord('Yes', 10)
    # model.addHotWord('No', 10)
    if ARGS.scorer:
        logging.info("ARGS.scorer: %s", ARGS.scorer)
        model.enableExternalScorer(ARGS.scorer)
    return model
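# Usage sketch (assumption, not part of the original source): getModel() only needs
# an object with 'model' and 'scorer' attributes, so a minimal argparse namespace
# like the one below is enough to drive it. The default paths are illustrative.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description="Load a DeepSpeech model")
    parser.add_argument('--model', default='deepspeech-0.9.3-models.pbmm',
                        help="Path to the .pbmm model, or a directory containing output_graph.pb")
    parser.add_argument('--scorer', default='deepspeech-0.9.3-models.scorer',
                        help="Path to the external scorer file")
    example_args = parser.parse_args()
    ds_model = getModel(example_args)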
def _register_(serviceList, pluginProperties):
    global services, plugin, core, audioRecorder, dsModel, stream, defaudsrc, actions, replacements, pattern
    services = serviceList
    plugin = pluginProperties
    core = services["core"][0]
    audioRecorder = services["audioRecorder"][0]
    actions = services["actions"][0]

    with open(r"englishSTT\en_us_replacements.json") as f:
        replacements = json.load(f)
    replacements = dict((re.escape(k), v) for k, v in replacements.items())
    pattern = re.compile("|".join(replacements.keys()))

    defaudsrc = audioRecorder.getAudioSource(device=1)
    try:
        dsModel = deepspeech.Model(
            r"englishSTT/deepspeech-0.9.3-models/deepspeech-0.9.3-models.pbmm",
        )
        dsModel.enableExternalScorer(
            r"englishSTT/deepspeech-0.9.3-models/deepspeech-0.9.3-models.scorer"
        )
    except RuntimeError:
        print("Downloading deepspeech models")
        os.makedirs("englishSTT/deepspeech-0.9.3-models")
        print("Downloading deepspeech .pbmm")
        urllib.request.urlretrieve(
            "https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm",
            "englishSTT/deepspeech-0.9.3-models/deepspeech-0.9.3-models.pbmm")
        print("Downloading deepspeech .scorer")
        urllib.request.urlretrieve(
            "https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer",
            "englishSTT/deepspeech-0.9.3-models/deepspeech-0.9.3-models.scorer")
        dsModel = deepspeech.Model(
            r"englishSTT/deepspeech-0.9.3-models/deepspeech-0.9.3-models.pbmm",
        )
        dsModel.enableExternalScorer(
            r"englishSTT/deepspeech-0.9.3-models/deepspeech-0.9.3-models.scorer"
        )
    # dsModel.enableDecoderWithLM(
    #     r"englishSTT\model\lm.binary",
    #     r"englishSTT\model\trie",
    #     0.75,
    #     1.85)

    # DeepSpeech locks up after the first few frames; this clears that up.
    # THIS WAS ADDED WITH DEEPSPEECH 0.6.0, AND MAY NO LONGER BE NEEDED.
    stream = dsModel.createStream()
    stream.feedAudioContent([0, 0, 0, 0, 65535, 65535, 65535, 65535] * 8192)
    stream.finishStream()

    services["userInterface"][0].addCommands({"trigger": trigger})
def main():
    model = deepspeech.Model(MODEL_PATH)
    model.setBeamWidth(BEAM_WIDTH)
    model.enableExternalScorer(SCORER_PATH)
    stream = model.createStream()

    audio = pyaudio.PyAudio()
    index, name = find_device(audio, 'pulse')
    print(f'select device {name}')
    buffer_size = model.sampleRate() // BUFFERS_PER_SECOND
    audio_stream = audio.open(rate=model.sampleRate(),
                              channels=1,
                              format=audio.get_format_from_width(SAMPLE_WIDTH, unsigned=False),
                              input_device_index=index,
                              input=True,
                              frames_per_buffer=buffer_size,
                              stream_callback=audio_callback)

    num_iterations = BUFFERS_PER_SECOND * 2
    i = 0
    while audio_stream.is_active():
        stream.feedAudioContent(buffer_queue.get())
        if i % num_iterations == 0:
            text = stream.intermediateDecode()
            if text.find('stop') >= 0:
                break
            print(text)
        i += 1

    print(stream.finishStream())
    audio_stream.close()
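# A minimal sketch (assumption, not from the original source) of the buffer_queue /
# audio_callback pair the snippet above relies on: PyAudio invokes the callback from
# its own thread, so raw frames are converted to int16 samples and handed to the
# main loop through a queue, matching what feedAudioContent() expects.
import queue

import numpy as np
import pyaudio

buffer_queue = queue.Queue()

def audio_callback(in_data, frame_count, time_info, status):
    # Convert the raw bytes into the int16 samples DeepSpeech consumes.
    buffer_queue.put(np.frombuffer(in_data, dtype=np.int16))
    # Input-only stream: no output data, keep the stream running.
    return None, pyaudio.paContinue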
def maybe_load_model(self):
    """Load DeepSpeech model if not already loaded."""
    if self.model:
        return

    assert self.model_path, "No model path"
    _LOGGER.debug("Loading model from %s (beam width=%s)", self.model_path, self.beam_width)
    self.model = deepspeech.Model(str(self.model_path), self.beam_width)

    if (self.language_model_path
            and self.language_model_path.is_file()
            and self.trie_path
            and self.trie_path.is_file()):
        _LOGGER.debug(
            "Enabling language model (lm=%s, trie=%s, lm_alpha=%s, lm_beta=%s)",
            self.language_model_path,
            self.trie_path,
            self.lm_alpha,
            self.lm_beta,
        )
        self.model.enableDecoderWithLM(
            str(self.language_model_path),
            str(self.trie_path),
            self.lm_alpha,
            self.lm_beta,
        )
async def main():
    BEAM_WIDTH = 500
    LM_ALPHA = 0.75
    LM_BETA = 1.85
    MODEL = "./models/deepspeech-0.8.2-models.pbmm"
    LANG_MODEL = "./models/lm.binary"
    TRIE = "./models/trie"

    model = deepspeech.Model(MODEL, BEAM_WIDTH)
    model.enableDecoderWithLM(LANG_MODEL, TRIE, LM_ALPHA, LM_BETA)

    vad_audio = VADAudio(aggressiveness=3, device=11, input_rate=48000)

    uri = "ws://localhost:8000/ws"
    ws = await websockets.connect(uri, ping_interval=None)

    stream_context = model.createStream()
    for frame in vad_audio.vad_collector():
        if frame is not None:
            model.feedAudioContent(stream_context, np.frombuffer(frame, np.int16))
        else:
            text = model.finishStream(stream_context)
            # Start a fresh stream right away so a failed send does not leave us
            # feeding audio into an already-finished stream.
            stream_context = model.createStream()
            try:
                await ws.send(text)
                returned = await ws.recv()
                print(returned)
            except Exception:
                # There is clearly a better way to handle this; for now just reconnect.
                print("Reconnecting")
                ws = await websockets.connect(uri, ping_interval=None)
def main(ARGS):
    # Load DeepSpeech model
    if os.path.isdir(ARGS.model):
        model_dir = ARGS.model
        ARGS.model = os.path.join(model_dir, 'output_graph.pb')
        ARGS.alphabet = os.path.join(model_dir, ARGS.alphabet if ARGS.alphabet else 'alphabet.txt')
        ARGS.lm = os.path.join(model_dir, ARGS.lm)
        ARGS.trie = os.path.join(model_dir, ARGS.trie)

    print('Initializing model...')
    logging.info("ARGS.model: %s", ARGS.model)
    logging.info("ARGS.alphabet: %s", ARGS.alphabet)
    model = deepspeech.Model(ARGS.model, ARGS.beam_width)
    if ARGS.lm and ARGS.trie:
        logging.info("ARGS.lm: %s", ARGS.lm)
        logging.info("ARGS.trie: %s", ARGS.trie)
        model.enableDecoderWithLM(ARGS.lm, ARGS.trie, ARGS.lm_alpha, ARGS.lm_beta)

    # Start audio with VAD
    vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness,
                         device=ARGS.device,
                         input_rate=ARGS.rate,
                         file=ARGS.file)
    # vad_collector() returns a generator. New audio blocks are stored in the buffer
    # queue, and whenever the next frame is requested the generator yields it,
    # resuming from the last yield position on the following request.
    print("Listening (ctrl-C to exit)...")
    frames = vad_audio.vad_collector()

    # Stream from microphone to DeepSpeech using VAD
    spinner = None
    if not ARGS.nospinner:
        spinner = Halo(spinner='line')
    stream_context = model.createStream()
    wav_data = bytearray()
    for frame in frames:
        if frame is not None:
            if spinner:
                spinner.start()
            logging.debug("streaming frame")
            model.feedAudioContent(stream_context, np.frombuffer(frame, np.int16))
            if ARGS.savewav:
                wav_data.extend(frame)
        else:
            if spinner:
                spinner.stop()
            logging.debug("end utterance")
            if ARGS.savewav:
                vad_audio.write_wav(
                    os.path.join(ARGS.savewav,
                                 datetime.now().strftime("savewav_%Y-%m-%d_%H-%M-%S_%f.wav")),
                    wav_data)
                wav_data = bytearray()
            text = model.finishStream(stream_context)
            print("Recognized: %s" % text)
            stream_context = model.createStream()
def __init__(self, results_event, config=None):
    if len(signature(super(DeepSpeechLocalStreamingSTT, self).__init__).parameters) == 2:
        super(DeepSpeechLocalStreamingSTT, self).__init__(results_event, config)
    else:
        LOG.warning("Shorter signature found; config will be ignored and results_event will not be handled!")
        super(DeepSpeechLocalStreamingSTT, self).__init__()
        self.results_event = None
    # override language with module specific language selection
    self.language = self.config.get('lang') or self.lang
    self.queue = None
    if not self.language.startswith("en"):
        raise ValueError("DeepSpeech is currently English only")
    model_path = self.config.get("model_path") or \
        os.path.expanduser("~/.local/share/neon/deepspeech-0.9.3-models.pbmm")
    scorer_path = self.config.get("scorer_path") or \
        os.path.expanduser("~/.local/share/neon/deepspeech-0.9.3-models.scorer")
    if not os.path.isfile(model_path):
        LOG.error("Model not found and will be downloaded!")
        LOG.error(model_path)
        get_model()
    self.client = deepspeech.Model(model_path)
    if not scorer_path or not os.path.isfile(scorer_path):
        LOG.warning("You should provide a valid scorer")
        LOG.info("Download a scorer from https://github.com/mozilla/DeepSpeech")
    else:
        self.client.enableExternalScorer(scorer_path)
def __init__(self):
    self.BEAM_WIDTH = 500
    self.LM_ALPHA = 0.75
    self.LM_BETA = 1.85
    self.model_dir = 'DeepSpeech/data/wernicke/model/'
    self.model_file = os.path.join(self.model_dir, 'output_graph.pb')
    # self.model_dir = 'deepspeech-0.6.0-models/'
    # self.model_file = os.path.join(self.model_dir, 'output_graph.pbmm')
    self.lm_file = os.path.join(self.model_dir, 'lm.binary')
    self.trie_file = os.path.join(self.model_dir, 'trie')
    self.save_dir = 'saved_wavs'
    os.makedirs(self.save_dir, exist_ok=True)

    # load segment model
    log.info('Initializing pyAudioAnalysis classifier model...')
    [self.classifier, self.MEAN, self.STD, self.class_names,
     self.mt_win, self.mt_step, self.st_win, self.st_step, _] = aT.load_model("wernicke_server_model")
    self.fs = 16000

    log.info('Initializing deepspeech model...')
    self.model = deepspeech.Model(self.model_file, self.BEAM_WIDTH)
    # Temporarily disabling this. I don't think I have nearly enough samples to start
    # doing LM and trie files, etc.
    self.model.enableDecoderWithLM(self.lm_file, self.trie_file, self.LM_ALPHA, self.LM_BETA)
    log.info('Models ready.')
def main():
    # setup pre-trained model for audio-to-text transcribing
    model_file_path = 'deepspeech-0.6.0-models/output_graph.pbmm'
    beam_width = 500
    model = deepspeech.Model(model_file_path, beam_width)

    lm_file_path = 'deepspeech-0.6.0-models/lm.binary'
    trie_file_path = 'deepspeech-0.6.0-models/trie'
    lm_alpha = 0.75
    lm_beta = 1.85
    model.enableDecoderWithLM(lm_file_path, trie_file_path, lm_alpha, lm_beta)

    # get MongoDB client
    podcast_db = get_db()
    podcast_fs = get_fs(podcast_db)

    # find all segments that are not transcribed
    for segment in podcast_db.segment.find({"segment_transcript": None},
                                           no_cursor_timeout=True):
        key = segment['gridfs_key']
        # read wav audio file for this segment
        data = podcast_fs.get(key)
        audio = wave.open(data, 'rb')
        # transcribe this audio segment to text
        transcript = transcribe_audio_to_text(audio, model)
        audio.close()
        # print(transcript)
        segment['segment_transcript'] = transcript
        # add updated MongoDB record into the collection
        update_record(podcast_db, segment)
def main(ARGS):
    # Load DeepSpeech model
    if os.path.isdir(ARGS.model):
        model_dir = ARGS.model
        ARGS.model = os.path.join(model_dir, 'output_graph.pb')
        ARGS.scorer = os.path.join(model_dir, ARGS.scorer)

    print('Initializing model...')
    logging.info("ARGS.model: %s", ARGS.model)
    model = deepspeech.Model(ARGS.model)
    if ARGS.scorer:
        logging.info("ARGS.scorer: %s", ARGS.scorer)
        model.enableExternalScorer(ARGS.scorer)

    # Start audio with VAD
    vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness,
                         device=ARGS.device,
                         input_rate=ARGS.rate,
                         file=ARGS.file)
    print("Listening (ctrl-C to exit)...")
    frames = vad_audio.vad_collector()

    # Stream from microphone to DeepSpeech using VAD
    spinner = None
    if not ARGS.nospinner:
        spinner = Halo(spinner='line')
    # DeepSpeech model stream
    stream_context = model.createStream()
    wav_data = bytearray()
    for frame in frames:
        if frame is not None:
            if spinner:
                spinner.start()
            logging.debug("streaming frame")
            stream_context.feedAudioContent(np.frombuffer(frame, np.int16))
            if ARGS.savewav:
                wav_data.extend(frame)
        else:
            if spinner:
                spinner.stop()
            logging.debug("end utterance")
            if ARGS.savewav:
                vad_audio.write_wav(
                    os.path.join(ARGS.savewav,
                                 datetime.now().strftime("savewav_%Y-%m-%d_%H-%M-%S_%f.wav")),
                    wav_data)
                wav_data = bytearray()
            text = stream_context.finishStream()
            vad_audio.stream.stop_stream()
            print("Recognized: %s" % text)
            if text != "":
                # Publish the text as a ROS topic. This runs in a Python 3 environment
                # without ROS bindings, so call the command-line tool instead.
                subprocess.run([
                    'rostopic', 'pub', '--once', '/gaan/nlp/user_msg',
                    'std_msgs/String', text
                ])
            # Listen from the microphone again
            vad_audio.stream.start_stream()
            stream_context = model.createStream()
def create_model_from_config(config: Config) -> deepspeech.Model:
    print("Initializing model...")
    model = deepspeech.Model(config.model_path)
    model.setBeamWidth(config.beam_width)
    model.enableExternalScorer(config.scorer)
    model.setScorerAlphaBeta(config.scorer_alpha, config.scorer_beta)
    return model
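# A minimal sketch (assumption, the original Config is not shown here) of the Config
# object this factory expects: any object exposing model_path, beam_width, scorer,
# scorer_alpha and scorer_beta works, for example a small dataclass. The file names
# and scorer alpha/beta values below are illustrative defaults, not the project's.
from dataclasses import dataclass

@dataclass
class Config:
    model_path: str = "deepspeech-0.9.3-models.pbmm"
    scorer: str = "deepspeech-0.9.3-models.scorer"
    beam_width: int = 500
    scorer_alpha: float = 0.93
    scorer_beta: float = 1.18

# model = create_model_from_config(Config())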
def __init__(self):
    super().__init__()
    start_time = time.perf_counter()
    LOG.info("Loading DeepSpeech model...")

    model = self.config['model']
    alphabet = self.config['alphabet']
    num_context = self.config.get('num_context', 9)
    beam_width = self.config.get('beam_width', 512)
    num_features = self.config.get('num_features', 26)
    lm = self.config.get('lm')
    trie = self.config.get('trie')

    self.model = deepspeech.Model(model, num_features, num_context, alphabet, beam_width)
    if lm is not None and trie is not None:
        lm_weight = self.config.get('lm_weight', 1.5)
        vwcw = self.config.get('valid_word_count_weight', 2.25)
        self.model.enableDecoderWithLM(alphabet, lm, trie, lm_weight, vwcw)

    LOG.info("Loaded DeepSpeech model in %0.3fs" % (time.perf_counter() - start_time))
    self.stream_ctx = None
    self.can_stream = True
def _get_model(self) -> deepspeech.Model:
    if not self._model:
        self._model = deepspeech.Model(self.model_file, self.beam_width)
        self._model.enableDecoderWithLM(self.lm_file, self.trie_file,
                                        self.lm_alpha, self.lm_beta)
    return self._model
def __init__(self, results_event, config=None):
    super(DeepSpeechLocalStreamingSTT, self).__init__(results_event, config)
    # override language with module specific language selection
    self.language = self.config.get('lang') or self.lang
    self.queue = None
    if not self.language.startswith("en"):
        raise ValueError("DeepSpeech is currently English only")
    model_path = self.config.get("model_path") or \
        os.path.expanduser("~/.local/share/neon/deepspeech-0.8.1-models.pbmm")
    scorer_path = self.config.get("scorer_path") or \
        os.path.expanduser("~/.local/share/neon/deepspeech-0.8.1-models.scorer")
    if not os.path.isfile(model_path):
        LOG.error("You need to provide a valid model file")
        LOG.error(model_path)
        LOG.info("Download a model from https://github.com/mozilla/DeepSpeech")
        raise FileNotFoundError(model_path)
    if not scorer_path or not os.path.isfile(scorer_path):
        LOG.warning("You should provide a valid scorer")
        LOG.info("Download a scorer from https://github.com/mozilla/DeepSpeech")
    self.client = deepspeech.Model(model_path)
    if scorer_path:
        self.client.enableExternalScorer(scorer_path)
async def activate(self, site):
    # self.log('activate')
    # if not self.active[site]:
    if os.path.isdir(self.model_path):
        # self.log('START DS ASR')
        self.audio_stream[site] = BytesLoop()
        self.active[site] = True
        self.started[site] = False
        await self.client.subscribe('hermod/' + site + '/microphone/audio')

        # Load DeepSpeech model
        # self.log('START DS ASR ACTIVATE ' + self.model_path)
        # deepspeech-0.7.0-models.pbmm
        modelPath = os.path.join(self.model_path, self.modelFile)
        scorerPath = os.path.join(self.model_path, 'deepspeech-0.7.0-models.scorer')
        # lm = os.path.join(self.model_path, 'lm.binary')
        # trie = os.path.join(self.model_path, 'trie')
        self.log('START DS ASR ACTIVATE ' + modelPath)
        # self.models[site] = deepspeech.Model(modelPath, 500)
        # if lm and trie:
        #     self.models[site].enableDecoderWithLM(lm, trie, 0.75, 1.85)
        self.models[site] = deepspeech.Model(modelPath)
        self.models[site].enableExternalScorer(scorerPath)
        self.stream_contexts[site] = self.models[site].createStream()
def deepspeech_stt():
    ARGS_model = 'models/deepspeech.pbmm'
    ARGS_scorer = 'models/deepspeech.scorer'
    model = deepspeech.Model(ARGS_model)
    if ARGS_scorer:
        model.enableExternalScorer(ARGS_scorer)

    vad_audio = VADAudio(aggressiveness=0, device=None, input_rate=16000, file=None)
    print("Listening (ctrl-C to exit)...")
    frames = vad_audio.vad_collector()

    spinner = None
    ARGS_savewav = 1
    # spinner = Halo(spinner='line')
    stream_context = model.createStream()
    wav_data = bytearray()
    for frame in frames:
        if frame is not None:
            if spinner:
                spinner.start()
            logging.debug("streaming frame")
            stream_context.feedAudioContent(np.frombuffer(frame, np.int16))
            if ARGS_savewav:
                wav_data.extend(frame)
        else:
            if spinner:
                spinner.stop()
            logging.debug("end utterance")
            if ARGS_savewav:
                vad_audio.write_wav("input_temp.wav", wav_data)
                wav_data = bytearray()
            text = stream_context.finishStream()
            print("Recognized: %s" % text)
            stream_context = model.createStream()
            return text
def __init__(self):
    dirName = os.path.expanduser('deep_speech_models')
    model_path = dirName + '/deepspeech-0.7.0-models.pbmm'
    scorer_path = dirName + '/deepspeech-0.7.0-models.scorer'
    self.dir_audio = 'audio_tests/'
    self.model = deepspeech.Model(model_path)
    self.model.enableExternalScorer(scorer_path)
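# A hedged usage sketch (assumption, the original class and method names are not
# shown here) of how the model and dir_audio above might be used to batch-transcribe
# the test WAVs: read each 16-bit, 16 kHz, mono file and run it through Model.stt().
import glob
import os
import wave

import numpy as np

def transcribe_dir(model, dir_audio):
    results = {}
    for path in sorted(glob.glob(os.path.join(dir_audio, '*.wav'))):
        with wave.open(path, 'rb') as wav:
            # DeepSpeech expects int16 PCM samples at the model's sample rate.
            audio = np.frombuffer(wav.readframes(wav.getnframes()), np.int16)
        results[path] = model.stt(audio)
    return results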
def main(ARGS):
    # Load DeepSpeech model
    if os.path.isdir(ARGS.model):
        model_dir = ARGS.model
        ARGS.model = os.path.join(model_dir, 'custom_lm_output_graph.pb')
        ARGS.alphabet = os.path.join(model_dir, ARGS.alphabet if ARGS.alphabet else 'alphabet.txt')
        ARGS.lm = os.path.join(model_dir, ARGS.lm)
        ARGS.trie = os.path.join(model_dir, ARGS.trie)

    print("Booting up server...")
    server = Server()

    print('Initializing model...')
    logging.info("ARGS.model: %s", ARGS.model)
    logging.info("ARGS.alphabet: %s", ARGS.alphabet)
    model = deepspeech.Model(ARGS.model, ARGS.n_features, ARGS.n_context,
                             ARGS.alphabet, ARGS.beam_width)
    if ARGS.lm and ARGS.trie:
        logging.info("ARGS.lm: %s", ARGS.lm)
        logging.info("ARGS.trie: %s", ARGS.trie)
        model.enableDecoderWithLM(ARGS.alphabet, ARGS.lm, ARGS.trie,
                                  ARGS.lm_alpha, ARGS.lm_beta)

    # Start audio with VAD
    vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness,
                         device=ARGS.device,
                         input_rate=ARGS.rate)

    # Stream from microphone to DeepSpeech using VAD
    stream_context = model.setupStream()
    wav_data = bytearray()

    print("Warming up model...")
    # Warm up the model. For some reason there is a pause of a few seconds on the
    # 26th frame fed into the model, presumably while it reallocates memory.
    # Feeding empty frames up front makes subsequent latencies much better.
    empty_frame = np.zeros((320,), dtype=np.int16)
    for i in range(26):
        model.feedAudioContent(stream_context, empty_frame)

    print("Listening (ctrl-C to exit)...")
    frames = vad_audio.vad_collector()
    # frames = audio.frame_generator()
    count = 0
    for frame in frames:
        if frame is not None:
            model.feedAudioContent(stream_context, np.frombuffer(frame, np.int16))
            count += 1
            if count > 20:
                text = model.intermediateDecode(stream_context)
                count = 0
                if len(text) > 0:
                    print("Intermediate recognition: %s" % text)
                    server.emit_utterance(text, True)
        else:
            text = model.finishStream(stream_context)
            stream_context = model.setupStream()
            if len(text) > 0:
                print("Recognized: %s" % text)
                server.emit_utterance(text, False)
def main(ARGS):
    # init node
    pub = rospy.Publisher('chatter', String, queue_size=10)
    rospy.init_node('talker', anonymous=True)

    # Load DeepSpeech model
    if os.path.isdir(ARGS.model):
        model_dir = ARGS.model
        ARGS.model = os.path.join(model_dir, 'output_graph.pb')
        ARGS.lm = os.path.join(model_dir, ARGS.lm)
        ARGS.trie = os.path.join(model_dir, ARGS.trie)

    print('Initializing model...')
    logging.info("ARGS.model: %s", ARGS.model)
    model = deepspeech.Model(ARGS.model, ARGS.beam_width)
    if ARGS.lm and ARGS.trie:
        logging.info("ARGS.lm: %s", ARGS.lm)
        logging.info("ARGS.trie: %s", ARGS.trie)
        model.enableDecoderWithLM(ARGS.lm, ARGS.trie, ARGS.lm_alpha, ARGS.lm_beta)

    # Start audio with VAD
    vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness,
                         device=ARGS.device,
                         input_rate=ARGS.rate,
                         file=ARGS.file)
    print("Listening (ctrl-C to exit)...")
    frames = vad_audio.vad_collector()

    # Stream from microphone to DeepSpeech using VAD
    spinner = None
    if not ARGS.nospinner:
        spinner = Halo(spinner='line')
    stream_context = model.createStream()
    wav_data = bytearray()
    for frame in frames:
        if frame is not None:
            if spinner:
                spinner.start()
            logging.debug("streaming frame")
            model.feedAudioContent(stream_context, np.frombuffer(frame, np.int16))
            if ARGS.savewav:
                wav_data.extend(frame)
        else:
            if spinner:
                spinner.stop()
            logging.debug("end utterance")
            if ARGS.savewav:
                vad_audio.write_wav(
                    os.path.join(ARGS.savewav,
                                 datetime.now().strftime("savewav_%Y-%m-%d_%H-%M-%S_%f.wav")),
                    wav_data)
                wav_data = bytearray()
            text = model.finishStream(stream_context)
            print("Recognized: %s" % text)
            stream_context = model.createStream()
            # publish
            rospy.loginfo(text)
            pub.publish(text)
def __init__(self, model, scorer=None):
    logging.basicConfig(level=logging.INFO)
    print('Initializing model...')
    logging.info("Model: %s", model)
    self.model = deepspeech.Model(model)
    if scorer:
        logging.info("Scorer: %s", scorer)
        self.model.enableExternalScorer(scorer)
def init_stt(output_graph_path, scorer_path):
    # global model? How are we supposed to do multiprocessing then?
    global model
    model = deepspeech.Model(output_graph_path)
    # It definitely seems very reasonable to adapt the
    # ctc_decoder_with_kenlm tensorflow operator to use this...
    model.enableExternalScorer(scorer_path)
    logging.debug('Process {}: Loaded models'.format(os.getpid()))
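# A plausible usage sketch (assumption, not from the original source): init_stt can
# serve as a multiprocessing initializer, so each worker process loads its own copy
# of the model into the module-level global before handling work items. The worker
# function and file paths below are hypothetical.
import multiprocessing

def _transcribe_worker(audio_int16):
    # Runs in a worker process; 'model' was set by init_stt in that process.
    return model.stt(audio_int16)

# pool = multiprocessing.Pool(
#     processes=2,
#     initializer=init_stt,
#     initargs=("deepspeech-0.9.3-models.pbmm", "deepspeech-0.9.3-models.scorer"),
# )
# transcripts = pool.map(_transcribe_worker, list_of_int16_buffers)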
def onStart(self):
    super().onStart()
    if not self.checkLanguage():
        self.downloadLanguage()

    self._model = deepspeech.Model(
        f'{self._langPath}/deepspeech-0.6.1-models/output_graph.tflite', 500)
    self._model.enableDecoderWithLM(
        f'{self._langPath}/deepspeech-0.6.1-models/lm.binary',
        f'{self._langPath}/deepspeech-0.6.1-models/trie',
        0.75, 1.85)
def load_deepspeech_model(self):
    # These constants are defined but not used below; they are left over from an
    # older DeepSpeech API that took them as constructor/decoder arguments.
    N_FEATURES = 25
    N_CONTEXT = 9
    BEAM_WIDTH = 500
    LM_ALPHA = 0.75
    LM_BETA = 1.85
    ds = deepspeech.Model('deepspeech_model/deepspeech-0.7.3-models.pbmm')
    return ds
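# Hedged usage sketch (assumption, not part of the original source): the model
# returned above can also transcribe audio incrementally through the streaming API,
# the other common entry point besides Model.stt(). The helper name is illustrative.
import numpy as np

def transcribe_chunks(ds, chunks):
    """chunks: iterable of int16 numpy arrays sampled at ds.sampleRate()."""
    stream = ds.createStream()
    for chunk in chunks:
        stream.feedAudioContent(chunk)
    return stream.finishStream()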
def main(ARGS):
    # Load DeepSpeech model
    if os.path.isdir(ARGS.model):
        model_dir = ARGS.model
        ARGS.model = os.path.join(model_dir, 'output_graph.pb')
        ARGS.alphabet = os.path.join(model_dir, ARGS.alphabet if ARGS.alphabet else 'alphabet.txt')
        ARGS.lm = os.path.join(model_dir, ARGS.lm)
        ARGS.trie = os.path.join(model_dir, ARGS.trie)

    # time.sleep(30)
    to_node("status", "Initializing model...")
    # print('Initializing model...')
    logging.info("ARGS.model: %s", ARGS.model)
    logging.info("ARGS.alphabet: %s", ARGS.alphabet)
    model = deepspeech.Model(ARGS.model, ARGS.n_features, ARGS.n_context,
                             ARGS.alphabet, ARGS.beam_width)
    if ARGS.lm and ARGS.trie:
        logging.info("ARGS.lm: %s", ARGS.lm)
        logging.info("ARGS.trie: %s", ARGS.trie)
        model.enableDecoderWithLM(ARGS.alphabet, ARGS.lm, ARGS.trie,
                                  ARGS.lm_alpha, ARGS.lm_beta)

    # Start audio with VAD
    vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness,
                         device=ARGS.device,
                         input_rate=ARGS.rate)
    to_node("status", "Listening")
    # print("Listening (ctrl-C to exit)...")
    frames = vad_audio.vad_collector()

    # Stream from microphone to DeepSpeech using VAD
    spinner = None
    if not ARGS.nospinner:
        spinner = Halo(spinner='line')
    stream_context = model.setupStream()
    wav_data = bytearray()
    for frame in frames:
        if frame is not None:
            if spinner:
                spinner.start()
            logging.debug("streaming frame")
            model.feedAudioContent(stream_context, np.frombuffer(frame, np.int16))
            if ARGS.savewav:
                wav_data.extend(frame)
        else:
            if spinner:
                spinner.stop()
            logging.debug("end utterance")
            if ARGS.savewav:
                vad_audio.write_wav(
                    os.path.join(ARGS.savewav,
                                 datetime.now().strftime("savewav_%Y-%m-%d_%H-%M-%S_%f.wav")),
                    wav_data)
                wav_data = bytearray()
            text = model.finishStream(stream_context)
            to_node("result", "{}".format(text))
            # print("Recognized: %s" % text)
            stream_context = model.setupStream()
def main(ARGS):
    # pdb.set_trace()
    # Load DeepSpeech model
    if os.path.isdir(ARGS.model):
        model_dir = ARGS.model
        ARGS.model = os.path.join(model_dir, 'output_graph.pb')
        ARGS.scorer = os.path.join(model_dir, ARGS.scorer)

    print('Initializing model...')
    logging.info("ARGS.model: %s", ARGS.model)
    model = deepspeech.Model(ARGS.model)
    if ARGS.scorer:
        logging.info("ARGS.scorer: %s", ARGS.scorer)
        model.enableExternalScorer(ARGS.scorer)

    # Convert the input file to 16 kHz mono WAV before streaming it through VAD
    audio_file = ARGS.file[:-4] + '.wav'
    command = "ffmpeg -i {} -ab 160k -ac 1 -ar 16000 -vn {}".format(ARGS.file, audio_file)
    subprocess.call(command, shell=True)

    # Start audio with VAD
    vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness,
                         device=ARGS.device,
                         input_rate=ARGS.rate,
                         file=audio_file)
    print("Listening (ctrl-C to exit)...")
    frames = vad_audio.vad_collector()

    # Stream from microphone to DeepSpeech using VAD
    spinner = None
    if not ARGS.nospinner:
        spinner = Halo(spinner='line')
    stream_context = model.createStream()
    wav_data = bytearray()
    for frame in frames:
        if frame is not None:
            if spinner:
                spinner.start()
            logging.debug("streaming frame")
            stream_context.feedAudioContent(np.frombuffer(frame, np.int16))
            if ARGS.savewav:
                wav_data.extend(frame)
        else:
            if spinner:
                spinner.stop()
            logging.debug("end utterance")
            if ARGS.savewav:
                vad_audio.write_wav(
                    os.path.join(ARGS.savewav,
                                 datetime.now().strftime("savewav_%Y-%m-%d_%H-%M-%S_%f.wav")),
                    wav_data)
                wav_data = bytearray()
            text = stream_context.finishStream()
            print("Recognized: %s" % text)
            if ARGS.keyboard:
                from pyautogui import typewrite
                typewrite(text)
            stream_context = model.createStream()
def __init__(self, model_file, scorer_file, vad_audio):
    log.info("DeepSpeech model: {}".format(model_file))
    self.model = deepspeech.Model(model_file)
    log.info("DeepSpeech scorer: {}".format(scorer_file))
    self.model.enableExternalScorer(scorer_file)
    self.spinner = Halo(spinner='line')
    self.vad_audio = vad_audio
    self.frames = vad_audio.vad_collector()
def TRANSCRIBING_SERVICE():
    # Lazily create the singleton and configure its DeepSpeech model on first use.
    if transcribing_service._instance is None:
        transcribing_service._instance = transcribing_service()
        transcribing_service.model = deepspeech.Model(model_file_path)
        transcribing_service.model.enableExternalScorer(scorer_file_path)
        transcribing_service.model.setScorerAlphaBeta(lm_alpha, lm_beta)
        transcribing_service.model.setBeamWidth(beam_width)
    return transcribing_service._instance
def main(ARGS):
    # Load DeepSpeech model
    if os.path.isdir(ARGS.model):
        model_dir = ARGS.model
        ARGS.model = os.path.join(model_dir, 'output_graph.pb')
        ARGS.scorer = os.path.join(model_dir, ARGS.scorer)

    print('Initializing model...')
    logging.info("ARGS.model: %s", ARGS.model)
    model = deepspeech.Model(ARGS.model)
    if ARGS.scorer:
        logging.info("ARGS.scorer: %s", ARGS.scorer)
        model.enableExternalScorer(ARGS.scorer)

    # Start audio with VAD
    vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness,
                         device=ARGS.device,
                         input_rate=ARGS.rate,
                         file=ARGS.file)
    print("Listening (ctrl-C to exit)...")
    frames = vad_audio.vad_collector()

    # Stream from microphone to DeepSpeech using VAD
    spinner = None
    if not ARGS.nospinner:
        spinner = Halo(spinner='line')
    stream_context = model.createStream()
    wav_data = bytearray()
    for frame in frames:
        if frame is not None:
            if spinner:
                spinner.start()
            logging.debug("streaming frame")
            stream_context.feedAudioContent(np.frombuffer(frame, np.int16))
            if ARGS.savewav:
                wav_data.extend(frame)
        else:
            if spinner:
                spinner.stop()
            logging.debug("end utterance")
            if ARGS.savewav:
                vad_audio.write_wav("output.wav", wav_data)
                wav_data = bytearray()
                # Send the saved utterance to the external speech-to-text service
                # instead of decoding it with the local stream.
                with open("output.wav", "rb") as audio_file:
                    response = speech_to_text.recognize(
                        audio=audio_file,
                        content_type='audio/wav',
                        timestamps=True,
                        word_confidence=True,
                        smart_formatting=True).get_result()
                print(response['results'])
                text_output = response['results'][0]['alternatives'][0]['transcript']
                text = text_output.strip()
                # text = stream_context.finishStream()
                print("Recognized: %s" % text)
            stream_context = model.createStream()
def main(args):
    # Initialize the ROS node and the topic publisher
    rospy.init_node(ROS_NODE_NAME)
    publisher = rospy.Publisher(ROS_PUBLISHER_TOPIC_NAME, String, queue_size=10)

    # Load DeepSpeech model
    if os.path.isdir(args.model):
        model_dir = args.model
        args.model = os.path.join(model_dir, 'output_graph.pb')
        args.scorer = os.path.join(model_dir, args.scorer)
    model = deepspeech.Model(args.model)
    if args.scorer:
        model.enableExternalScorer(args.scorer)

    # Add hot words; the boost level can be any value in (-inf, +inf)
    with open(HOTWORDS_FILEPATH, "r") as file:
        lines = file.readlines()
    for line in lines:
        hot_word, boost_value = line.split(",")
        model.addHotWord(hot_word, float(boost_value))

    # Start audio with VAD
    vad_audio = VADAudio(aggressiveness=args.vad_aggressiveness,
                         device=args.device,
                         input_rate=args.rate,
                         file=None)
    print("ROS node '%s' started. Listening for speech (ctrl-C to exit)..." % ROS_NODE_NAME)
    frames = vad_audio.vad_collector()

    # Stream from microphone to DeepSpeech using VAD
    spinner = None
    if not args.nospinner:
        spinner = Halo(spinner='line')
    stream_context = model.createStream()
    for frame in frames:
        if not rospy.is_shutdown():
            if frame is not None:
                if spinner:
                    spinner.start()
                stream_context.feedAudioContent(np.frombuffer(frame, np.int16))
            else:
                if spinner:
                    spinner.stop()
                recognized_text = stream_context.finishStream()
                if recognized_text:
                    recognized_text = clean_text(recognized_text)
                    print("Recognized: %s" % recognized_text)
                    publisher.publish(recognized_text)
                stream_context = model.createStream()
        else:
            stream_context.freeStream()
            print("Ctrl-C received. Shutting down ROS node '%s'!" % ROS_NODE_NAME)
            break
def __init__(self, model=None, scorer=None):
    # Hold transcription sessions and the DeepSpeech model statefully here,
    # so the endpoint itself can stay stateless.
    self.sessions = dict()
    self.sessions["last_used_id"] = -1
    self.wait_time = 60  # seconds to wait before killing a stream
    self.timeout_check_time = 20  # seconds between checks for streams past their timeout
    self.timeout()  # repeats indefinitely

    if model is None:
        path = pathlib.Path(__file__).parent.absolute()
        model_path = os.path.join(path, "./deepspeech-0.8.2-models.pbmm")
        self.model = deepspeech.Model(model_path)
    else:
        self.model = deepspeech.Model(model)

    if scorer is None:
        path = pathlib.Path(__file__).parent.absolute()
        scorer_path = os.path.join(path, "./deepspeech-0.8.2-models.scorer")
        self.model.enableExternalScorer(scorer_path)
    else:
        self.model.enableExternalScorer(scorer)