Example #1
def getModel(ARGS):
    # Load DeepSpeech model
    if os.path.isdir(ARGS.model):
        model_dir = ARGS.model
        ARGS.model = os.path.join(model_dir, 'output_graph.pb')
        ARGS.scorer = os.path.join(model_dir, ARGS.scorer)

    print('Initializing model...')
    logging.info("ARGS.model: %s", ARGS.model)
    model = deepspeech.Model(ARGS.model)

    #model.addHotWord('Fire', 10)
    #model.addHotWord('Intruder', 10)
    #model.addHotWord('Help', 10)
    #model.addHotWord('Yes', 10)
    #model.addHotWord('No', 10)

    if ARGS.scorer:
        logging.info("ARGS.scorer: %s", ARGS.scorer)
        model.enableExternalScorer(ARGS.scorer)

    return model
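The commented-out calls above use the hot-word API that shipped with DeepSpeech 0.9. A minimal sketch of wiring them to a command-line flag, inside getModel after the model is created (ARGS.hotwords is a hypothetical "word:boost,word:boost" option, not part of the original script):

    # Hedged sketch: ARGS.hotwords is a hypothetical option, e.g. "fire:10,help:10"
    if getattr(ARGS, 'hotwords', None):
        for pair in ARGS.hotwords.split(','):
            word, _, boost = pair.partition(':')
            model.addHotWord(word, float(boost) if boost else 10.0)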
Example #2
def _register_(serviceList, pluginProperties):
    global services, plugin, core, audioRecorder, dsModel, stream, defaudsrc, actions, replacements, pattern
    services = serviceList
    plugin = pluginProperties
    core = services["core"][0]
    audioRecorder = services["audioRecorder"][0]
    actions = services["actions"][0]

    with open(r"englishSTT\en_us_replacements.json") as f:
        replacements = json.load(f)
        replacements = dict((re.escape(k), v) for k, v in replacements.items())
        pattern = re.compile("|".join(replacements.keys()))

    defaudsrc = audioRecorder.getAudioSource(device=1)

    try:
        dsModel = deepspeech.Model(
            r"englishSTT/deepspeech-0.9.3-models/deepspeech-0.9.3-models.pbmm",
        )
        dsModel.enableExternalScorer(
            r"englishSTT/deepspeech-0.9.3-models/deepspeech-0.9.3-models.scorer"
        )
    except RuntimeError:
        print("Downloading deepspeech models")
        os.makedirs("englishSTT/deepspeech-0.9.3-models")
        print("Downloading deepspeech .pbmm")
        urllib.request.urlretrieve(
            "https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm",
            "englishSTT/deepspeech-0.9.3-models/deepspeech-0.9.3-models.pbmm")
        print("Downloading deepspeech .scorer")
        urllib.request.urlretrieve(
            "https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer",
            "englishSTT/deepspeech-0.9.3-models/deepspeech-0.9.3-models.scorer"
        )
        dsModel = deepspeech.Model(
            r"englishSTT/deepspeech-0.9.3-models/deepspeech-0.9.3-models.pbmm",
        )
        dsModel.enableExternalScorer(
            r"englishSTT/deepspeech-0.9.3-models/deepspeech-0.9.3-models.scorer"
        )

    #dsModel.enableDecoderWithLM(
    #    r"englishSTT\model\lm.binary",
    #    r"englishSTT\model\trie",
    #    0.75,
    #    1.85)

    # DeepSpeech locks up after the first few frames; this clears that up
    # THIS WAS ADDED WITH DEEPSPEECH 0.6.0, AND MAY NO LONGER BE NEEDED
    stream = dsModel.createStream()
    stream.feedAudioContent([0, 0, 0, 0, 65535, 65535, 65535, 65535] * 8192)
    stream.finishStream()

    services["userInterface"][0].addCommands({"trigger": trigger})
Example #3
def main():
    model = deepspeech.Model(MODEL_PATH)
    model.setBeamWidth(BEAM_WIDTH)
    model.enableExternalScorer(SCORER_PATH)

    stream = model.createStream()

    audio = pyaudio.PyAudio()
    index, name = find_device(audio, 'pulse')

    print(f'select device {name}')

    buffer_size = model.sampleRate() // BUFFERS_PER_SECOND
    audio_stream = audio.open(rate=model.sampleRate(),
                              channels=1,
                              format=audio.get_format_from_width(
                                  SAMPLE_WIDTH, unsigned=False),
                              input_device_index=index,
                              input=True,
                              frames_per_buffer=buffer_size,
                              stream_callback=audio_callback)

    num_iterations = BUFFERS_PER_SECOND * 2
    i = 0
    while audio_stream.is_active():
        stream.feedAudioContent(buffer_queue.get())
        if i % num_iterations == 0:
            text = stream.intermediateDecode()
            if text.find('stop') >= 0:
                break
            print(text)
        i += 1

    print(stream.finishStream())
    audio_stream.close()
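find_device is not shown in this excerpt; a minimal sketch of what it might do, assuming it returns (index, name) of the first input device whose name contains the given substring:

def find_device(audio, name_filter):
    # Scan PyAudio's device table for a matching name.
    for i in range(audio.get_device_count()):
        info = audio.get_device_info_by_index(i)
        if name_filter in info['name']:
            return i, info['name']
    raise RuntimeError('no audio device matching %r' % name_filter)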
Example #4
    def maybe_load_model(self):
        """Load DeepSpeech model if not already loaded."""
        if self.model:
            return

        assert self.model_path, "No model path"

        _LOGGER.debug("Loading model from %s (beam width=%s)", self.model_path,
                      self.beam_width)
        self.model = deepspeech.Model(str(self.model_path), self.beam_width)

        if (self.language_model_path and self.language_model_path.is_file()
                and self.trie_path and self.trie_path.is_file()):
            _LOGGER.debug(
                "Enabling language model (lm=%s, trie=%s, lm_alpha=%s, lm_beta=%s)",
                self.language_model_path,
                self.trie_path,
                self.lm_alpha,
                self.lm_beta,
            )

            self.model.enableDecoderWithLM(
                str(self.language_model_path),
                str(self.trie_path),
                self.lm_alpha,
                self.lm_beta,
            )
Example #5
async def main():

    BEAM_WIDTH = 500
    LM_ALPHA = 0.75
    LM_BETA = 1.85
    MODEL = "./models/deepspeech-0.8.2-models.pbmm"
    LANG_MODEL = "./models/lm.binary"
    TRIE = "./models/trie"

    model = deepspeech.Model(MODEL, BEAM_WIDTH)
    model.enableDecoderWithLM(LANG_MODEL, TRIE, LM_ALPHA, LM_BETA)
    vad_audio = VADAudio(aggressiveness=3, device=11, input_rate=48000)

    uri = "ws://localhost:8000/ws"
    ws = await websockets.connect(uri, ping_interval=None)

    stream_context = model.createStream()
    for frame in vad_audio.vad_collector():
        if frame is not None:
            model.feedAudioContent(stream_context, np.frombuffer(frame, np.int16))
        else:
            text = model.finishStream(stream_context)
            try:
                await ws.send(text)
                stream_context = model.createStream()
                returned = await ws.recv()
                print(returned)
            except:  # there is clearly a better way to do this but I have a smol brain
                print("Reconnecting")
                ws = await websockets.connect(uri, ping_interval=None)
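The bare except above reconnects on any error; a narrower variant of the same block, assuming the websockets package's ConnectionClosed exception, keeps unrelated failures visible:

            try:
                await ws.send(text)
                stream_context = model.createStream()
                print(await ws.recv())
            except websockets.exceptions.ConnectionClosed:
                print("Reconnecting")
                ws = await websockets.connect(uri, ping_interval=None)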
Example #6
def main(ARGS):
    # Load DeepSpeech model
    if os.path.isdir(ARGS.model):
        model_dir = ARGS.model
        ARGS.model = os.path.join(model_dir, 'output_graph.pb')
        ARGS.alphabet = os.path.join(
            model_dir, ARGS.alphabet if ARGS.alphabet else 'alphabet.txt')
        ARGS.lm = os.path.join(model_dir, ARGS.lm)
        ARGS.trie = os.path.join(model_dir, ARGS.trie)

    print('Initializing model...')
    logging.info("ARGS.model: %s", ARGS.model)
    logging.info("ARGS.alphabet: %s", ARGS.alphabet)
    model = deepspeech.Model(ARGS.model, ARGS.beam_width)
    if ARGS.lm and ARGS.trie:
        logging.info("ARGS.lm: %s", ARGS.lm)
        logging.info("ARGS.trie: %s", ARGS.trie)
        model.enableDecoderWithLM(ARGS.lm, ARGS.trie, ARGS.lm_alpha,
                                  ARGS.lm_beta)

    # Start audio with VAD
    vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness,
                         device=ARGS.device,
                         input_rate=ARGS.rate,
                         file=ARGS.file)
    # vad_collector() returns a generator. Newly captured audio blocks are
    # saved into the buffer queue; each time the loop below asks for the next
    # frame, the generator yields it, resuming from the last yield position
    # on the following request.
    print("Listening (ctrl-C to exit)...")
    frames = vad_audio.vad_collector()

    # Stream from microphone to DeepSpeech using VAD
    spinner = None
    if not ARGS.nospinner:
        spinner = Halo(spinner='line')
    stream_context = model.createStream()
    wav_data = bytearray()
    for frame in frames:

        if frame is not None:
            if spinner: spinner.start()
            logging.debug("streaming frame")
            model.feedAudioContent(stream_context,
                                   np.frombuffer(frame, np.int16))
            if ARGS.savewav: wav_data.extend(frame)
        else:
            if spinner: spinner.stop()
            logging.debug("end utterence")
            if ARGS.savewav:
                vad_audio.write_wav(
                    os.path.join(
                        ARGS.savewav,
                        datetime.now().strftime(
                            "savewav_%Y-%m-%d_%H-%M-%S_%f.wav")), wav_data)
                wav_data = bytearray()
            text = model.finishStream(stream_context)
            print("Recognized: %s" % text)
            stream_context = model.createStream()
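Several examples on this page rely on the same vad_collector contract: the generator yields raw audio frames while speech is detected, then a single None to mark the end of an utterance. A schematic of that contract (not the real webrtcvad-based implementation, which also buffers padding frames):

def vad_collector_schematic(frames, is_speech):
    # Yield frames during speech, then None once the utterance ends.
    in_utterance = False
    for frame in frames:
        if is_speech(frame):
            in_utterance = True
            yield frame
        elif in_utterance:
            in_utterance = False
            yield None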
Example #7
    def __init__(self, results_event, config=None):
        if len(
                signature(super(DeepSpeechLocalStreamingSTT,
                                self).__init__).parameters) == 2:
            super(DeepSpeechLocalStreamingSTT,
                  self).__init__(results_event, config)
        else:
            LOG.warning(
                f"Shorter Signature Found; config will be ignored and results_event will not be handled!"
            )
            super(DeepSpeechLocalStreamingSTT, self).__init__()
            self.results_event = None
        # override language with module specific language selection
        self.language = self.config.get('lang') or self.lang
        self.queue = None
        if not self.language.startswith("en"):
            raise ValueError("DeepSpeech is currently english only")

        model_path = self.config.get("model_path") or \
            os.path.expanduser("~/.local/share/neon/deepspeech-0.9.3-models.pbmm")
        scorer_path = self.config.get("scorer_path") or \
            os.path.expanduser("~/.local/share/neon/deepspeech-0.9.3-models.scorer")
        if not os.path.isfile(model_path):
            LOG.error("Model not found and will be downloaded!")
            LOG.error(model_path)
            get_model()

        self.client = deepspeech.Model(model_path)

        if not scorer_path or not os.path.isfile(scorer_path):
            LOG.warning("You should provide a valid scorer")
            LOG.info(
                "download scorer from https://github.com/mozilla/DeepSpeech")
        else:
            self.client.enableExternalScorer(scorer_path)
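get_model is not defined in this excerpt; given the default paths in __init__, a plausible implementation (assumed, not the package's actual code) would fetch the 0.9.3 release artifacts much as Example #2 does:

import os
import urllib.request

def get_model():
    # Hedged sketch: download the 0.9.3 model and scorer to the default
    # location assumed by __init__ above.
    base = "https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/"
    dest = os.path.expanduser("~/.local/share/neon")
    os.makedirs(dest, exist_ok=True)
    for name in ("deepspeech-0.9.3-models.pbmm",
                 "deepspeech-0.9.3-models.scorer"):
        urllib.request.urlretrieve(base + name, os.path.join(dest, name))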
Example #8
    def __init__(self):
        self.BEAM_WIDTH = 500
        self.LM_ALPHA = 0.75
        self.LM_BETA = 1.85
        self.model_dir = 'DeepSpeech/data/wernicke/model/'
        self.model_file = os.path.join(self.model_dir, 'output_graph.pb')
        # self.model_dir = 'deepspeech-0.6.0-models/'
        # self.model_file = os.path.join(self.model_dir, 'output_graph.pbmm')
        self.lm_file = os.path.join(self.model_dir, 'lm.binary')
        self.trie_file = os.path.join(self.model_dir, 'trie')

        self.save_dir = 'saved_wavs'
        os.makedirs(self.save_dir, exist_ok=True)

        # load segment model
        log.info('Initializing pyAudioAnalysis classifier model...')
        [
            self.classifier, self.MEAN, self.STD, self.class_names,
            self.mt_win, self.mt_step, self.st_win, self.st_step, _
        ] = aT.load_model("wernicke_server_model")
        self.fs = 16000

        log.info('Initializing deepspeech model...')
        self.model = deepspeech.Model(self.model_file, self.BEAM_WIDTH)
        # Temporarily disabling this. I don't think I have nearly enough samples to start doing LM and trie files, etc
        self.model.enableDecoderWithLM(self.lm_file, self.trie_file,
                                       self.LM_ALPHA, self.LM_BETA)

        log.info('Models ready.')
Example #9
def main():
    # setup pre trained model for audio to text transcribing
    model_file_path = 'deepspeech-0.6.0-models/output_graph.pbmm'
    beam_width = 500
    model = deepspeech.Model(model_file_path, beam_width)
    lm_file_path = 'deepspeech-0.6.0-models/lm.binary'
    trie_file_path = 'deepspeech-0.6.0-models/trie'
    lm_alpha = 0.75
    lm_beta = 1.85
    model.enableDecoderWithLM(lm_file_path, trie_file_path, lm_alpha, lm_beta)

    # get MongoDb client
    podcast_db = get_db()
    podcast_fs = get_fs(podcast_db)

    # find all segments that are not transcribed
    for segment in podcast_db.segment.find({"segment_transcript": None}, \
     no_cursor_timeout=True):
        key = segment['gridfs_key']

        # read wav audio file for this segment
        data = podcast_fs.get(key)
        audio = wave.open(data, 'rb')

        # transcribe this audio segment to text
        transcript = transcribe_audio_to_text(audio, model)
        audio.close()
        # print(transcript)
        segment['segment_transcript'] = transcript

        # add updated MongoDb record into the collection
        update_record(podcast_db, segment)
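transcribe_audio_to_text is not shown; for a 16 kHz mono wave object like the one opened above, a minimal version could feed all frames to the batch API:

import numpy as np

def transcribe_audio_to_text(audio, model):
    # Read the whole segment and run non-streaming recognition.
    frames = audio.readframes(audio.getnframes())
    return model.stt(np.frombuffer(frames, np.int16))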
Example #10
def main(ARGS):
    # Load DeepSpeech model
    if os.path.isdir(ARGS.model):
        model_dir = ARGS.model
        ARGS.model = os.path.join(model_dir, 'output_graph.pb')
        ARGS.scorer = os.path.join(model_dir, ARGS.scorer)

    print('Initializing model...')
    logging.info("ARGS.model: %s", ARGS.model)
    model = deepspeech.Model(ARGS.model)
    if ARGS.scorer:
        logging.info("ARGS.scorer: %s", ARGS.scorer)
        model.enableExternalScorer(ARGS.scorer)

    # Start audio with VAD
    vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness,
                         device=ARGS.device,
                         input_rate=ARGS.rate,
                         file=ARGS.file)
    print("Listening (ctrl-C to exit)...")
    frames = vad_audio.vad_collector()

    # Stream from microphone to DeepSpeech using VAD
    spinner = None
    if not ARGS.nospinner:
        spinner = Halo(spinner='line')
    # Deepspeech model
    stream_context = model.createStream()
    wav_data = bytearray()
    for frame in frames:
        if frame is not None:
            if spinner: spinner.start()
            logging.debug("streaming frame")
            stream_context.feedAudioContent(np.frombuffer(frame, np.int16))
            if ARGS.savewav: wav_data.extend(frame)
        else:
            if spinner: spinner.stop()
            logging.debug("end utterence")
            if ARGS.savewav:
                vad_audio.write_wav(
                    os.path.join(
                        ARGS.savewav,
                        datetime.now().strftime(
                            "savewav_%Y-%m-%d_%H-%M-%S_%f.wav")), wav_data)
                wav_data = bytearray()
            text = stream_context.finishStream()
            vad_audio.stream.stop_stream()

            print("Recognized: %s" % text)
            if text != "":

                # send text as a ROS topic; since this runs in a python3 env without ROS bindings, call the command line
                subprocess.run([
                    'rostopic', 'pub', '--once', '/gaan/nlp/user_msg',
                    'std_msgs/String', text
                ])

            # Listen from microphone
            vad_audio.stream.start_stream()
            stream_context = model.createStream()
Example #11
File: main.py Project: msmedes/deep
def create_model_from_config(config: Config) -> deepspeech.Model:
    print("Initialize model...")
    model = deepspeech.Model(config.model_path)
    model.setBeamWidth(config.beam_width)
    model.enableExternalScorer(config.scorer)
    model.setScorerAlphaBeta(config.scorer_alpha, config.scorer_beta)
    return model
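The Config type is not included in the excerpt; judging from the attributes used above, it could be as simple as a dataclass (the field defaults here are illustrative, not taken from the project):

from dataclasses import dataclass

@dataclass
class Config:
    model_path: str
    scorer: str
    beam_width: int = 500
    scorer_alpha: float = 0.93
    scorer_beta: float = 1.18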
Example #12
    def __init__(self):
        super().__init__()

        start_time = time.perf_counter()
        LOG.info("Loading DeepSpeech model...")

        model = self.config['model']
        alphabet = self.config['alphabet']
        num_context = self.config.get('num_context', 9)
        beam_width = self.config.get('beam_width', 512)
        num_features = self.config.get('num_features', 26)
        lm = self.config.get('lm')
        trie = self.config.get('trie')

        self.model = deepspeech.Model(model, num_features, num_context,
                                      alphabet, beam_width)

        if lm is not None and trie is not None:
            lm_weight = self.config.get('lm_weight', 1.5)
            vwcw = self.config.get('valid_word_count_weight', 2.25)

            self.model.enableDecoderWithLM(alphabet, lm, trie, lm_weight, vwcw)

        LOG.info("Loaded DeepSpeech model in %0.3fs" %
                 (time.perf_counter() - start_time))
        self.stream_ctx = None
        self.can_stream = True
Example #13
    def _get_model(self) -> deepspeech.Model:
        if not self._model:
            self._model = deepspeech.Model(self.model_file, self.beam_width)
            self._model.enableDecoderWithLM(self.lm_file, self.trie_file,
                                            self.lm_alpha, self.lm_beta)

        return self._model
Example #14
    def __init__(self, results_event, config=None):
        super(DeepSpeechLocalStreamingSTT,
              self).__init__(results_event, config)
        # override language with module specific language selection
        self.language = self.config.get('lang') or self.lang
        self.queue = None
        if not self.language.startswith("en"):
            raise ValueError("DeepSpeech is currently english only")

        model_path = self.config.get("model_path") or \
            os.path.expanduser("~/.local/share/neon/deepspeech-0.8.1-models.pbmm")
        scorer_path = self.config.get("scorer_path") or \
            os.path.expanduser("~/.local/share/neon/deepspeech-0.8.1-models.scorer")
        if not os.path.isfile(model_path):
            LOG.error("You need to provide a valid model file")
            LOG.error(model_path)
            LOG.info(
                "download a model from https://github.com/mozilla/DeepSpeech")
            raise FileNotFoundError
        if not scorer_path or not os.path.isfile(scorer_path):
            LOG.warning("You should provide a valid scorer")
            LOG.info(
                "download scorer from https://github.com/mozilla/DeepSpeech")

        self.client = deepspeech.Model(model_path)
        if scorer_path:
            self.client.enableExternalScorer(scorer_path)
Example #15
    async def activate(self, site):
        # self.log('activate')
        #if not self.active[site]:
        if os.path.isdir(self.model_path):
            # self.log('START DS ASR')

            self.audio_stream[site] = BytesLoop()
            self.active[site] = True
            self.started[site] = False
            await self.client.subscribe('hermod/' + site + '/microphone/audio')
            # Load DeepSpeech model
            # self.log('START DS ASR ACTIVATE '+self.model_path)

            #deepspeech-0.7.0-models.pbmm

            modelPath = os.path.join(self.model_path, self.modelFile)
            scorerPath = os.path.join(self.model_path,
                                      'deepspeech-0.7.0-models.scorer')
            # lm = os.path.join(self.model_path, 'lm.binary')
            # trie = os.path.join(self.model_path, 'trie')

            self.log('START DS ASR ACTIVATE ' + modelPath)

            # self.models[site] = deepspeech.Model(modelPath, 500)
            # if lm and trie:
            # self.models[site].enableDecoderWithLM(lm, trie, 0.75, 1.85)
            self.models[site] = deepspeech.Model(modelPath)
            self.models[site].enableExternalScorer(scorerPath)
            self.stream_contexts[site] = self.models[site].createStream()
Example #16
def deepspeech_stt():
    ARGS_model = 'models/deepspeech.pbmm'
    ARGS_scorer = 'models/deepspeech.scorer'
    model = deepspeech.Model(ARGS_model)
    if ARGS_scorer:
        model.enableExternalScorer(ARGS_scorer)

    vad_audio = VADAudio(aggressiveness=0,
                         device=None,
                         input_rate=16000,
                         file=None)
    print("Listening (ctrl-C to exit)...")
    frames = vad_audio.vad_collector()

    spinner = None
    ARGS_savewav = 1
    #spinner = Halo(spinner='line')
    stream_context = model.createStream()
    wav_data = bytearray()
    for frame in frames:
        if frame is not None:
            if spinner: spinner.start()
            logging.debug("streaming frame")
            stream_context.feedAudioContent(np.frombuffer(frame, np.int16))
            if ARGS_savewav: wav_data.extend(frame)
        else:
            if spinner: spinner.stop()
            logging.debug("end utterence")
            if ARGS_savewav:
                vad_audio.write_wav("input_temp.wav", wav_data)
                wav_data = bytearray()
            text = stream_context.finishStream()
            print("Recognized: %s" % text)
            stream_context = model.createStream()
            return text
Example #17
	def __init__(self):
		dirName = os.path.expanduser('deep_speech_models')
		model_path = dirName + '/deepspeech-0.7.0-models.pbmm'
		scorer_path = dirName + '/deepspeech-0.7.0-models.scorer'
		self.dir_audio = 'audio_tests/'
		self.model = deepspeech.Model(model_path)
		self.model.enableExternalScorer(scorer_path)
Example #18
def main(ARGS):
    # Load DeepSpeech model
    if os.path.isdir(ARGS.model):
        model_dir = ARGS.model
        ARGS.model = os.path.join(model_dir, 'custom_lm_output_graph.pb')
        ARGS.alphabet = os.path.join(model_dir, ARGS.alphabet if ARGS.alphabet else 'alphabet.txt')
        ARGS.lm = os.path.join(model_dir, ARGS.lm)
        ARGS.trie = os.path.join(model_dir, ARGS.trie)

    print("Booting up server...")
    server = Server()

    print('Initializing model...')
    logging.info("ARGS.model: %s", ARGS.model)
    logging.info("ARGS.alphabet: %s", ARGS.alphabet)
    model = deepspeech.Model(ARGS.model, ARGS.n_features, ARGS.n_context, ARGS.alphabet, ARGS.beam_width)
    if ARGS.lm and ARGS.trie:
        logging.info("ARGS.lm: %s", ARGS.lm)
        logging.info("ARGS.trie: %s", ARGS.trie)
        model.enableDecoderWithLM(ARGS.alphabet, ARGS.lm, ARGS.trie, ARGS.lm_alpha, ARGS.lm_beta)

    # Start audio with VAD
    vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness,
                         device=ARGS.device,
                         input_rate=ARGS.rate)

    # Stream from microphone to DeepSpeech using VAD
    stream_context = model.setupStream()
    wav_data = bytearray()

    print("Warming up model...")
    # Warm up the model - for some reason there's a few-seconds-long pause on
    # the 26th frame we feed into the model, presumably because it's
    # reallocating memory. In any case, doing this up front makes latencies
    # much better.
    empty_frame = np.zeros((320,), dtype=np.int16)
    for i in range(26):
        model.feedAudioContent(stream_context, empty_frame)

    print("Listening (ctrl-C to exit)...")
    frames = vad_audio.vad_collector()
    # frames = audio.frame_generator()

    count = 0
    for frame in frames:
        if frame is not None:
            model.feedAudioContent(stream_context, np.frombuffer(frame, np.int16))
            count += 1
            if count > 20:
                text = model.intermediateDecode(stream_context)
                count = 0
                if len(text) > 0:
                    print("Intermediate recognition: %s" % text)
                    server.emit_utterance(text, True)
        else:
            text = model.finishStream(stream_context)
            stream_context = model.setupStream()
            if len(text) > 0:
                print("Recognized: %s" % text)
                server.emit_utterance(text, False)
Example #19
def main(ARGS):
    # init node
    pub = rospy.Publisher('chatter', String, queue_size=10)
    rospy.init_node('talker', anonymous=True)

    # Load DeepSpeech model
    if os.path.isdir(ARGS.model):
        model_dir = ARGS.model
        ARGS.model = os.path.join(model_dir, 'output_graph.pb')
        ARGS.lm = os.path.join(model_dir, ARGS.lm)
        ARGS.trie = os.path.join(model_dir, ARGS.trie)

    print('Initializing model...')
    logging.info("ARGS.model: %s", ARGS.model)
    model = deepspeech.Model(ARGS.model, ARGS.beam_width)
    if ARGS.lm and ARGS.trie:
        logging.info("ARGS.lm: %s", ARGS.lm)
        logging.info("ARGS.trie: %s", ARGS.trie)
        model.enableDecoderWithLM(ARGS.lm, ARGS.trie, ARGS.lm_alpha,
                                  ARGS.lm_beta)

    # Start audio with VAD
    vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness,
                         device=ARGS.device,
                         input_rate=ARGS.rate,
                         file=ARGS.file)
    print("Listening (ctrl-C to exit)...")
    frames = vad_audio.vad_collector()

    # Stream from microphone to DeepSpeech using VAD
    spinner = None
    if not ARGS.nospinner:
        spinner = Halo(spinner='line')
    stream_context = model.createStream()
    wav_data = bytearray()
    for frame in frames:
        if frame is not None:
            if spinner: spinner.start()
            logging.debug("streaming frame")
            model.feedAudioContent(stream_context,
                                   np.frombuffer(frame, np.int16))
            if ARGS.savewav: wav_data.extend(frame)
        else:
            if spinner: spinner.stop()
            logging.debug("end utterence")
            if ARGS.savewav:
                vad_audio.write_wav(
                    os.path.join(
                        ARGS.savewav,
                        datetime.now().strftime(
                            "savewav_%Y-%m-%d_%H-%M-%S_%f.wav")), wav_data)
                wav_data = bytearray()
            text = model.finishStream(stream_context)
            print("Recognized: %s" % text)
            stream_context = model.createStream()

            # publish
            rospy.loginfo(text)
            pub.publish(text)
Example #20
 def __init__(self, model, scorer=None):
     logging.basicConfig(level=20)
     print('Initializing model...')
     logging.info("Model: %s", model)
     self.model = deepspeech.Model(model)
     if scorer:
         logging.info("Scorer: %s", scorer)
         self.model.enableExternalScorer(scorer)
Example #21
def init_stt(output_graph_path, scorer_path):
    # global model? How are we supposed to do multiprocessing then?
    global model
    model = deepspeech.Model(output_graph_path)
    # It definitely seems very reasonable to adapt the
    # ctc_decoder_with_kenlm tensorflow operator to use this...
    model.enableExternalScorer(scorer_path)
    logging.debug('Process {}: Loaded models'.format(os.getpid()))
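The module-level global answers the comment's own question: with multiprocessing, each worker process loads its own model via the pool initializer. A sketch, where transcribe_chunk, output_graph_path, scorer_path and audio_chunks (a list of int16 numpy arrays) are illustrative names:

import multiprocessing

def transcribe_chunk(samples):
    # Runs in a worker; 'model' is the per-process global set by init_stt.
    return model.stt(samples)

pool = multiprocessing.Pool(processes=2,
                            initializer=init_stt,
                            initargs=(output_graph_path, scorer_path))
texts = pool.map(transcribe_chunk, audio_chunks)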
Example #22
	def onStart(self):
		super().onStart()

		if not self.checkLanguage():
			self.downloadLanguage()

		self._model = deepspeech.Model(f'{self._langPath}/deepspeech-0.6.1-models/output_graph.tflite', 500)
		self._model.enableDecoderWithLM(f'{self._langPath}/deepspeech-0.6.1-models/lm.binary', f'{self._langPath}/deepspeech-0.6.1-models/trie', 0.75, 1.85)
Example #23
 def load_deepspeech_model(self):
     N_FEATURES = 25
     N_CONTEXT = 9
     BEAM_WIDTH = 500
     LM_ALPHA = 0.75
     LM_BETA = 1.85
     ds = deepspeech.Model('deepspeech_model/deepspeech-0.7.3-models.pbmm')
     return ds
Example #24
def main(ARGS):
    # Load DeepSpeech model
    if os.path.isdir(ARGS.model):
        model_dir = ARGS.model
        ARGS.model = os.path.join(model_dir, 'output_graph.pb')
        ARGS.alphabet = os.path.join(
            model_dir, ARGS.alphabet if ARGS.alphabet else 'alphabet.txt')
        ARGS.lm = os.path.join(model_dir, ARGS.lm)
        ARGS.trie = os.path.join(model_dir, ARGS.trie)

    #time.sleep(30)

    to_node("status", "Initializing model...")
    # print('Initializing model...')
    logging.info("ARGS.model: %s", ARGS.model)
    logging.info("ARGS.alphabet: %s", ARGS.alphabet)
    model = deepspeech.Model(ARGS.model, ARGS.n_features, ARGS.n_context,
                             ARGS.alphabet, ARGS.beam_width)
    if ARGS.lm and ARGS.trie:
        logging.info("ARGS.lm: %s", ARGS.lm)
        logging.info("ARGS.trie: %s", ARGS.trie)
        model.enableDecoderWithLM(ARGS.alphabet, ARGS.lm, ARGS.trie,
                                  ARGS.lm_alpha, ARGS.lm_beta)

    # Start audio with VAD
    vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness,
                         device=ARGS.device,
                         input_rate=ARGS.rate)
    to_node("status", "Listening")
    #print("Listening (ctrl-C to exit)...")
    frames = vad_audio.vad_collector()

    # Stream from microphone to DeepSpeech using VAD
    spinner = None
    if not ARGS.nospinner: spinner = Halo(spinner='line')
    stream_context = model.setupStream()
    wav_data = bytearray()
    for frame in frames:
        if frame is not None:
            if spinner: spinner.start()
            logging.debug("streaming frame")
            model.feedAudioContent(stream_context,
                                   np.frombuffer(frame, np.int16))
            if ARGS.savewav: wav_data.extend(frame)
        else:
            if spinner: spinner.stop()
            logging.debug("end utterence")
            if ARGS.savewav:
                vad_audio.write_wav(
                    os.path.join(
                        ARGS.savewav,
                        datetime.now().strftime(
                            "savewav_%Y-%m-%d_%H-%M-%S_%f.wav")), wav_data)
                wav_data = bytearray()
            text = model.finishStream(stream_context)
            to_node("result", "{}".format(text))
            # print("Recognized: %s" % text)
            stream_context = model.setupStream()
Example #25
def main(ARGS):
    #pdb.set_trace()
    # Load DeepSpeech model
    if os.path.isdir(ARGS.model):
        model_dir = ARGS.model
        ARGS.model = os.path.join(model_dir, 'output_graph.pb')
        ARGS.scorer = os.path.join(model_dir, ARGS.scorer)

    print('Initializing model...')
    logging.info("ARGS.model: %s", ARGS.model)
    model = deepspeech.Model(ARGS.model)
    if ARGS.scorer:
        logging.info("ARGS.scorer: %s", ARGS.scorer)
        model.enableExternalScorer(ARGS.scorer)

    audio_file = ARGS.file[:-4] + '.wav'
    command = "ffmpeg -i {} -ab 160k -ac 1 -ar 16000 -vn {}".format(
        ARGS.file, audio_file)
    subprocess.call(command, shell=True)
    #audio_file =
    # Start audio with VAD
    vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness,
                         device=ARGS.device,
                         input_rate=ARGS.rate,
                         file=audio_file)
    print("Listening (ctrl-C to exit)...")
    frames = vad_audio.vad_collector()

    # Stream from microphone to DeepSpeech using VAD
    spinner = None
    if not ARGS.nospinner:
        spinner = Halo(spinner='line')
    stream_context = model.createStream()
    wav_data = bytearray()

    for frame in frames:
        if frame is not None:
            if spinner: spinner.start()
            logging.debug("streaming frame")
            stream_context.feedAudioContent(np.frombuffer(frame, np.int16))
            if ARGS.savewav: wav_data.extend(frame)
        else:
            if spinner: spinner.stop()
            logging.debug("end utterence")
            if ARGS.savewav:
                vad_audio.write_wav(
                    os.path.join(
                        ARGS.savewav,
                        datetime.now().strftime(
                            "savewav_%Y-%m-%d_%H-%M-%S_%f.wav")), wav_data)
                wav_data = bytearray()
            text = stream_context.finishStream()
            print("Recognized: %s" % text)
            if ARGS.keyboard:
                from pyautogui import typewrite
                typewrite(text)
            stream_context = model.createStream()
Example #26
    def __init__(self, model_file, scorer_file, vad_audio):
        log.info("DeepSpeech model: {}".format(model_file))
        self.model = deepspeech.Model(model_file)

        log.info("DeepSpeech scorer: {}".format(scorer_file))
        self.model.enableExternalScorer(scorer_file)
        self.spinner = Halo(spinner='line')
        self.vad_audio = vad_audio
        self.frames = vad_audio.vad_collector()
Example #27
def TRANSCRIBING_SERVICE():
	if transcribing_service._instance is None:
		transcribing_service._instance=transcribing_service()
		transcribing_service.model=deepspeech.Model(model_file_path)
		transcribing_service.model.enableExternalScorer(scorer_file_path)
		transcribing_service.model.setScorerAlphaBeta(lm_alpha, lm_beta)
		transcribing_service.model.setBeamWidth(beam_width)

	return transcribing_service._instance
Example #28
def main(ARGS):
    # Load DeepSpeech model
    if os.path.isdir(ARGS.model):
        model_dir = ARGS.model
        ARGS.model = os.path.join(model_dir, 'output_graph.pb')
        ARGS.scorer = os.path.join(model_dir, ARGS.scorer)

    print('Initializing model...')
    logging.info("ARGS.model: %s", ARGS.model)
    model = deepspeech.Model(ARGS.model)
    if ARGS.scorer:
        logging.info("ARGS.scorer: %s", ARGS.scorer)
        model.enableExternalScorer(ARGS.scorer)

    # Start audio with VAD
    vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness,
                         device=ARGS.device,
                         input_rate=ARGS.rate,
                         file=ARGS.file)
    print("Listening (ctrl-C to exit)...")
    frames = vad_audio.vad_collector()

    # Stream from microphone to DeepSpeech using VAD
    spinner = None
    if not ARGS.nospinner:
        spinner = Halo(spinner='line')
    stream_context = model.createStream()
    wav_data = bytearray()
    for frame in frames:
        if frame is not None:
            if spinner: spinner.start()
            logging.debug("streaming frame")
            stream_context.feedAudioContent(np.frombuffer(frame, np.int16))
            if ARGS.savewav: wav_data.extend(frame)
        else:
            if spinner: spinner.stop()
            logging.debug("end utterence")
            if ARGS.savewav:
                vad_audio.write_wav("output.wav", wav_data)
                wav_data = bytearray()
            audio_file = open("output.wav", "rb")

            response = speech_to_text.recognize(
                audio=audio_file,
                content_type='audio/wav',
                timestamps=True,
                word_confidence=True,
                smart_formatting=True).get_result()
            print(response['results'])

            text_output = response['results'][0]['alternatives'][0][
                'transcript']
            text = text_output.strip()
            #text = stream_context.finishStream()
            print("Recognized: %s" % text)
            stream_context = model.createStream()
Example #29
def main(args):
    # Initialize Ros Node and the Topic Publisher
    rospy.init_node(ROS_NODE_NAME)
    publisher = rospy.Publisher(ROS_PUBLISHER_TOPIC_NAME,
                                String,
                                queue_size=10)

    # Load DeepSpeech model
    if os.path.isdir(args.model):
        model_dir = args.model
        args.model = os.path.join(model_dir, 'output_graph.pb')
        args.scorer = os.path.join(model_dir, args.scorer)

    model = deepspeech.Model(args.model)
    if args.scorer:
        model.enableExternalScorer(args.scorer)

    # Add hot words, boost level can be (-inf, +inf)
    with open(HOTWORDS_FILEPATH, "r") as hotwords_file:
        for line in hotwords_file:
            hot_word, boost_value = line.split(",")
            model.addHotWord(hot_word, float(boost_value))

    # Start audio with VAD
    vad_audio = VADAudio(aggressiveness=args.vad_aggressiveness,
                         device=args.device,
                         input_rate=args.rate,
                         file=None)
    print("ROS node '%s' started. Listening for speech (ctrl-C to exit)..." %
          ROS_NODE_NAME)
    frames = vad_audio.vad_collector()

    # Stream from microphone to DeepSpeech using VAD
    spinner = None
    if not args.nospinner:
        spinner = Halo(spinner='line')
    stream_context = model.createStream()
    for frame in frames:
        if not rospy.is_shutdown():
            if frame is not None:
                if spinner: spinner.start()
                stream_context.feedAudioContent(np.frombuffer(frame, np.int16))
            else:
                if spinner: spinner.stop()
                recognized_text = stream_context.finishStream()
                if recognized_text:
                    recognized_text = clean_text(recognized_text)
                    print("Recognized: %s" % recognized_text)
                    publisher.publish(recognized_text)
                stream_context = model.createStream()
        else:
            stream_context.freeStream()
            print("Ctrl-C received. Shutting down ROS node '%s'!" %
                  ROS_NODE_NAME)
            break
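The parsing loop above implies HOTWORDS_FILEPATH points at a plain-text file with one word,boost pair per line, where boost can be any float (negative values suppress a word). An illustrative file:

fire,10.0
intruder,7.5
stop,-5.0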
Example #30
 def __init__(self, model=None, scorer=None):
     self.sessions = dict()
     # hold transcription sessions and the deepspeech model statefully here,
     # so the endpoint itself stays stateless
     self.sessions["last_used_id"] = -1
     self.wait_time = 60  # number of seconds to wait before killing a stream
     self.timeout_check_time = 20  # interval in seconds between checks for streams past their timeout since last action
     self.timeout()  # continues to repeat indefinitely
     if model is None:
         path = pathlib.Path(__file__).parent.absolute()
         model_path = os.path.join(path, "./deepspeech-0.8.2-models.pbmm")
         self.model = deepspeech.Model(model_path)
     else:
         self.model = deepspeech.Model(model)
     if scorer is None:
         path = pathlib.Path(__file__).parent.absolute()
         scorer_path = os.path.join(path, "./deepspeech-0.8.2-models.scorer")
         self.model.enableExternalScorer(scorer_path)
     else:
         self.model.enableExternalScorer(scorer)