def run(self, model, lmbin, trie):
    """Load a DeepSpeech model and serve stream commands from the queue.

    Each item taken from ``self._in_queue`` is unpacked as
    ``cmd, *data``, so items must be sequences whose first element is the
    command name:

    * ``'start'``  -- create a new stream on the model.
    * ``'data'``   -- feed ``data[0].data()``, read as int16 samples, into
      the current stream; ignored when no stream is active.
    * ``'finish'`` -- finish the current stream and emit the transcript
      through ``self.finished``; ignored when no stream is active.

    The loop ends when the queue stays empty for one poll interval and
    ``self._should_quit`` is set.

    Args:
        model: First argument passed to ``Model``.
        lmbin: Language-model argument for ``enableDecoderWithLM``; the
            decoder is only enabled when this is truthy.
        trie: Trie argument forwarded to ``enableDecoderWithLM``.
    """
    model = Model(model, BEAM_WIDTH)
    if lmbin:
        model.enableDecoderWithLM(lmbin, trie, LM_ALPHA, LM_BETA)

    stream = None
    while True:
        # Poll with a timeout so a quit request is noticed even when no
        # commands arrive; otherwise the application would hang on exit.
        try:
            cmd, *data = self._in_queue.get(timeout=0.3)
        except queue.Empty:
            if self._should_quit:
                break
            # No quit signal yet: keep waiting for the next command.
            continue

        if cmd == 'start':
            stream = model.createStream()
        elif cmd == 'data':
            # Audio that arrives outside a start/finish pair is dropped.
            if stream:
                model.feedAudioContent(
                    stream, np.frombuffer(data[0].data(), np.int16))
        elif cmd == 'finish':
            if not stream:
                # A 'finish' without a preceding 'start' has nothing to
                # decode; passing None to finishStream would end this
                # loop with an exception, so the command is skipped.
                continue
            transcript = model.finishStream(stream)
            self.finished.emit(transcript)
            stream = None
class InferenceThread(QObject):
    """Background worker that runs DeepSpeech streaming inference.

    Commands are pushed with :meth:`send_cmd` and processed by :meth:`run`
    on ``self.worker``. Each finished transcript is published through the
    ``finished`` signal.
    """

    # Emitted with the transcript string when a stream is finished.
    finished = Signal(str)

    def __init__(self):
        super().__init__()
        self.in_queue = queue.Queue()
        self.should_quit = False
        self.worker = threading.Thread(target=self.run)

    def send_cmd(self, cmd):
        """Insert a command in the queue to be processed by the thread.

        ``run`` unpacks every item as ``cmd, *data``, so ``cmd`` must be a
        sequence such as ``("start",)`` or ``("data", buffer)``.
        """
        self.in_queue.put(cmd)

    def setQuit(self):
        """Signal to the thread that it should stop running."""
        self.should_quit = True

    def start(self):
        """Start the worker thread."""
        self.worker.start()

    def run(self):
        """Load the model, then serve queued commands until asked to quit.

        Handled commands:

        * ``"start"``  -- create a new stream.
        * ``"data"``   -- feed ``data[0].data()``, read as int16 samples,
          into the current stream; ignored when no stream is active.
        * ``"finish"`` -- finish the current stream and emit the
          transcript; ignored (with a warning) when no stream is active.
        """
        # The model files are looked up next to this module.
        models_dir = os.path.join(os.path.dirname(__file__),
                                  "deepspeech-0.6.1-models")
        self.model = Model(os.path.join(models_dir, "output_graph.pbmm"),
                           BEAM_WIDTH)
        self.model.enableDecoderWithLM(
            os.path.join(models_dir, "lm.binary"),
            os.path.join(models_dir, "trie"),
            LM_ALPHA, LM_BETA)

        stream = None
        while True:
            # Poll with a timeout so a quit request is noticed even when
            # no commands arrive; otherwise the app would hang on exit.
            try:
                cmd, *data = self.in_queue.get(timeout=0.3)
            except queue.Empty:
                if self.should_quit:
                    break
                # No quit signal yet: keep waiting for the next command.
                continue

            if cmd == "start":
                stream = self.model.createStream()
                logging.debug("Starts to process sound")
            elif cmd == "data":
                # Audio that arrives outside a start/finish pair is dropped.
                if stream:
                    self.model.feedAudioContent(
                        stream, np.frombuffer(data[0].data(), np.int16))
            elif cmd == "finish":
                if not stream:
                    # Passing None to finishStream would end the worker
                    # with an exception; skip the command instead.
                    logging.warning(
                        "Ignoring 'finish' command: no active stream")
                    continue
                transcript = self.model.finishStream(stream)
                self.finished.emit(transcript)
                stream = None
                logging.debug("Finishes to process sound")
def main():
    """Transcribe two WAV files through two interleaved DeepSpeech streams.

    Parses the command line, loads the model (and the language model when
    both ``--lm`` and ``--trie`` are given), splits each recording into ten
    parts, feeds the parts alternately to two streams created on the same
    model, and prints both transcripts.
    """

    def read_wav(path):
        """Return ``(frame_rate, samples)`` with the frames read as int16."""
        # The wave reader is a context manager, so the file is closed even
        # when reading the frames raises.
        with wave.open(path, 'rb') as fin:
            rate = fin.getframerate()
            samples = np.frombuffer(fin.readframes(fin.getnframes()),
                                    np.int16)
        return rate, samples

    parser = argparse.ArgumentParser(
        description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument(
        '--alphabet', required=True,
        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument('--lm', nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument(
        '--trie', nargs='?',
        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument('--audio1', required=True,
                        help='First audio file to use in interleaved streams')
    parser.add_argument('--audio2', required=True,
                        help='Second audio file to use in interleaved streams')
    args = parser.parse_args()

    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    if args.lm and args.trie:
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie,
                               LM_ALPHA, LM_BETA)

    fs1, audio1 = read_wav(args.audio1)
    fs2, audio2 = read_wav(args.audio2)

    stream1 = ds.setupStream(sample_rate=fs1)
    stream2 = ds.setupStream(sample_rate=fs2)

    # Alternate between the two streams, one tenth of each file at a time.
    for part1, part2 in zip(np.array_split(audio1, 10),
                            np.array_split(audio2, 10)):
        ds.feedAudioContent(stream1, part1)
        ds.feedAudioContent(stream2, part2)

    print(ds.finishStream(stream1))
    print(ds.finishStream(stream2))
class DSEngine:
    """Streaming speech-to-text engine bound to one websocket client.

    Incoming messages are queued on an ``asyncio.Queue`` and handled one at
    a time by a consumer task. Audio buffers are classified with a WebRTC
    VAD; once enough speech has been seen they are fed to a DeepSpeech
    stream (in the event loop's default executor) and transcriptions are
    sent back over the websocket.

    Instances are built with the async factory :meth:`create`.
    """

    @classmethod
    async def create(cls, websocket=None, **kwargs):
        """Build an engine, load the model and start the consumer task.

        Args:
            websocket: Client connection. Although it defaults to ``None``,
                ``websocket.client[1]`` is read unconditionally, so a real
                connection object is required.
            **kwargs: Configuration values. ``DEEPSPEECH_ROOT_PATH``,
                ``MODEL`` and ``ALPHABET`` are joined into file paths.
                ``LM`` and ``TRIE`` enable the language-model decoder when
                both are present. Optional values with their defaults:
                ``NFEATS`` (26), ``NCONTEXT`` (9), ``BEAMWIDTH`` (500),
                ``LMALPHA`` (0.75), ``LMBETA`` (1.85),
                ``PRE_ALLOC_TIME`` (2.0), ``INPUT_SAMPLE_RATE`` (16000),
                ``MIN_SPEECH_SECS`` (0.05), ``MIN_NOSPEECH_SECS`` (0.05),
                ``DECODE_TRIGGER_INTERVAL`` (1.0).

        Returns:
            The initialised engine.

        Raises:
            ValueError: If ``INPUT_SAMPLE_RATE`` is not positive.
        """
        # Use cls so that subclasses get an instance of their own type.
        self = cls()
        rootPath = kwargs.get("DEEPSPEECH_ROOT_PATH")
        modelPath = os.path.join(rootPath, kwargs.get("MODEL"))
        alphabetPath = os.path.join(rootPath, kwargs.get("ALPHABET"))
        logger.info('Loading model from file {}'.format(modelPath))
        modeLoad_start = default_timer()
        self.model = Model(modelPath,
                           int(kwargs.get("NFEATS", 26)),
                           int(kwargs.get("NCONTEXT", 9)),
                           alphabetPath,
                           int(kwargs.get("BEAMWIDTH", 500)))
        if kwargs.get("LM") is not None and kwargs.get("TRIE") is not None:
            lmPath = os.path.join(rootPath, kwargs.get("LM"))
            triePath = os.path.join(rootPath, kwargs.get("TRIE"))
            self.model.enableDecoderWithLM(
                alphabetPath, lmPath, triePath,
                float(kwargs.get("LMALPHA", 0.75)),
                float(kwargs.get("LMBETA", 1.85)))
        modeLoad_end = default_timer() - modeLoad_start
        logger.info('Loaded model in {:.3}s.'.format(modeLoad_end))

        # Each frame is about 20 ms, so seconds / 0.02 gives the frame count.
        self.pre_alloc_frames = round(
            float(kwargs.get("PRE_ALLOC_TIME", 2.0)) / 0.02)
        self.ctx = self.model.setupStream(
            pre_alloc_frames=self.pre_alloc_frames)
        self.tail = []

        # Resampling state.
        self.inputSr = int(kwargs.get("INPUT_SAMPLE_RATE", 16000))
        self.targetSr = 16000
        self.up, self.down = self._resample_ratio(self.inputSr,
                                                  self.targetSr)
        self.prevResampledChunk = []
        self.resampleOffset = -1

        self.currentTime = 0
        self.secsInQueue = 0.0
        self.secsInStream = 0.0
        self.websocket = websocket
        self.ws_id = websocket.client[1]
        self.lastValidTranscription = ''
        self.dsAsyncLoop = asyncio.get_event_loop()
        self.msgQueue = asyncio.Queue()

        # Voice-activity-detection state.
        self.VAD = webrtcvad.Vad(1)
        self.noSpeechSecs = 0.0  # consecutive no-speech seconds
        self.speechSecs = 0.0  # accumulated speech seconds
        self.minSpeechSecs = float(kwargs.get("MIN_SPEECH_SECS", 0.05))
        self.minNoSpeechSecs = float(kwargs.get("MIN_NOSPEECH_SECS", 0.05))
        self.decodeTriggerInterval = float(
            kwargs.get("DECODE_TRIGGER_INTERVAL", 1.0))
        self.VADQueue = []
        self.VADStartTrimIndx = -1
        self.VADEndTrim = 0
        self.secsInVADqueue = 0.0
        self.feed = False

        self.consumerTask = self.dsAsyncLoop.create_task(
            self.consumerHandler())
        return self

    @staticmethod
    def _resample_ratio(input_sr, target_sr):
        """Return the reduced ``(up, down)`` factors for a rate conversion.

        ``input_sr * up / down == target_sr``; for example 8000 -> 16000
        gives ``(2, 1)`` and 32000 -> 16000 gives ``(1, 2)``.

        Raises:
            ValueError: If either rate is not positive.
        """
        from math import gcd

        if input_sr <= 0 or target_sr <= 0:
            raise ValueError("sample rates must be positive")
        divisor = gcd(input_sr, target_sr)
        return target_sr // divisor, input_sr // divisor

    async def enqueue(self, data):
        """Queue a message for the consumer task."""
        await self.msgQueue.put(data)

    async def consumerHandler(self):
        """Consume queued messages forever; exceptions are logged and re-raised."""
        while True:
            try:
                msg = await self.msgQueue.get()
                try:
                    await self.consume(msg)
                finally:
                    # Always balance get() with task_done(), otherwise a
                    # failing message leaves msgQueue.join() waiting for it.
                    self.msgQueue.task_done()
            except CancelledError:
                logger.info("Done consuming")
                raise
            except Exception:
                logger.exception("ERROR DURING CONSUME")
                raise

    async def consume(self, msg):
        """Handle one client message.

        For an ``AUDIO_BUFFER`` message:

        1. The buffer is stored in ``VADQueue``.
        2. If the buffer is speech and no start index is set yet, the start
           index is placed up to 10 buffers back so that some preceding
           audio is included.
        3. Once ``minSpeechSecs`` of speech has been reached, the buffers
           from the start index onward are concatenated and feeding begins.
        4. While feeding, every buffer goes straight to the DeepSpeech
           stream and the speech/no-speech counters keep being updated.
        5. When ``noSpeechSecs`` exceeds ``minNoSpeechSecs`` the stream is
           finished and the VAD state is reset.

        A ``REQ_TRANSCRIPTION`` message flushes the stream with a final
        transcription. Nothing is processed unless the websocket is in the
        ``CONNECTED`` state.
        """
        if self.websocket.client_state.name == "CONNECTED":
            if msg["type"] == "AUDIO_BUFFER":
                samples = b64decode(msg["data"])
                numOfSamples = len(samples) / 2  # two bytes per sample
                chunkSecs = numOfSamples / self.inputSr
                isSpeech = False
                try:
                    isSpeech = self.VAD.is_speech(samples, self.inputSr)
                except Exception:
                    # Best effort: a buffer the VAD rejects counts as
                    # no-speech instead of stopping the consumer.
                    logger.warning(
                        "VAD ERROR. hint: buffer size must be 10,20 o 30ms in duration. len buffer: %s",
                        len(samples))
                info = {"chunkNum": msg.get("count", 0), "isSpeech": isSpeech}
                logger.info("TIME {:.3}".format(
                    info.get("chunkNum") * chunkSecs))

                # 1: keep every buffer until the stream is finished.
                self.VADQueue.append(samples)
                self.secsInVADqueue += chunkSecs

                # 2: update the counters and mark where speech starts.
                if isSpeech:
                    self.speechSecs += chunkSecs
                    self.noSpeechSecs = 0.0
                    if self.VADStartTrimIndx < 0:
                        # TODO: express this look-back in (milli)seconds.
                        ofs = min(10, len(self.VADQueue))
                        self.VADStartTrimIndx = len(self.VADQueue) - ofs
                else:
                    self.noSpeechSecs += chunkSecs

                # 3: enough speech seen, start feeding from the start index.
                if self.speechSecs >= self.minSpeechSecs and not self.feed:
                    logger.info("Voiced index: %s, buffer queue length: %s",
                                self.VADStartTrimIndx, len(self.VADQueue))
                    self.feed = True
                    buffers = self.VADQueue[self.VADStartTrimIndx:]
                    samples = b''.join(buffers)
                    logger.info(
                        "Total nr of buffers: %s nr of samples concatenated: %s",
                        len(buffers), len(samples))

                # 4: feed the stream, decoding at the trigger interval.
                if self.feed:
                    await self.feedAudio(samples, info)
                    if round(self.secsInStream,
                             2) % self.decodeTriggerInterval == 0:
                        logger.info("DECODING")
                        await self.doStt()
                else:
                    logger.info(
                        "NOT FEEDED BUT QUEUED chunk {}".format(
                            info.get("chunkNum")) +
                        " isSpeech:{}".format(isSpeech) +
                        " nospeechsecs:{:.3}s".format(self.noSpeechSecs) +
                        " speechSecs:{:.3}s".format(self.speechSecs))

                # 5: enough silence after fed audio, finish the stream.
                if (self.noSpeechSecs > self.minNoSpeechSecs
                        and self.secsInStream > 0):
                    logger.info(
                        "FINISH STREAM-noSpeechSecs %s speechSecs %s total %s",
                        self.noSpeechSecs, self.speechSecs,
                        self.secsInStream)
                    self.feed = False
                    self.VADStartTrimIndx = -1
                    self.VADQueue = []
                    self.noSpeechSecs = self.speechSecs = 0.0
                    await self.doStt(finishStream=True)
            elif msg["type"] == "REQ_TRANSCRIPTION":
                # Flush the DeepSpeech stream with a last transcription.
                logger.info("REQ_TRANSCRIPTION")
                await self.doStt(finishStream=True)
            else:
                logger.warning("Message not recognized!")
        # Pause briefly before the consumer loop picks up the next message.
        # NOTE(review): the original nesting level of this sleep could not
        # be recovered with certainty -- confirm it belongs here.
        await asyncio.sleep(0.05)

    async def feedAudio(self, samples, info):
        """Feed ``samples`` to the stream in the default executor.

        A ``CancelledError`` raised while waiting is logged and swallowed;
        any other exception is logged and re-raised.
        """
        feedFuture = self.dsAsyncLoop.run_in_executor(
            None, self._feedAudioContent, samples, info)
        # TODO: are exceptions handled in the right way? A CancelledError
        # can arrive here when the client disconnects while feeding.
        try:
            await feedFuture
        except CancelledError:
            logger.error("feedFuture done %s", feedFuture.done())
            logger.error("feedFuture cancelled %s", feedFuture.cancelled())
        except Exception:
            logger.exception("UKNOWN ERROR")
            raise

    def _feedAudioContent(self, data, msg):
        """Feed raw int16 bytes to the stream, resampling when needed.

        Args:
            data: Audio bytes (two bytes per sample).
            msg: Dict with ``"isSpeech"`` and ``"chunkNum"``, used only for
                logging.

        Returns:
            The updated ``secsInStream`` total.
        """
        if len(data) < 1:
            logger.warning("EMPTY DATA WHILE FEEDING!")
        feed_start = default_timer()
        if self.inputSr != self.targetSr:
            # TODO: test more sampling rates; 8000 Hz -> 16000 Hz is the
            # one reported as working.
            audio = np.array(self.resample(data), dtype=np.int16)
        else:
            audio = np.frombuffer(data, dtype=np.int16)
        self.model.feedAudioContent(self.ctx, audio)
        numOfSamples = len(data) / 2
        secs = numOfSamples / self.inputSr
        self.secsInStream += secs
        feed_end = default_timer() - feed_start
        logger.info('FEED done in: {:.3}s;'.format(feed_end) +
                    ' isSpeech {};'.format(msg["isSpeech"]) +
                    ' seconds feeded {:.3}s;'.format(secs) +
                    ' seconds total {:.3}s;'.format(self.secsInStream) +
                    ' chunk number {}'.format(msg["chunkNum"]))
        return self.secsInStream

    async def doStt(self, finishStream=False):
        """Decode in the default executor and send the result to the client.

        ``secsInStream`` is reset to 0 after every successful decode,
        including intermediate ones. A ``CancelledError`` is logged and
        swallowed; any other exception is logged and re-raised.
        """
        decodeFuture = self.dsAsyncLoop.run_in_executor(
            None, partial(self.decode, finishStream))
        try:
            result = await decodeFuture
            self.secsInStream = 0
            await self.sendResult(result)
        except CancelledError:
            logger.exception("STT TASK CANCELLED")
        except Exception:
            logger.exception("UKNOWN ERROR")
            raise

    def decode(self, finishStream=False):
        """Return the current decoding of the stream.

        With ``finishStream`` the stream is finished with metadata and a new
        stream is set up in ``self.ctx``; otherwise an intermediate decode
        is returned and the stream stays open.
        """
        decode_start = default_timer()
        if finishStream:
            result = self.model.finishStreamWithMetadata(self.ctx)
            setup_start = default_timer()
            self.ctx = self.model.setupStream(
                pre_alloc_frames=self.pre_alloc_frames)
            setup_end = default_timer() - setup_start
            logger.info('Setup done in: {:.3}s.'.format(setup_end))
        else:
            result = self.model.intermediateDecode(self.ctx)
        decode_end = default_timer() - decode_start
        logger.info('Decode done in: {:.3}s.'.format(decode_end))
        return result

    async def sendResult(self, result):
        """Send a ``TRANSCRIPTION`` message if the client is still connected.

        Args:
            result: Either a plain transcript string or a metadata object
                exposing ``items``, ``num_items`` and ``probability``.
        """
        if isinstance(result, str):
            data = {"result": result, "metadata": {}}
        else:
            # Metadata result: rebuild the text from the per-character items.
            transcription = ''.join(item.character for item in result.items)
            data = {
                "result": transcription,
                "metadata": {
                    "numItems": result.num_items,
                    "probability": result.probability,
                    "indx2charMetadata": json.dumps({
                        i: {
                            "char": el.character,
                            "start_time": el.start_time,
                            "timestep": el.timestep
                        }
                        for i, el in enumerate(result.items)
                    })
                }
            }
        logger.info("CONNECTED? %s", self.websocket.client_state.name)
        if self.websocket.client_state.name == "CONNECTED":
            structuredResult = {
                "type": "TRANSCRIPTION",
                "clientId": self.ws_id,
                "data": data
            }
            logger.info("RESULT %s", structuredResult)
            try:
                await self.websocket.send_json(structuredResult)
            except ConnectionClosed as exc:
                logger.error("ConnectionClosed %s", exc)
            except Exception:
                logger.exception("UKNOWN ERROR")
                raise

    async def clear(self):
        """Wait for the queue to drain, then cancel the consumer task."""
        logger.info("CLEARING")
        await self.msgQueue.join()
        logger.info("JOINED")
        self.consumerTask.cancel()
        try:
            await self.consumerTask
        except asyncio.CancelledError:
            logger.info("Consumer task cancelled")
        except Exception:
            logger.exception("UKNOWN ERROR")
            raise
        finally:
            logger.info("Feeder done? %s", self.consumerTask.done())
            logger.info("Feeder cancelled? %s", self.consumerTask.cancelled())
            logger.info("Audio queue size: %s", self.msgQueue.qsize())

    def resample(self, data):
        """Resample one chunk of int16 bytes by ``self.up / self.down``.

        The chunk is zero-padded on both sides before resampling and the
        padded edges of consecutive chunks are overlap-added, so chunk
        boundaries do not introduce discontinuities. Part of the output is
        held back in ``prevResampledChunk`` and released with the next
        chunk.

        Args:
            data: Audio bytes holding N/2 int16 samples.

        Returns:
            Array with the resampled samples ready to be fed.
        """
        nZeros = 10  # TODO: parametrize nZeros
        samples = np.frombuffer(data, dtype=np.int16)
        # Pad with a multiple of `down` so each pad resamples to exactly
        # nZeros * up output samples (10 zeros when down == 1).
        pad = np.zeros(nZeros * self.down, dtype=np.int16)
        padded = np.concatenate((pad, samples, pad))
        resampled = resampler(padded, self.up, self.down)
        # Trailing pad of the previous chunk plus leading pad of this one
        # (nZeros * 4 for the 8 kHz -> 16 kHz case).
        overlap = 2 * nZeros * self.up

        # len() is used because the held-back chunk may be a numpy array,
        # whose truth value is ambiguous.
        if len(self.prevResampledChunk) == 0:
            # First chunk: emit the first half, hold back the second half.
            half = len(resampled) // 2
            self.prevResampledChunk = resampled[half:]
            return resampled[0:half]

        prevLen = len(self.prevResampledChunk)
        merged = np.zeros(prevLen + len(resampled) - overlap)
        # Start from the samples held back on the previous call ...
        merged[0:prevLen] = self.prevResampledChunk
        # ... and add the new chunk so that its padded head overlaps the
        # padded tail of the previous one.
        merged[prevLen - overlap:] += resampled
        # Emit as many samples as this chunk contributes after resampling
        # (equal to len(data) for the 8 kHz -> 16 kHz case).
        outLen = len(samples) * self.up // self.down
        self.prevResampledChunk = merged[outLen:]
        return merged[0:outLen]

    def callback(self, future):
        """Log the exception of a finished future, if it has one."""
        # exception() raises on a cancelled future, so check that first.
        if future.cancelled():
            return
        exc = future.exception()
        if exc:
            # Outside an except block logger.exception has no traceback to
            # attach, so pass the exception explicitly.
            logger.error("ERROR", exc_info=exc)