def speech_to_text(input_file, file_length, return_speed_per_chunk=False, chunk_size=10):
    """
    Compute the words pronounced in the input_file

    :param input_file: sound file path
    :param file_length: time length of the input file (in seconds)
    :param return_speed_per_chunk: if True, the function returns a list with the
        transcript of each chunk; if False, it returns all the words in the
        extract as a single string
    :param chunk_size: length of each audio chunk (in seconds)
    :return: words as string (or list of strings, one per chunk)
    """
    # set up the model
    if return_speed_per_chunk:
        result = []
    else:
        result = ""
    recognizer = Model("models/deepspeech-0.8.2-models.pbmm")
    recognizer.setBeamWidth(2000)
    recognizer.enableExternalScorer("models/deepspeech-0.8.2-models.scorer")
    desired_sample_rate = recognizer.sampleRate()

    # convert the input file into smaller audio chunks (apparently works better)
    CHUNK_SIZE = chunk_size
    n_chunks = int(file_length // CHUNK_SIZE)
    for i in range(n_chunks):
        tfm = sox.Transformer()
        tfm.trim(i * CHUNK_SIZE, (i + 1) * CHUNK_SIZE)
        tfm.set_output_format(channels=1)
        tfm.build(input_file, "temp_folder/chunked_file{}.wav".format(i))

        # pad each chunk with silence on both sides before recognition
        # cmb = sox.Combiner()
        input_list = [
            "audio-files/silence.wav",
            "temp_folder/chunked_file{}.wav".format(i),
            "audio-files/silence.wav"
        ]
        input_list_correct_sample_rate = list(
            map(lambda file: convert_samplerate(file, desired_sample_rate)[1], input_list))
        audio = np.concatenate(input_list_correct_sample_rate)
        # cmb.build(input_list, "temp_folder/chunked_file_with_silence{}.wav".format(i),
        #           combine_type="concatenate")
        # fs, audio = convert_samplerate("temp_folder/chunked_file_with_silence{}.wav".format(i),
        #                                desired_sample_rate)

        if return_speed_per_chunk:
            result.append(recognizer.stt(audio))
        else:
            result += recognizer.stt(audio) + " "  # keep a space between chunk transcripts

        os.remove("temp_folder/chunked_file{}.wav".format(i))
        # os.remove("temp_folder/chunked_file_with_silence{}.wav".format(i))

    print(result)
    return result
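# Several snippets in this section call a convert_samplerate() helper without
# defining it. A minimal sketch, assuming the SoX-based approach of the official
# DeepSpeech example client (the DeepSpeechRecognizer class further down carries
# the same logic as a private method). The default rate of 16000 is an assumption
# to cover the older snippets that call it with a single argument:
import shlex
import subprocess
from shlex import quote

import numpy as np


def convert_samplerate(audio_path, desired_sample_rate=16000):
    # Ask SoX to decode the file to raw 16-bit mono PCM at the target rate.
    sox_cmd = ('sox {} --type raw --bits 16 --channels 1 --rate {} '
               '--encoding signed-integer --endian little --compression 0.0 '
               '--no-dither - ').format(quote(audio_path), desired_sample_rate)
    try:
        output = subprocess.check_output(shlex.split(sox_cmd), stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        raise RuntimeError('SoX returned non-zero status: {}'.format(e.stderr))
    except OSError as e:
        raise OSError(e.errno, 'SoX not found, use {}hz files or install it: {}'.format(
            desired_sample_rate, e.strerror))
    return desired_sample_rate, np.frombuffer(output, np.int16)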
def tflite_worker(model, lm, trie, queue_in, queue_out, gpu_mask):
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_mask)
    ds = Model(model, BEAM_WIDTH)
    ds.enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA)

    while True:
        try:
            msg = queue_in.get()
            filename = msg['filename']
            wavname = os.path.splitext(os.path.basename(filename))[0]
            fin = wave.open(filename, 'rb')
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
            fin.close()
            decoded = ds.stt(audio)
            queue_out.put({
                'wav': wavname,
                'prediction': decoded,
                'ground_truth': msg['transcript']
            })
        except FileNotFoundError as ex:
            print('FileNotFoundError: ', ex)

        print(queue_out.qsize(), end='\r')  # Update the current progress
        queue_in.task_done()
def tflite_worker(model, scorer, queue_in, queue_out, gpu_mask):
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_mask)
    ds = Model(model)
    ds.enableExternalScorer(scorer)  # without this the scorer argument would go unused

    while True:
        try:
            msg = queue_in.get()
            filename = msg['filename']
            fin = wave.open(filename, 'rb')
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
            fin.close()
            decoded = ds.stt(audio)
            queue_out.put({
                'wav': filename,
                'prediction': decoded,
                'ground_truth': msg['transcript']
            })
        except FileNotFoundError as ex:
            print('FileNotFoundError: ', ex)

        print(queue_out.qsize(), end='\r')  # Update the current progress
        queue_in.task_done()
def mainCall(model=ROOT_DIR + "models/output_graph.pbmm",
             alphabet=ROOT_DIR + "models/alphabet.txt",
             lm=ROOT_DIR + "models/lm.binary",
             trie=ROOT_DIR + "models/trie",
             audio=ROOT_DIR + "test.wav"):
    print('Loading model from file {}'.format(model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if lm and trie:
        print('Loading language model from files {} {}'.format(lm, trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    fin = wave.open(audio, 'rb')
    fs = fin.getframerate()
    if fs != 16000:
        print('Warning: original sample rate ({}) is different than 16kHz. '
              'Resampling might produce erratic speech recognition.'.format(fs),
              file=sys.stderr)
        fs, audio = convert_samplerate(audio)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1/16000)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    res = ds.stt(audio, fs)
    print(res)
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length),
          file=sys.stderr)
    return res
def extract_text(AUDIO):
    ds = Model(MODEL, N_FEATURES, N_CONTEXT, ALPHABET, BEAM_WIDTH)
    ds.enableDecoderWithLM(ALPHABET, LM, TRIE, LM_ALPHA, LM_BETA)

    fin = wave.open(AUDIO, 'rb')
    fs = fin.getframerate()
    if fs != 16000:
        print('Warning: original sample rate ({}) is different than 16kHz. '
              'Resampling might produce erratic speech recognition.'.format(fs),
              file=sys.stderr)
        fs, audio = convert_samplerate(AUDIO)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1 / 16000)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    INFERENCE_RESULT = ds.stt(audio, fs)
    print(INFERENCE_RESULT)
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length),
          file=sys.stderr)

    # write_text_to_file(INFERENCE_RESULT)
    return INFERENCE_RESULT
def main():
    # import soundfile as sf
    # for format, format_desc in sf.available_formats().items():
    #     print(f'Format: {format} {format_desc}')
    #     for subtype, st_desc in sf.available_subtypes().items():
    #         print(f'{subtype} {st_desc}')
    #     print()
    print(create_args_str(args))

    print(f'Loading model from file {args.model}', file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print(f'Loaded model in {model_load_end:.3}s.', file=sys.stderr)

    # if args.lm and args.trie:
    #     print(f'Loading language model from files {args.lm} {args.trie}', file=sys.stderr)
    #     lm_load_start = timer()
    #     ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    #     lm_load_end = timer() - lm_load_start
    #     print(f'Loaded language model in {lm_load_end:.3}s.', file=sys.stderr)

    corpus = get_corpus('ls')
    corpus_entry = corpus[0]
    for i, segment in enumerate(corpus_entry[:5]):
        audio, rate = segment.audio, segment.rate
        transcription = ds.stt(audio, rate)
        print(f'transcription: \t{transcription}')
        print(f'actual: \t\t{segment.text}')
class MozillaDeepSpeechEngine(Engine):
    def __init__(self, pbmm_path: str, scorer_path: str):
        self._model = Model(pbmm_path)
        self._model.enableExternalScorer(scorer_path)
        self._audio_sec = 0.
        self._proc_sec = 0.

    def transcribe(self, path: str) -> str:
        audio, sample_rate = soundfile.read(path, dtype='int16')
        assert sample_rate == self._model.sampleRate()
        self._audio_sec += audio.size / sample_rate

        start_sec = time.time()
        res = self._model.stt(audio)
        self._proc_sec += time.time() - start_sec

        return res

    def rtf(self) -> float:
        return self._proc_sec / self._audio_sec

    def delete(self) -> None:
        pass

    def __str__(self) -> str:
        return 'Mozilla DeepSpeech'
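# A minimal usage sketch for the engine above; the model, scorer, and WAV paths
# are placeholders for whatever 16 kHz mono files you actually have, not files
# shipped with this code:
engine = MozillaDeepSpeechEngine('deepspeech-0.9.3-models.pbmm',
                                 'deepspeech-0.9.3-models.scorer')
print(engine.transcribe('sample.wav'))
# rtf() reports processing time divided by audio duration; < 1.0 means
# faster than real time.
print('real-time factor: {:.3f}'.format(engine.rtf()))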
def s2t(file):
    ds = Model(MODEL_FILE, 500)
    ds.enableDecoderWithLM(LANG_MODEL, TRIE_FILE, 1.50, 2.25)
    fs, audio = wav.read(file)
    data = ds.stt(audio)
    return data
def recognize_DS(audio1, data):
    beam_width = 500  # how many candidate word sequences the decoder keeps
    model_name = data['wake']['model name']
    ds = Model(model_name)
    ds.setBeamWidth(beam_width)
    audio1 = np.frombuffer(audio1.frame_data, np.int16)  # convert raw PCM into a numpy array
    return ds.stt(audio1)  # return the predicted transcript
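# recognize_DS() expects `audio1` to carry raw PCM in a .frame_data attribute.
# A hedged usage sketch, assuming the audio comes from the speech_recognition
# package (its AudioData exposes .frame_data) and is captured at 16 kHz mono;
# the data_config dict below is a placeholder, not the original configuration:
import speech_recognition as sr

r = sr.Recognizer()
with sr.Microphone(sample_rate=16000) as source:
    captured = r.listen(source)  # returns an AudioData with .frame_data

data_config = {'wake': {'model name': 'deepspeech-0.9.3-models.pbmm'}}
print(recognize_DS(captured, data_config))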
def client(audio_file, lang="uk"):
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    model = "./uk.tflite"
    ds = Model(model)
    # ds.enableExternalScorer("kenlm.scorer")
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    desired_sample_rate = ds.sampleRate()

    fin = wave.open(audio_file, 'rb')
    fs_orig = fin.getframerate()
    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    audio_length = fin.getnframes() * (1 / fs_orig)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    # sphinx-doc: python_ref_inference_start
    result = ds.stt(audio)
    print(result)
    # sphinx-doc: python_ref_inference_stop
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length),
          file=sys.stderr)
    return result
def generate_text(self):
    # isAlive() was removed in Python 3.9; is_alive() is the supported spelling
    print(self.t1.is_alive(), "check thread t1 is alive or not")
    print(self.t2.is_alive(), "check thread t2 is alive or not")
    self.change_status = False
    self.t1 = threading.Thread(target=self.start_recording)
    self.t2 = threading.Thread(target=self.stop_recording)

    model_path = '/home/batman/python_projects/flask_blog_version1/myblog/models/deepspeech/deepspeech-0.5.1-models/'
    # Numeric values are configurable
    ds = Model(model_path + 'output_graph.pbmm', 26, 9, model_path + 'alphabet.txt', 500)
    ds.enableDecoderWithLM(model_path + 'alphabet.txt', model_path + 'lm.binary',
                           model_path + 'trie', 0.75, 1.85)

    def load_audio(audio_path):
        fin = wave.open(audio_path, 'rb')
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        fin.close()
        return audio

    def frame_rate(audio_path):
        fin = wave.open(audio_path, 'rb')
        sample_rate = fin.getframerate()
        fin.close()
        return sample_rate

    audio_file = self.filename
    field_value = ds.stt(load_audio(audio_file), frame_rate(audio_file))
    self.welcome_text.delete('1.0', END)
    self.welcome_text.insert(END, field_value)
class MozillaDeepSpeechASREngine(ASREngine):
    """https://github.com/mozilla/DeepSpeech"""

    def __init__(self, model_path, alphabet_path, language_model_path=None, trie_path=None):
        """
        Constructor.

        :param model_path: Absolute path to (acoustic) model file.
        :param alphabet_path: Absolute path to file containing alphabet.
        :param language_model_path: Absolute path to language model file.
            This parameter is optional. Set to enable decoding with language model.
        :param trie_path: Absolute path to trie. This parameter is optional.
            Set to enable decoding with language model.
        """
        # https://github.com/mozilla/DeepSpeech/blob/master/native_client/python/client.py
        self._model = Model(model_path, 26, 9, alphabet_path, 500)
        # only enable the decoder when both optional paths are actually provided
        if language_model_path is not None and trie_path is not None:
            self._model.enableDecoderWithLM(alphabet_path, language_model_path, trie_path, 1.5, 2.1)

    def transcribe(self, path):
        pcm, sample_rate = soundfile.read(path)
        pcm = (np.iinfo(np.int16).max * pcm).astype(np.int16)
        return self._model.stt(pcm, aSampleRate=sample_rate)

    def __str__(self):
        return 'Mozilla DeepSpeech'
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--lm', nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('--trie', nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--beam_width', type=int, default=500,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha', type=float, default=0.75,
                        help='Language model weight (lm_alpha)')
    parser.add_argument('--lm_beta', type=float, default=1.85,
                        help='Word insertion bonus (lm_beta)')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    parser.add_argument('--json', required=False, action='store_true',
                        help='Output json from metadata with timestamp of each word')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, args.beam_width)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    desired_sample_rate = ds.sampleRate()

    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.lm, args.trie, args.lm_alpha, args.lm_beta)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    fin = wave.open(args.audio, 'rb')
    fs = fin.getframerate()
    if fs != desired_sample_rate:
        print('Warning: original sample rate ({}) is different than {}hz. '
              'Resampling might produce erratic speech recognition.'.format(fs, desired_sample_rate),
              file=sys.stderr)
        fs, audio = convert_samplerate(args.audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1/fs)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio)))
    elif args.json:
        print(metadata_json_output(ds.sttWithMetadata(audio)))
    else:
        print(ds.stt(audio))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length),
          file=sys.stderr)
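# The --extended/--json branches above rely on two helpers that are not shown.
# A minimal sketch, assuming the DeepSpeech 0.6-era Metadata API (a flat list of
# per-character MetadataItem objects with .character and .start_time); the word
# grouping below is a simplified reconstruction, not the original helper:
import json


def metadata_to_string(metadata):
    # Concatenate the per-character items back into a plain transcript.
    return ''.join(item.character for item in metadata.items)


def metadata_json_output(metadata):
    # Group characters into words, keeping the start time of each word.
    words, current_word, word_start = [], [], 0.0
    for item in metadata.items:
        if not current_word:
            word_start = item.start_time
        if item.character == ' ':
            if current_word:
                words.append({'word': ''.join(current_word), 'start_time': word_start})
            current_word = []
        else:
            current_word.append(item.character)
    if current_word:
        words.append({'word': ''.join(current_word), 'start_time': word_start})
    return json.dumps({'words': words, 'confidence': metadata.confidence})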
def handle_message(sid, data):
    output_channel = SocketIOOutput(sio, sid, self.bot_message_evt)

    # convert the audio message to text and pass it to Rasa Core
    ds = Model('models_stt/output_graph.pbmm', 26, 9, 'models_stt/alphabet.txt', 500)
    fs, audio = wav.read('LDC93S1.wav')
    audio_length = len(audio) * (1 / 16000)
    transcript = ds.stt(audio, fs)

    if self.session_persistence:
        if not data.get("session_id"):
            logger.warning("A message without a valid sender_id "
                           "was received. This message will be "
                           "ignored. Make sure to set a proper "
                           "session id using the "
                           "`session_request` socketIO event.")
            return
        sender_id = data['session_id']
    else:
        sender_id = sid

    # pass the transcript (not the raw payload) on to Rasa Core
    message = UserMessage(transcript, output_channel, sender_id,
                          input_channel=self.name())
    on_new_message(message)
class MozillaDeepSpeech(ASRSystem):
    """
    Implements a Mozilla DeepSpeech model based on the model file at model_path.

    This code assumes the model follows Mozilla DeepSpeech version 0.6.1 and may
    not work for later models. See https://deepspeech.readthedocs.io/en/v0.6.1/USING.html
    for installation instructions.
    """

    def __init__(self, model_path, use_language_model=False, identifier=None):
        super(MozillaDeepSpeech, self).__init__(model_path, identifier)
        model_path = os.path.join(self.model_path, 'output_graph.pbmm')
        alphabet_path = os.path.join(self.model_path, 'alphabet.txt')
        language_model_path = os.path.join(self.model_path, 'lm.binary')
        trie_path = os.path.join(self.model_path, 'trie')

        self._model = DPModel(model_path, 500)
        self.samplerate_hz = 16000

        if use_language_model:
            self._model.enableDecoderWithLM(language_model_path, trie_path, 0.75, 1.85)

    def transcribe(self, sound_or_path, fs=None):
        sound = self._load_sound(sound_or_path)
        sound = (np.iinfo(np.int16).max * sound).astype(np.int16)
        res = self._model.stt(sound)
        return res
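# MozillaDeepSpeech.transcribe() relies on an inherited _load_sound() helper
# that is not shown. A plausible sketch only, assuming soundfile is available
# and that the base class returns float samples in [-1, 1] (the caller rescales
# them to int16); the real ASRSystem implementation may differ:
import numpy as np
import soundfile


def _load_sound(sound_or_path, target_fs=16000):
    if isinstance(sound_or_path, np.ndarray):
        return sound_or_path  # already an in-memory float waveform
    sound, fs = soundfile.read(sound_or_path)
    assert fs == target_fs, 'expected {} Hz input'.format(target_fs)
    return sound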
def upload(request):
    # retrieve the filename from the POST request
    data = json.loads(request.body)
    filename = data.get('filename')

    # google upload-to-bucket parameters
    # storage_client = storage.Client()
    # bucket = storage_client.bucket('waev')
    # blob = bucket.blob(filename + '.flac')

    audio = f"{settings.MEDIA_ROOT}/{filename}"

    # convert the audio file to mono 16000 Hz WAV to optimize transcription
    tfm = sox.Transformer()
    tfm.convert(samplerate=16000, n_channels=1)
    new_audio = f"{settings.MEDIA_ROOT}/test.wav"
    tfm.build(audio, new_audio)  # build() returns a status flag, not the audio

    fin = wave.open(new_audio, 'rb')
    audio_buffer = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    fin.close()

    # parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    # args = parser.parse_args()
    print(new_audio)
    ds = Model("deepspeech-0.7.4-models.pbmm")
    print(ds.stt(audio_buffer))

    # upload file to bucket
    # blob.upload_from_filename(new_audio)
    # print("file uploaded!")
    return HttpResponse()
class DeepSpeechInput(AudioInput):
    '''
    Input from DeepSpeech using the US English language model.
    '''
    def __init__(self, notifier, use_lm=False, wav_dir=None):
        '''
        @see AudioInput.__init__()

        @type  use_lm: bool
        @param use_lm:
            Whether to use the DeepSpeech language model for better predictions.
        '''
        super(DeepSpeechInput, self).__init__(notifier,
                                              format=pyaudio.paInt16,
                                              channels=1,
                                              rate=16000,
                                              wav_dir=wav_dir)

        # The files which we'll need from the model directory
        alphabet = os.path.join(_MODEL_DIR, 'alphabet.txt')
        model = os.path.join(_MODEL_DIR, 'output_graph.pb')
        lm = os.path.join(_MODEL_DIR, 'lm.binary')
        trie = os.path.join(_MODEL_DIR, 'trie')

        # If these don't exist then DeepSpeech will segfault when inferring!
        if not os.path.exists(alphabet):
            raise IOError("Not found: %s" % alphabet)
        if not os.path.exists(model):
            raise IOError("Not found: %s" % model)

        # Load in the model.
        LOG.info("Loading %s" % model)
        self._model = Model(model, _NUM_FEATURES, _NUM_CONTEXT, alphabet, _BEAM_WIDTH)

        # If we're using a language model then pull that in too. This requires
        # a decent chunk of memory.
        if use_lm:
            if not os.path.exists(lm):
                raise IOError("Not found: %s" % lm)
            if not os.path.exists(trie):
                raise IOError("Not found: %s" % trie)
            LOG.info("Loading %s" % lm)
            self._model.enableDecoderWithLM(alphabet, lm, trie,
                                            _LM_WEIGHT, _VALID_WORD_COUNT_WEIGHT)

    def _decode_raw(self, data):
        '''
        @see AudioInput._decode_raw()
        '''
        audio = numpy.frombuffer(data, numpy.int16)
        words = self._model.stt(audio, self._rate)
        LOG.info("Got: %s" % (words,))
        tokens = [Token(word.strip(), 1.0, True)
                  for word in words.split(' ')
                  if len(word.strip()) > 0]
        return tokens
def get_text(wav_file):
    ds = Model(MODEL_FILE, N_FEATURES, N_CONTEXT, ALPHABET_FILE, BEAM_WIDTH)
    ds.enableDecoderWithLM(ALPHABET_FILE, LANGUAGE_MODEL, TRIE_FILE,
                           LM_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    fs, audio = wavfile.read(wav_file)
    processed_data = ds.stt(audio, fs)
    print(processed_data)
class SpeechRecognizer:
    def __init__(self):
        self._model = Model('DeepSpeech/deepspeech-0.7.1-models.pbmm')
        # self._model.setBeamWidth(1)
        # self._model.enableExternalScorer('DeepSpeech/deepspeech-0.7.1-models.scorer')

    def listen(self, audio):
        return self._model.stt(audio)
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--alphabet', required=True,
                        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument('--lm', nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('--trie', nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    writeFile = open('speechtotext.csv', 'w')
    writer = csv.writer(writeFile)
    writer.writerow(['inputfile', 'inference'])

    for file in glob.glob("{}*.wav".format(args.audio)):
        fin = wave.open(file, 'rb')
        fs = fin.getframerate()
        if fs != SAMPLE_RATE:
            print('Warning: original sample rate ({}) is different than {}hz. '
                  'Resampling might produce erratic speech recognition.'.format(fs, SAMPLE_RATE),
                  file=sys.stderr)
            fs, audio = convert_samplerate(file)  # resample the current file, not args.audio
        else:
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

        audio_length = fin.getnframes() * (1/SAMPLE_RATE)
        fin.close()

        print('Running inference for {}'.format(file), file=sys.stderr)
        inference_start = timer()
        if args.extended:
            print(metadata_to_string(ds.sttWithMetadata(audio, fs)))
        else:
            # print(ds.stt(audio, fs))
            writer.writerow(["{}".format(file), "{}".format(ds.stt(audio, fs))])
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length),
              file=sys.stderr)

    writeFile.close()
def DeepSpeech(Window, SpeechToNLPQueue, wavefile):
    # Create Signal Objects
    SpeechSignal = GUISignal()
    SpeechSignal.signal.connect(Window.UpdateSpeechBox)

    MsgSignal = GUISignal()
    MsgSignal.signal.connect(Window.UpdateMsgBox)

    # References to models:
    model = 'DeepSpeech_Models/output_graph.pbmm'
    alphabet = 'DeepSpeech_Models/alphabet.txt'
    lm = 'DeepSpeech_Models/lm.binary'
    trie = 'DeepSpeech_Models/trie'

    print('Loading model from file {}'.format(model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if lm and trie:
        print('Loading language model from files {} {}'.format(lm, trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    audio = wavefile
    fin = wave.open(audio, 'rb')
    fs = fin.getframerate()
    if fs != 16000:
        print('Warning: original sample rate ({}) is different than 16kHz. '
              'Resampling might produce erratic speech recognition.'.format(fs),
              file=sys.stderr)
        fs, audio = convert_samplerate(audio)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1 / 16000)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    output = ds.stt(audio, fs)
    print(output)
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length),
          file=sys.stderr)

    QueueItem = SpeechNLPItem(output, True, 0, 0, 'Speech')
    SpeechToNLPQueue.put(QueueItem)
    SpeechSignal.signal.emit([QueueItem])
class DeepSpeechRecognizer:
    def __init__(self):
        self.file_path = Path(__file__).parent
        self.model = Model(
            '/Users/shihangyu/Scripts/python/stt_server/model/deepspeech-0.6.1-models/output_graph.pbmm',
            aBeamWidth=500)
        self.desired_sample_rate = self.model.sampleRate()
        self.logger = getLogger(self.__module__)
        self.tmp_path = self.file_path / 'tmp.wav'

    def __convert_samplerate(self, audio_path):
        sox_cmd = ('sox {} --type raw --bits 16 --channels 1 --rate {} '
                   '--encoding signed-integer --endian little --compression 0.0 '
                   '--no-dither - ').format(quote(audio_path), self.desired_sample_rate)
        try:
            output = subprocess.check_output(shlex.split(sox_cmd), stderr=subprocess.PIPE)
        except subprocess.CalledProcessError as e:
            raise RuntimeError('SoX returned non-zero status: {}'.format(e.stderr))
        except OSError as e:
            raise OSError(e.errno,
                          'SoX not found, use {}hz files or install it: {}'.format(
                              self.desired_sample_rate, e.strerror))
        return self.desired_sample_rate, np.frombuffer(output, np.int16)

    def inference(self, audio_path):
        try:
            fin = wave.open(audio_path, 'rb')
        except Exception:
            # not a readable PCM WAV: decode with librosa and rewrite as 16 kHz WAV
            x, _ = librosa.load(str(audio_path), sr=16000)
            sf.write(str(self.tmp_path), x, 16000)
            fin = wave.open(str(self.tmp_path), 'rb')

        fs = fin.getframerate()
        if fs != self.desired_sample_rate:
            # self.logger.warning(f'Warning: original sample rate ({fs}) is different than '
            #                     f'{self.desired_sample_rate}hz. Resampling might produce '
            #                     f'erratic speech recognition.')
            fs, audio = self.__convert_samplerate(audio_path)
        else:
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        fin.close()

        output = self.model.stt(audio)
        self.logger.debug(f"DeepSpeechRecognizer inference output: {output}")
        return output
class EnglishASR(object):
    def __init__(self, model, scorer):
        self.model = Model(model)
        self.model.enableExternalScorer(scorer)

    def recognize(self, wav_path):
        fs, audio = wavfile.read(wav_path)
        assert fs == 16000
        result = self.model.stt(audio)
        return result
def SpeechToText(self, audio_file):
    input_graph = "deepspeech-0.5.1-models/output_graph.pbmm"
    alphabet = "deepspeech-0.5.1-models/alphabet.txt"
    deepSpeech = Model(input_graph, 26, 9, alphabet, 500)
    fs, audio = wav.read(audio_file)
    text_data = deepSpeech.stt(audio, fs)
    print(text_data)
    with open('out_text_data.txt', 'w') as f:
        f.write(text_data)
    return text_data
def predict_speech_to_text(stream_file):
    # Initialize the model
    speech_model = Model(MODEL_PATH)
    # Enable the external scorer (language model) to improve accuracy
    speech_model.enableExternalScorer(SCORER_PATH)
    # You can also tune the beam width, the scorer's language-model weight and
    # its word-insertion weight; see the sketch below.

    # Use scipy to convert the wav file into a numpy array
    _, audio = wav.read(stream_file)
    return speech_model.stt(audio)
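# A short sketch of the tuning knobs mentioned above, using the DeepSpeech 0.7+
# Python API; the alpha/beta values here are illustrative, not tuned:
speech_model = Model(MODEL_PATH)
speech_model.enableExternalScorer(SCORER_PATH)
speech_model.setBeamWidth(500)               # wider beam: slower, potentially more accurate
speech_model.setScorerAlphaBeta(0.93, 1.18)  # language-model weight, word-insertion weight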
def main(argv):
    if len(argv) < 1:
        print("No .wav file given.")
        return
    ds = Model(MODEL_FILE, 500)
    ds.enableDecoderWithLM(LANG_MODEL, TRIE_FILE, 1.50, 2.25)
    fs, audio = wav.read(argv[0])
    data = ds.stt(audio)
    print(data)
class SpeechToTextEngine:
    def __init__(self, model_path, scorer_path):
        self.model = Model(model_path=model_path)
        self.model.enableExternalScorer(scorer_path=scorer_path)

    def run(self, audio):
        audio = normalize_audio(audio)
        audio = BytesIO(audio)
        with wave.Wave_read(audio) as wav:
            audio = np.frombuffer(wav.readframes(wav.getnframes()), np.int16)
        result = self.model.stt(audio_buffer=audio)
        return result
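# SpeechToTextEngine.run() calls a normalize_audio() helper that is not shown.
# A minimal sketch, assuming it pipes the raw bytes through ffmpeg to produce
# 16 kHz mono 16-bit WAV (the format the model expects); the exact ffmpeg flags
# are an assumption, not the original implementation:
import subprocess


def normalize_audio(audio: bytes) -> bytes:
    cmd = ['ffmpeg', '-i', 'pipe:0',             # read input from stdin
           '-f', 'wav', '-ar', '16000',          # output WAV at 16 kHz
           '-ac', '1', '-acodec', 'pcm_s16le',   # mono, 16-bit signed PCM
           'pipe:1']                             # write output to stdout
    proc = subprocess.run(cmd, input=audio, stdout=subprocess.PIPE,
                          stderr=subprocess.DEVNULL, check=True)
    return proc.stdout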
class Tester(BaseTester):

    name = 'DeepSpeech'
    audio_format = RATE16K_MONO_WAV

    def __init__(self, *args, **kwargs):
        super(Tester, self).__init__(*args, **kwargs)

        files = [
            args_lm,
            args_trie,
            args_model,
            # args_alphabet,
        ]
        for f in files:
            assert os.path.isfile(f), 'File %s does not exist.' % f

        print('Loading model from file %s' % (args_model), file=sys.stderr)
        model_load_start = timer()
        # self.ds = Model(args_model, N_FEATURES, N_CONTEXT, args_alphabet, BEAM_WIDTH)
        self.ds = Model(args_model, BEAM_WIDTH)
        model_load_end = timer() - model_load_start
        print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

        # if args_lm and args_trie:
        print('Loading language model from files %s %s' % (args_lm, args_trie), file=sys.stderr)
        lm_load_start = timer()
        # self.ds.enableDecoderWithLM(args_alphabet, args_lm, args_trie, LM_ALPHA, LM_BETA)
        self.ds.enableDecoderWithLM(args_lm, args_trie, LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end), file=sys.stderr)

    def audio_to_text(self, fn):
        fin = wave.open(fn, 'rb')
        fs = fin.getframerate()
        assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        audio_length = fin.getnframes() * (1. / fs)
        fin.close()

        print('Running inference.', file=sys.stderr)
        inference_start = timer()
        # text = self.ds.stt(audio, fs)
        text = self.ds.stt(audio)
        print('text:', text)
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length),
              file=sys.stderr)
        return text
async def handle_message(sid, data):
    output_channel = SocketIOOutput(sio, sid, self.bot_message_evt, data['message'])

    if data['message'] == "/get_started":
        message = data['message']
    else:
        # receive the audio as .ogg
        received_file = sid + '.wav'
        urllib.request.urlretrieve(data['message'], received_file)
        path = os.path.dirname(__file__)
        # print(path)
        # print(sid)

        # convert the .ogg file into an int16 wave file with ffmpeg
        # -ar 44100
        os.system("ffmpeg -y -i {0} -ar 16000 output_{1}.wav".format(received_file, sid))
        # os.system("ffmpeg -y -i {0} -c:a pcm_s16le output_{1}.wav".format(received_file, sid))

        N_FEATURES = 25
        N_CONTEXT = 9
        BEAM_WIDTH = 500
        LM_ALPHA = 0.75
        LM_BETA = 1.85

        ds = Model('deepspeech-0.5.1-models/output_graph.pbmm', N_FEATURES, N_CONTEXT,
                   'deepspeech-0.5.1-models/alphabet.txt', BEAM_WIDTH)
        fs, audio = wav.read("output_{0}.wav".format(sid))
        message = ds.stt(audio, fs)
        # await self.sio.emit(self.bot_message_evt, response, room=socket_id)
        await sio.emit("user_uttered", {"text": message}, room=sid)
        # ffmpeg -i input.flv -f s16le -acodec pcm_s16le output.raw

    if self.session_persistence:
        # if not data.get("session_id"):
        #     logger.warning("A message without a valid sender_id "
        #                    "was received. This message will be "
        #                    "ignored. Make sure to set a proper "
        #                    "session id using the "
        #                    "`session_request` socketIO event.")
        #     return
        # sender_id = data['session_id']
        sender_id = sid
    else:
        sender_id = sid  # fall back to the socket id so sender_id is always bound

    message_rasa = UserMessage(message, output_channel, sender_id,
                               input_channel=self.name())
    await on_new_message(message_rasa)
def deepspeech_predict(wav_output):
    N_FEATURES = 25
    N_CONTEXT = 9
    BEAM_WIDTH = 500

    print("* Loading model")
    ds = Model('deepspeech-0.5.1-models/output_graph.pbmm', N_FEATURES, N_CONTEXT,
               'deepspeech-0.5.1-models/alphabet.txt', BEAM_WIDTH)

    print("* Reading audio file")
    fs, audio = wav.read(wav_output)

    print("* Predicting")
    return ds.stt(audio, fs)
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--alphabet', required=True,
                        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument('--lm', nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('--trie', nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    fin = wave.open(args.audio, 'rb')
    fs = fin.getframerate()
    if fs != 16000:
        print('Warning: original sample rate ({}) is different than 16kHz. '
              'Resampling might produce erratic speech recognition.'.format(fs),
              file=sys.stderr)
        fs, audio = convert_samplerate(args.audio)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1/16000)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, fs)))
    else:
        print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length),
          file=sys.stderr)
def tflite_worker(model, alphabet, lm, trie, queue_in, queue_out, gpu_mask):
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_mask)
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    ds.enableDecoderWithLM(alphabet, lm, trie, LM_ALPHA, LM_BETA)

    while True:
        msg = queue_in.get()

        fin = wave.open(msg['filename'], 'rb')
        fs = fin.getframerate()
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        audio_length = fin.getnframes() * (1/16000)
        fin.close()

        decoded = ds.stt(audio, fs)

        queue_out.put({'prediction': decoded, 'ground_truth': msg['transcript']})
        queue_in.task_done()
class Tester(BaseTester):

    name = 'DeepSpeech'
    audio_format = RATE16K_MONO_WAV

    def __init__(self, *args, **kwargs):
        super(Tester, self).__init__(*args, **kwargs)

        files = [args_lm, args_trie, args_model, args_alphabet]
        for f in files:
            assert os.path.isfile(f)

        print('Loading model from file %s' % (args_model), file=sys.stderr)
        model_load_start = timer()
        self.ds = Model(args_model, N_FEATURES, N_CONTEXT, args_alphabet, BEAM_WIDTH)
        model_load_end = timer() - model_load_start
        print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

        if args_lm and args_trie:
            print('Loading language model from files %s %s' % (args_lm, args_trie), file=sys.stderr)
            lm_load_start = timer()
            self.ds.enableDecoderWithLM(args_alphabet, args_lm, args_trie, LM_ALPHA, LM_BETA)
            lm_load_end = timer() - lm_load_start
            print('Loaded language model in %0.3fs.' % (lm_load_end), file=sys.stderr)

    def audio_to_text(self, fn):
        fin = wave.open(fn, 'rb')
        fs = fin.getframerate()
        assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        audio_length = fin.getnframes() * (1/16000)
        fin.close()

        print('Running inference.', file=sys.stderr)
        inference_start = timer()
        text = self.ds.stt(audio, fs)
        print('text:', text)
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length),
              file=sys.stderr)
        return text