# Assumes module-level: numpy as np, scipy.signal as signal, time.time as
# time, a Flask-style `app` logger, and the MODEL_PATH/ALPHABET_PATH/...
# constants defined elsewhere in the file.
class SpeechRecognizer:
    def __init__(self):
        self._model = Model(MODEL_PATH, N_FEATURES, N_CONTEXT, ALPHABET_PATH,
                            BEAM_WIDTH)
        self._model.enableDecoderWithLM(ALPHABET_PATH, LANGUAGE_MODEL_PATH,
                                        TRIE_PATH, LM_WEIGHT,
                                        WORD_COUNT_WEIGHT,
                                        VALID_WORD_COUNT_WEIGHT)

    def speech_to_text(self, audio_buffer, sample_rate):
        app.logger.info('processing audio file')
        audio = self._process_audio_data(audio_buffer, sample_rate)
        app.logger.info('starting recognition')
        start = time()
        text = self._model.stt(audio, SAMPLE_RATE)
        end = time()
        app.logger.info('finished in {:.3f}s'.format(end - start))
        return text

    def _process_audio_data(self, audio_buffer, original_sample_rate):
        audio = np.frombuffer(audio_buffer, dtype=np.int16)
        if original_sample_rate != SAMPLE_RATE:
            audio = self._resample(audio, original_sample_rate)
        return audio

    def _resample(self, audio, original_sample_rate):
        audio_length = len(audio) / original_sample_rate
        samples = int(audio_length * SAMPLE_RATE)
        return signal.resample(audio, samples).astype(np.int16)
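# A minimal usage sketch for the class above (not part of the original
# source); the file name is hypothetical and SAMPLE_RATE is assumed to be
# 16000:
import scipy.io.wavfile as wav

recognizer = SpeechRecognizer()
fs, samples = wav.read('example.wav')  # hypothetical 16-bit PCM WAV file
text = recognizer.speech_to_text(samples.tobytes(), fs)
print(text)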
def _worker_thread(self):
    print('restoring from {}'.format(model_file))
    model = Model(model_file, N_INPUT, N_CONTEXT, ALPHABET_CONFIG_PATH,
                  BEAM_WIDTH)
    model.enableDecoderWithLM(ALPHABET_CONFIG_PATH, LM_BINARY_PATH,
                              LM_TRIE_PATH, LM_WEIGHT, WORD_COUNT_WEIGHT,
                              VALID_WORD_COUNT_WEIGHT)
    while True:
        cmd, *args = self._queue.get()
        if cmd == 'sample':
            sample = args[0]
            file = wave.open(sample.wav_path)
            audio = np.frombuffer(file.readframes(file.getnframes()),
                                  dtype=np.int16)
            fs = file.getframerate()
            start = time.time()
            result = model.stt(audio, fs)
            inference_time = time.time() - start
            wav_time = wav_length(sample.wav_path)
            print('wav length: {}\ninference time: {}\nRTF: {:.2f}'.format(
                wav_time, inference_time, inference_time / wav_time))
            self.inference_done.emit(sample, result)
        elif cmd == 'stop':
            break
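# The worker above calls an externally defined `wav_length` helper; a
# plausible stdlib-only sketch (an assumption, not the original code):
import wave

def wav_length(path):
    """Return the duration of a WAV file in seconds."""
    with wave.open(path, 'rb') as f:
        return f.getnframes() / float(f.getframerate())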
def main():
    model = "models/output_graph.pb"
    alphabet = "models/alphabet.txt"
    lm = "models/lm.binary"
    trie = "models/trie"
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT, WORD_COUNT_WEIGHT,
                           VALID_WORD_COUNT_WEIGHT)
    with open("flickr_audio_transcription.txt", "w") as out:
        for audio_f in glob.glob(
                "/roaming/gchrupal/vgs/data/flickr8k/flickr_audio/wavs/*.wav"):
            print("Transcribing {}".format(audio_f))
            try:
                fs, audio = wav.read(audio_f)
                if fs != 16000:
                    if fs < 16000:
                        print('Warning: original sample rate (%d) is lower '
                              'than 16kHz. Up-sampling might produce erratic '
                              'speech recognition.' % (fs), file=sys.stderr)
                    # Resample the current file, not `args.audio`, which is
                    # undefined in this scope.
                    fs, audio = convert_samplerate(audio_f)
                audio_length = len(audio) * (1 / 16000)
                basename, ext = os.path.splitext(os.path.basename(audio_f))
                out.write("{}\t{}\n".format(basename, ds.stt(audio, fs)))
                out.flush()
            except ValueError as e:
                print("Error: {}".format(e))
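# `convert_samplerate` is assumed to be defined elsewhere in these scripts.
# The upstream DeepSpeech client implemented it by shelling out to SoX; a
# sketch along those lines (assumes the `sox` binary is on PATH):
import subprocess
import numpy as np

def convert_samplerate(audio_path, desired_rate=16000):
    """Resample a WAV file to `desired_rate` mono 16-bit PCM via SoX."""
    cmd = ['sox', audio_path, '--type', 'raw', '--bits', '16',
           '--channels', '1', '--rate', str(desired_rate),
           '--encoding', 'signed-integer', '--endian', 'little', '-']
    output = subprocess.check_output(cmd)
    return desired_rate, np.frombuffer(output, dtype=np.int16)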
def recognize(model="../models/output_graph.pb",
              audio="../audio/2830-3980-0043.wav",
              alphabet="../models/alphabet.txt",
              lm="../models/lm.binary",
              trie="../models/trie"):
    print('Loading model from file %s' % (model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)
    if lm and trie:
        print('Loading language model from files %s %s' % (lm, trie),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end),
              file=sys.stderr)
    fs, audio = wav.read(audio)  # We can assume 16kHz
    audio_length = len(audio) * (1 / 16000)
    assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"
    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    result = ds.stt(audio, fs)
    print(result, file=sys.stderr)
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length), file=sys.stderr)
    return result
class DeepSpeechSTT(plugin.STTPlugin):
    """DeepSpeech Speech-to-Text implementation."""

    def __init__(self, *args, **kwargs):
        plugin.STTPlugin.__init__(self, *args, **kwargs)
        self._logger = logging.getLogger(__name__)
        self._plugin_config = self.profile['deepspeech']
        graph = self._plugin_config['graph']
        alphabet = self._plugin_config['alphabet']
        self._logger.debug("Initializing DeepSpeech with graph '%s' "
                           "and alphabet '%s'", graph, alphabet)
        self._model = Model(graph, 26, 9, alphabet, 500)

    def transcribe(self, fp):
        """
        Performs STT, transcribing an audio file and returning the result.

        Arguments:
        fp -- a file object containing audio data
        """
        fs, audio = wav.read(fp)
        return self._model.stt(audio, fs)
class DeepSpeech:
    def __init__(self, model, alphabet, lm=None, trie=None):
        print('Loading model from file %s' % (model), file=sys.stderr)
        model_load_start = timer()
        self.ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
        model_load_end = timer() - model_load_start
        print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)
        if lm is not None and trie is not None:
            print('Loading language model from files %s %s' % (lm, trie),
                  file=sys.stderr)
            lm_load_start = timer()
            self.ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                                        WORD_COUNT_WEIGHT,
                                        VALID_WORD_COUNT_WEIGHT)
            lm_load_end = timer() - lm_load_start
            print('Loaded language model in %0.3fs.' % (lm_load_end),
                  file=sys.stderr)

    def stt(self, audio_file):
        fs, audio = wav.read(audio_file)
        audio_length = len(audio) * (1 / 16000)
        print('Running inference.', file=sys.stderr)
        inference_start = timer()
        stt_result = self.ds.stt(audio, fs)
        print('Return result: ', stt_result)
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' %
              (inference_end, audio_length), file=sys.stderr)
        return stt_result
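# A hypothetical call site for the wrapper above (the paths are
# placeholders, not from the original source):
engine = DeepSpeech('models/output_graph.pb', 'models/alphabet.txt',
                    lm='models/lm.binary', trie='models/trie')
transcript = engine.stt('models/2830-3980-0043.wav')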
def main_deepspeech(args):
    args = parse_args_deep() if args is None else args
    print('Loading model from file %s' % (args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)
    if args.lm and args.trie:
        print('Loading language model from files %s %s' %
              (args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end),
              file=sys.stderr)
    # scipy.io.wavfile (imported as `wav`) has a read() function; the stdlib
    # `wave` module used in the original does not.
    fs, audio = wav.read(args.audio)  # We can assume 16kHz
    audio_length = len(audio) * (1 / 16000)
    assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"
    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length), file=sys.stderr)
class transciber(object):
    def __init__(self, modelPath, alphabet, lmPath, trie,
                 numFeatures=26, numContext=9, beamWidth=500):
        print('Loading model from file %s' % modelPath, file=sys.stderr)
        model_load_start = timer()
        self.model = Model(modelPath, numFeatures, numContext, alphabet,
                           beamWidth)
        self.model.enableDecoderWithLM(alphabet, lmPath, trie, LM_WEIGHT,
                                       WORD_COUNT_WEIGHT,
                                       VALID_WORD_COUNT_WEIGHT)
        model_load_end = timer() - model_load_start
        print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    def transcribe(self, audioPath):
        fs, audio = wav.read(audioPath)
        audio_length = len(audio) * (1 / 16000)
        label = self.model.stt(audio, fs)
        print(label)
        return label
class DeepSpeechImp:
    ds = None

    def __init__(self):
        logging.info('Loading model from file %s' % (shared_params.DS_MODEL))
        model_load_start = timer()
        self.ds = Model(shared_params.DS_MODEL, N_FEATURES, N_CONTEXT,
                        shared_params.DS_ALPHABET, shared_params.BEAM_WIDTH)
        model_load_end = timer() - model_load_start
        logging.info('Loaded model in %0.3fs.' % (model_load_end))
        logging.info('Loading language model from files %s %s' %
                     (shared_params.DS_LANGUAGE_MODEL, shared_params.DS_TRIE))
        lm_load_start = timer()
        self.ds.enableDecoderWithLM(shared_params.DS_ALPHABET,
                                    shared_params.DS_LANGUAGE_MODEL,
                                    shared_params.DS_TRIE,
                                    shared_params.LM_WEIGHT,
                                    shared_params.WORD_COUNT_WEIGHT,
                                    shared_params.VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        logging.info('Loaded language model in %0.3fs.' % (lm_load_end))

    def process_audio(self, audio_path):
        try:
            fs, audio = wav.read(audio_path)
            return self.ds.stt(audio, fs)
        except Exception as ex:
            logging.error(str(ex))
            return ""

    def __del__(self):
        del self.ds
def main():
    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    parser.add_argument('model', type=str,
                        help='Path to the model (protocol buffer binary file)',
                        default="models/output_graph.pb")
    parser.add_argument('audio', type=str,
                        help='Path to the audio file to run (WAV format)',
                        default="sample_input.wav")
    parser.add_argument('alphabet', type=str,
                        help='Path to the configuration file specifying the '
                             'alphabet used by the network',
                        default="models/alphabet.txt")
    parser.add_argument('lm', type=str, nargs='?',
                        help='Path to the language model binary file',
                        default="models/lm.binary")
    parser.add_argument('trie', type=str, nargs='?',
                        help='Path to the language model trie file created '
                             'with native_client/generate_trie',
                        default="models/trie")
    args = parser.parse_args()
    print('Loading model from file %s' % (args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)
    if args.lm and args.trie:
        print('Loading language model from files %s %s' %
              (args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end),
              file=sys.stderr)
    fs, audio = wav.read(args.audio)  # We can assume 16kHz
    audio_length = len(audio) * (1 / 16000)
    assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"
    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length), file=sys.stderr)
def main(model, alphabet, lm, trie, dest):
    print('Loading model from file %s' % (model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)
    if lm and trie:
        print('Loading language model from files %s %s' % (lm, trie),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end),
              file=sys.stderr)
    print('Running inference.', file=sys.stderr)
    # `dest` is the clips directory; clips are expected to be named
    # 0.wav, 1.wav, ... in order.
    clips = os.listdir(dest)
    subs = []
    for i, clip in enumerate(clips):
        fs, audio = wav.read(dest + str(i) + '.wav')
        if fs != 16000:
            if fs < 16000:
                print('Warning: original sample rate (%d) is lower than '
                      '16kHz. Up-sampling might produce erratic speech '
                      'recognition.' % (fs), file=sys.stderr)
            fs, audio = convert_samplerate(dest + str(i) + '.wav')
        audio_length = len(audio) * (1 / 16000)
        inference_start = timer()
        subs.append(ds.stt(audio, fs))
        print(subs[-1])
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' %
              (inference_end, audio_length), file=sys.stderr)
    return subs
def main():
    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    parser.add_argument('model',
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('alphabet',
                        help='Path to the configuration file specifying the '
                             'alphabet used by the network')
    parser.add_argument('lm', nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('trie', nargs='?',
                        help='Path to the language model trie file created '
                             'with native_client/generate_trie')
    parser.add_argument('audio',
                        help='Path to the audio file to run (WAV format)')
    args = parser.parse_args()
    print('Loading model from file %s' % (args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)
    if args.lm and args.trie:
        print('Loading language model from files %s %s' %
              (args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end),
              file=sys.stderr)
    fs, audio = wav.read(args.audio)
    if fs != 16000:
        if fs < 16000:
            print('Warning: original sample rate (%d) is lower than 16kHz. '
                  'Up-sampling might produce erratic speech recognition.'
                  % (fs), file=sys.stderr)
        fs, audio = convert_samplerate(args.audio)
    audio_length = len(audio) * (1 / 16000)
    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length), file=sys.stderr)
def main():
    parser = argparse.ArgumentParser(
        description='Running DeepSpeech inference.')
    parser.add_argument('--model',
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--alphabet',
                        help='Path to the configuration file specifying the '
                             'alphabet used by the network')
    parser.add_argument('--lm', nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('--trie', nargs='?',
                        help='Path to the language model trie file created '
                             'with native_client/generate_trie')
    parser.add_argument('--audio',
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--version', help='Print version and exits')
    args = parser.parse_args()
    if args.version:
        print_versions()
        return 0
    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3f}s.'.format(model_load_end), file=sys.stderr)
    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(args.lm,
                                                               args.trie),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3f}s.'.format(lm_load_end),
              file=sys.stderr)
    fin = wave.open(args.audio, 'rb')
    fs = fin.getframerate()
    if fs != 16000:
        print('Warning: original sample rate ({}) is different than 16kHz. '
              'Resampling might produce erratic speech recognition.'.format(fs),
              file=sys.stderr)
        fs, audio = convert_samplerate(args.audio)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    audio_length = fin.getnframes() * (1 / 16000)
    fin.close()
    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length), file=sys.stderr)
def stt(audioPath):
    model = conf.get_config('model')
    alphabet = conf.get_config('alphabet')
    lm = conf.get_config('lm')
    trie = conf.get_config('trie')
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    if lm and trie:
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    fs, audio = wav.read(audioPath)
    text = ds.stt(audio, fs)
    return text
def main():
    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    parser.add_argument('model', type=str,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('audio', type=str,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('alphabet', type=str,
                        help='Path to the configuration file specifying the '
                             'alphabet used by the network')
    parser.add_argument('lm', type=str, nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('trie', type=str, nargs='?',
                        help='Path to the language model trie file created '
                             'with native_client/generate_trie')
    args = parser.parse_args()
    print('Loading model from file %s' % (args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)
    if args.lm and args.trie:
        print('Loading language model from files %s %s' %
              (args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end),
              file=sys.stderr)
    for path in sorted(glob.glob(args.audio)):
        target = os.path.splitext(path)[0] + '.txt'
        if os.path.exists(target):
            continue
        fs, audio = wav.read(path)  # We can assume 16kHz
        audio_length = len(audio) * (1 / 16000)
        assert fs == 16000, \
            "Only 16000Hz input WAV files are supported for now!"
        print('Running inference of %s.' % path, file=sys.stderr)
        inference_start = timer()
        text = ds.stt(audio, fs)
        print(text)
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' %
              (inference_end, audio_length), file=sys.stderr)
        with open(target, 'w') as out:
            out.write(text)
def main():
    print('Loading model from file %s' % MODEL, file=sys.stderr)
    model_load_start = timer()
    ds = Model(MODEL, N_FEATURES, N_CONTEXT, ALPHABET, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % model_load_end, file=sys.stderr)

    # Uncomment if you want to use a language model
    # =============================================
    # print('Loading language model from files %s %s' % (LANGUAGE_MODEL, TRIE),
    #       file=sys.stderr)
    # lm_load_start = timer()
    # ds.enableDecoderWithLM(ALPHABET, LANGUAGE_MODEL, TRIE, LM_WEIGHT,
    #                        WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    # lm_load_end = timer() - lm_load_start
    # print('Loaded language model in %0.3fs.' % lm_load_end, file=sys.stderr)

    # audio file
    path_to_audio = 'data/sesq316qna.mp3'
    # change rate of audio file to 16kHz
    call = AudioSegment.from_file(path_to_audio)
    call = call.set_frame_rate(16000)
    # only analyze the first 2 minutes (2 * 60 * 1000)
    segment = call[:120000]
    # declare the new name of the audio file
    path = 'data/testing.wav'
    # export the audio file to wav format
    segment.export(path, format="wav")
    # read the new file again with the wav reader
    fs, audio = wav.read(path)  # We can assume 16kHz
    audio_length = len(audio) * (1 / 16000)
    assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"
    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    prediction_text = ds.stt(audio, fs)
    print(prediction_text)
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length), file=sys.stderr)
class DeepSpeechSTT(AbstractSTTEngine):
    """DeepSpeech Speech-to-Text implementation."""

    SLUG = 'deepspeech'

    def __init__(self, vocabulary, graph="models/output_graph.pb",
                 alphabet="models/alphabet.txt"):
        self._logger = logging.getLogger(__name__)
        self._logger.debug("Initializing DeepSpeech with graph '%s' "
                           "and alphabet '%s'", graph, alphabet)
        self._model = Model(graph, 26, 9, alphabet, 500)

    @classmethod
    def get_config(cls):
        # FIXME: Replace this as soon as we have a config module
        config = {}
        profile_path = jasperpath.config('profile.yml')
        if os.path.exists(profile_path):
            with open(profile_path, 'r') as f:
                profile = yaml.safe_load(f)
                try:
                    config['graph'] = profile['deepspeech']['graph']
                    config['alphabet'] = profile['deepspeech']['alphabet']
                except KeyError:
                    pass
        return config

    def transcribe(self, fp):
        """
        Performs STT, transcribing an audio file and returning the result.

        Arguments:
        fp -- a file object containing audio data
        """
        fs, audio = wav.read(fp)
        return self._model.stt(audio, fs)

    @classmethod
    def is_available(cls):
        return diagnose.check_python_import('deepspeech')
class MozillaDeepSpeechASREngine(ASREngine):
    """https://github.com/mozilla/DeepSpeech"""

    def __init__(self, model_path, alphabet_path, language_model_path=None,
                 trie_path=None):
        """
        Constructor.

        :param model_path: Absolute path to (acoustic) model file.
        :param alphabet_path: Absolute path to file containing alphabet.
        :param language_model_path: Absolute path to language model file.
            This parameter is optional. Set to enable decoding with language
            model.
        :param trie_path: Absolute path to trie. This parameter is optional.
            Set to enable decoding with language model.
        """
        # https://github.com/mozilla/DeepSpeech/blob/master/native_client/python/client.py
        self._model = Model(aModelPath=model_path, aNCep=26, aNContext=9,
                            aAlphabetConfigPath=alphabet_path, aBeamWidth=500)
        if language_model_path is not None and trie_path is not None:
            self._model.enableDecoderWithLM(aAlphabetConfigPath=alphabet_path,
                                            aLMPath=language_model_path,
                                            aTriePath=trie_path,
                                            aLMWeight=1.75,
                                            aWordCountWeight=1.0,
                                            aValidWordCountWeight=1.0)
            self._with_language_model = True
        else:
            self._with_language_model = False

    def transcribe(self, path):
        pcm, sample_rate = soundfile.read(path)
        # soundfile returns float samples in [-1, 1]; scale to 16-bit PCM.
        pcm = (np.iinfo(np.int16).max * pcm).astype(np.int16)
        return self._model.stt(pcm, aSampleRate=sample_rate)

    def __str__(self):
        if self._with_language_model:
            return 'Mozilla DeepSpeech (with language model)'
        return 'Mozilla DeepSpeech'
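# A usage sketch for the engine above (the paths are assumptions; WAV
# decoding happens inside transcribe via soundfile):
engine = MozillaDeepSpeechASREngine(
    model_path='/abs/path/models/output_graph.pb',
    alphabet_path='/abs/path/models/alphabet.txt',
    language_model_path='/abs/path/models/lm.binary',
    trie_path='/abs/path/models/trie')
print(engine)  # 'Mozilla DeepSpeech (with language model)'
print(engine.transcribe('/abs/path/test.wav'))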
def Deep():
    try:
        if tkMessageBox.askyesno("Confirmation",
                                 "Would you like to proceed?"):
            BEAM_WIDTH = 500
            LM_WEIGHT = 1.75
            WORD_COUNT_WEIGHT = 1.00
            VALID_WORD_COUNT_WEIGHT = 1.00
            N_FEATURES = 26
            N_CONTEXT = 9
            ds = Model('models/models.pb', N_FEATURES, N_CONTEXT,
                       'models/alphabet.txt', BEAM_WIDTH)
            fs, audio = wav.read(audiofile.get())
            if fs != 16000:
                # Convert to 16 kHz mono with sox. sox.Combiner.build needs
                # a concrete output file, not a directory; the name
                # 'converted.wav' is an assumption.
                cbn = sox.Combiner()
                cbn.convert(samplerate=16000, n_channels=1)
                cbn.build([str(audiofile.get())], 'converted.wav',
                          'concatenate')
                fs, audio = wav.read('converted.wav')
            audio_length = len(audio) * (1 / 16000)
            resultpage = Toplevel(parent)
            resultpage.title("Result")
            result_border = ttk.Frame(resultpage, padding=(12, 12, 12, 12))
            result_border.pack()
            result_page = Frame(result_border, bg="white")
            result_page.pack()
            Tkinter.Label(result_page, text="What I've heard from you:",
                          font=14, bg="white").grid(row=1, column=1, sticky=E)
            Tkinter.Label(result_page, textvariable=word, font=12,
                          bg="white").grid(row=2, column=2, sticky=E)
            word.set(ds.stt(audio, fs))
    except ValueError:
        tkMessageBox.showerror("Error!", "Only 16000Hz WAV files supported!")
    except IOError:
        tkMessageBox.showerror("Error!", "No file uploaded!")
def main():
    ds = Model('./output_graph.pb', N_FEATURES, N_CONTEXT, './alphabet.txt',
               BEAM_WIDTH)
    r = sr.Recognizer()
    r.energy_threshold = 500
    with sr.Microphone(sample_rate=16000) as source:
        print('Say something!', file=sys.stdout)
        headDisplay.display_image(
            "/home/team18/Grasp-Detector-master/sawyer_head/what_fruit_would_you_like.JPG")
        audio_temp = r.listen(source)  # fs=44100
    print('Recording done!!!')
    with open("microphone-results.wav", "wb") as f:
        f.write(audio_temp.get_wav_data())
    time.sleep(1)
    fs, audio = wav.read('microphone-results.wav')
    theText = ds.stt(audio, fs)
    print(theText)
    final_value = -1
    if "av" in theText:
        final_value = 2
    elif "ap" in theText:
        final_value = 1
    elif "ba" in theText:
        final_value = 3
    elif "or" in theText:
        final_value = 5
    else:
        final_value = 7
    # Text mode ("w"), since we write a str, not bytes.
    with open("finalvalue.txt", "w") as f:
        f.write(str(final_value))
    time.sleep(0.5)
def main():
    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    parser.add_argument('model', type=str,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('alphabet', type=str,
                        help='Path to the configuration file specifying the '
                             'alphabet used by the network')
    parser.add_argument('lm', type=str, nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('trie', type=str, nargs='?',
                        help='Path to the language model trie file created '
                             'with native_client/generate_trie')
    parser.add_argument('audio', type=str,
                        help='Path to the audio file to run (WAV format)')
    args = parser.parse_args()
    print('Loading model from file %s' % (args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)
    if args.lm and args.trie:
        print('Loading language model from files %s %s' %
              (args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end),
              file=sys.stderr)
    fs, audio = wav.read(args.audio)
    if fs != 16000:
        if fs < 16000:
            print('Warning: original sample rate (%d) is lower than 16kHz. '
                  'Up-sampling might produce erratic speech recognition.'
                  % (fs), file=sys.stderr)
        fs, audio = convert_samplerate(args.audio)
    audio_length = len(audio) * (1 / 16000)
    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length), file=sys.stderr)
class DeepSpeech:
    """Wrap DeepSpeech and provide the methods we need"""

    def __init__(self, settings):
        self.beam_width = 1024
        self.lm_weight = 1.75
        self.word_count_weight = 1.00
        self.valid_word_count_weight = 1.00
        self.n_features = 26
        self.n_context = 9
        self.alphabet = settings.get('alphabet')
        self.lm = settings.get('lm')
        self.trie = settings.get('trie')
        self.graph = settings.get('graph')

    def load_model(self):
        start = timer()
        self.model = Model(self.graph, self.n_features, self.n_context,
                           self.alphabet, self.beam_width)
        end = timer()
        print('Loaded model in %0.3fs.' % (end - start))
        if self.lm is not None and self.trie is not None:
            start = timer()
            self.model.enableDecoderWithLM(
                self.alphabet, self.lm, self.trie, self.lm_weight,
                self.word_count_weight, self.valid_word_count_weight)
            end = timer()
            print('Loaded language model in %0.3fs.' % (end - start))

    def oneshoot(self, wav_file):
        fs, audio = wav.read(wav_file)
        start = timer()
        result = self.model.stt(audio, fs)
        latency = timer() - start
        audio_length = len(audio) * (1 / 16000)
        return result, latency
class SpeechToText():
    def __init__(self, model_path):
        # Defined constants. See
        # https://github.com/mozilla/DeepSpeech/blob/master/native_client/python/client.py
        BEAM_WIDTH = 500
        LM_WEIGHT = 1.75
        WORD_COUNT_WEIGHT = 1.00
        VALID_WORD_COUNT_WEIGHT = 1.00
        N_FEATURES = 26
        N_CONTEXT = 9
        model = os.path.join(model_path, "output_graph.pb")
        alphabet = os.path.join(model_path, "alphabet.txt")
        lm = os.path.join(model_path, "lm.binary")
        trie = os.path.join(model_path, "trie")
        self.model = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
        self.model.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                                       WORD_COUNT_WEIGHT,
                                       VALID_WORD_COUNT_WEIGHT)

    def run(self, audio, fs):
        return self.model.stt(audio, fs)
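# A minimal driver for the class above (the model directory and test file
# are hypothetical; scipy.io.wavfile is assumed for reading):
import scipy.io.wavfile as wav

stt = SpeechToText('models/')
fs, audio = wav.read('test.wav')  # expected to be 16 kHz 16-bit PCM
print(stt.run(audio, fs))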
def recognize_deepspeech(audio):
    model = path.join(path.dirname(path.realpath(__file__)),
                      'models/output_graph.pb')
    alphabet = path.join(path.dirname(path.realpath(__file__)),
                         'models/alphabet.txt')
    lm = path.join(path.dirname(path.realpath(__file__)), 'models/lm.binary')
    trie = path.join(path.dirname(path.realpath(__file__)), 'models/trie')
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    if lm and trie:
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    # Keep the file path in `audio` and read the samples into a separate
    # variable, so convert_samplerate still receives a path (the original
    # overwrote `audio` with the sample array before passing it on).
    fs, samples = wav.read(audio)
    if fs != 16000:
        if fs < 16000:
            print('Warning: original sample rate (%d) is lower than 16kHz. '
                  'Up-sampling might produce erratic speech recognition.'
                  % (fs), file=sys.stderr)
        fs, samples = convert_samplerate(audio)
    audio_length = len(samples) * (1 / 16000)
    return ds.stt(samples, fs)
def main():
    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    parser.add_argument('model', type=str,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('audio', type=str,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('alphabet', type=str,
                        help='Path to the configuration file specifying the '
                             'alphabet used by the network')
    parser.add_argument('lm', type=str, nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('trie', type=str, nargs='?',
                        help='Path to the language model trie file created '
                             'with native_client/generate_trie')
    args = parser.parse_args()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    if args.lm and args.trie:
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    fs, audio = wav.read(args.audio)
    print(ds.stt(audio, fs))
print('rank: ', str(rank))
# Get Audio Filename
vf = file_list[_file_id]
print('file: ', vf)
print(' ')
file_path, file_name = os.path.split(vf)
folder_name = audio_dir + "/rank_" + str(rank)
try:
    os.makedirs(folder_name)
except OSError:
    print("Directory %s exists \n" % folder_name)

# model location / alphabet file
ds = Model('/home/ubuntu/deepspeech/models/output_graph.pb', 26, 9,
           '/home/ubuntu/deepspeech/models/alphabet.txt', 500)

# Audio to text
fs, audio = wav.read(vf)
processed_data = ds.stt(audio, fs)
# processed_data = ds.stt(audio.flatten(), fs)

# Save the per-file transcript
seperate_save = str(folder_name) + '-' + str(file_name) + '-data.txt'
with open(seperate_save, 'a+') as f:
    f.write(processed_data)

# Append to the combined transcript file
data_save = 'AudioData.txt'
with open(data_save, 'a+') as f:
    f.write(processed_data + '\r\r')

try:
    print('\nDeepSpeech says, "...' + str(processed_data) +
          '..."\n\nThe data has been stored in file: ' + str(data_save) + '\n')
except:
    pass  # best-effort console output; the transcript is already on disk
def deepspeech_main():
    # These constants control the beam search decoder

    # Beam width used in the CTC decoder when building candidate
    # transcriptions
    BEAM_WIDTH = 500
    # The alpha hyperparameter of the CTC decoder. Language Model weight
    LM_WEIGHT = 1.75
    # The beta hyperparameter of the CTC decoder. Word insertion weight
    # (penalty)
    WORD_COUNT_WEIGHT = 1.00
    # Valid word insertion weight. This is used to lessen the word insertion
    # penalty when the inserted word is part of the vocabulary
    VALID_WORD_COUNT_WEIGHT = 1.00

    # These constants are tied to the shape of the graph used (changing them
    # changes the geometry of the first layer), so make sure you use the same
    # constants that were used during training

    # Number of MFCC features to use
    N_FEATURES = 26
    # Size of the context window used for producing timesteps in the input
    # vector
    N_CONTEXT = 9

    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    parser.add_argument('model', type=str,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('audio', type=str,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('alphabet', type=str,
                        help='Path to the configuration file specifying the '
                             'alphabet used by the network')
    parser.add_argument('lm', type=str, nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('trie', type=str, nargs='?',
                        help='Path to the language model trie file created '
                             'with native_client/generate_trie')
    args = parser.parse_args()
    print('Loading model from file %s' % (args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)
    if args.lm and args.trie:
        print('Loading language model from files %s %s' %
              (args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end),
              file=sys.stderr)
    fs, audio = wav.read(args.audio)  # We can assume 16kHz
    audio_length = len(audio) * (1 / 16000)
    assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"
    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length), file=sys.stderr)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function

import sys
import time

import scipy.io.wavfile as wav
from deepspeech.model import Model

print('imports ok')
model2 = '/home/nvidia/DeepSpeech/data/ldc93s1/model/output_graph.pb'
micro2 = '/home/nvidia/DeepSpeech/data/ldc93s1/LDC93S1.wav'
ds = Model(model2, 26, 9)  # model path, cepstrum, context
print('Model ok')
while True:
    print('lecture wav')  # French: "reading wav"
    fs, audio = wav.read(micro2)
    print(ds.stt(audio, fs))
def main(options):
    # Ensure ffmpeg is around
    if not run_ffmpeg(['-version']):
        log.error(
            "ffmpeg needs to be available to strip audio from the video file.")
        exit(1)
    with NamedTemporaryFile(delete=True) as vid_file:
        log.info("Downloading %s - this might take a while." %
                 options.vid_url)
        response = get(options.vid_url, stream=True)
        total_length = response.headers.get("content-length")
        if total_length is None:  # no content length header
            log.info("Unknown length - can't predict how long this will take.")
            vid_file.write(response.content)
        else:
            bar = ProgressBar(max_value=int(total_length))
            dl = 0
            for data in response.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
                dl += len(data)
                vid_file.write(data)
                vid_file.flush()
                bar.update(dl)
        log.info("Download done. Stripping audio.")
        (wav_file, wav_file_name) = mkstemp('.wav')
        result = run_ffmpeg([
            "-y", "-i", vid_file.name, "-vn", "-acodec", "pcm_s16le",
            "-ar", "16000", "-ac", "1", wav_file_name
        ])
        if not result:
            close(wav_file)
            log.error("ffmpeg failed. Bailing.")
            exit(1)
    fs, audio = wav.read(wav_file_name)
    close(wav_file)
    log.info("Will write VTT to %s" % options.output)
    # Make sure the WAV is to code...
    log.info("Loading up WAV file...")
    if fs != 16000:
        log.error("Only 16000hz WAV files are usable.")
        exit(1)
    total_samples = len(audio)
    duration_hours, duration_minutes, duration_seconds = sample_index_to_time(
        len(audio))
    log.info("Approximate duration: %d:%02d:%02d" %
             (duration_hours, duration_minutes, duration_seconds))
    # Let's load up DeepSpeech and get it ready.
    log.info("Loading pre-trained DeepSpeech model...")
    root_model_dir = path.join(options.deepspeech_model_dir, MODEL_DIR)
    model = path.join(root_model_dir, MODEL_FILE)
    alphabet = path.join(root_model_dir, MODEL_ALPHABET)
    lang_model = path.join(root_model_dir, MODEL_LANG_MODEL)
    trie = path.join(root_model_dir, MODEL_TRIE)
    deepspeech = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    log.info("Done loading model.")
    log.info("Loading language model...")
    deepspeech.enableDecoderWithLM(alphabet, lang_model, trie, LM_WEIGHT,
                                   WORD_COUNT_WEIGHT,
                                   VALID_WORD_COUNT_WEIGHT)
    log.info("Done loading model.")
    playhead = 0
    out = WebVTTFile()
    bar = ProgressBar(max_value=total_samples)
    while playhead < (total_samples - 1):
        end_point = min(playhead + AUDIO_SEGMENT_SAMPLES, (total_samples - 1))
        segment = audio[playhead:end_point]
        inference = deepspeech.stt(segment, fs)
        log.debug("Inferred: %s" % inference)
        start_hours, start_minutes, start_seconds = sample_index_to_time(
            playhead)
        playhead = end_point
        end_hours, end_minutes, end_seconds = sample_index_to_time(playhead)
        if not inference or inference == "ah":
            continue
        for search, replace in INFERENCE_REPLACEMENTS.items():
            inference = sub(r"\b" + search + r"\b", replace, inference)
        inference = fill(inference, width=MAX_CAPTION_WIDTH)
        start = WebVTTTime(start_hours, start_minutes, start_seconds)
        end = WebVTTTime(end_hours, end_minutes, end_seconds)
        item = WebVTTItem(0, start, end, inference)
        out.append(item)
        bar.update(playhead)
    out.save(options.output, encoding="utf-8")
    out.clean_indexes()
    out.save(options.output, encoding="utf-8")
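# `sample_index_to_time` is an external helper in the script above; a
# plausible sketch assuming 16 kHz audio (an assumption, not the original):
def sample_index_to_time(sample_index, sample_rate=16000):
    """Convert a sample index to an (hours, minutes, seconds) tuple."""
    seconds = sample_index // sample_rate
    return seconds // 3600, (seconds % 3600) // 60, seconds % 60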
def main():
    # Use the following for defaults:
    #   model    /home/dalonlobo/deepspeech_models/models/output_graph.pb
    #   audio    /home/dalonlobo/deepspeech_models/models/2830-3980-0043.wav
    #   alphabet /home/dalonlobo/deepspeech_models/lm_models/alphabet.txt
    #   lm       /home/dalonlobo/deepspeech_models/lm_models/lm_o5.binary
    #   trie     /home/dalonlobo/deepspeech_models/lm_models/o5_trie
    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    parser.add_argument(
        'model', type=str, nargs='?',
        default='/home/dalonlobo/deepspeech_models/models/output_graph.pb',
        help='Path to the model (protocol buffer binary file)')
    parser.add_argument(
        'audio', type=str, nargs='?',
        default='/home/dalonlobo/deepspeech_models/models/2830-3980-0043.wav',
        help='Path to the audio file to run (WAV format)')
    parser.add_argument(
        'alphabet', type=str, nargs='?',
        default='/home/dalonlobo/deepspeech_models/lm_models/alphabet.txt',
        help='Path to the configuration file specifying the alphabet used '
             'by the network')
    parser.add_argument(
        'lm', type=str, nargs='?',
        default='/home/dalonlobo/deepspeech_models/lm_models/lm_o5.binary',
        help='Path to the language model binary file')
    parser.add_argument(
        'trie', type=str, nargs='?',
        default='/home/dalonlobo/deepspeech_models/lm_models/o5_trie',
        help='Path to the language model trie file created with '
             'native_client/generate_trie')
    args = parser.parse_args()
    print('Loading model from file %s' % (args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)
    if args.lm and args.trie:
        print('Loading language model from files %s %s' %
              (args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end),
              file=sys.stderr)
    fs, audio = wav.read(args.audio)  # We can assume 16kHz
    audio_length = len(audio) * (1 / 16000)
    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length), file=sys.stderr)
class DeepSpeechSTTPlugin(plugin.STTPlugin):
    """
    Speech-To-Text implementation which relies on the DeepSpeech API.
    """

    def __init__(self, *args, **kwargs):
        """
        Create Plugin Instance
        """
        plugin.STTPlugin.__init__(self, *args, **kwargs)
        self._logger = logging.getLogger(__name__)
        self._logger.info("Init DeepSpeech")
        self._logger.debug(str(self.profile))
        if not deepspeech_available:
            self._logger.warning("DeepSpeech import error!")
            # raise ImportError("DeepSpeech not installed!")
        self._logger.warning("This STT plugin doesn't have multilanguage "
                             "support!")
        # Beam width used in the CTC decoder when building candidate
        # transcriptions
        try:
            self._BEAM_WIDTH = self.profile["deepspeech"]["beam_width"]
        except KeyError:
            self._BEAM_WIDTH = 500
        # The alpha hyperparameter of the CTC decoder. Language Model weight
        try:
            self._LM_WEIGHT = self.profile["deepspeech"]["lm_weight"]
        except KeyError:
            self._LM_WEIGHT = 1.75
        # The beta hyperparameter of the CTC decoder. Word insertion weight
        # (penalty)
        try:
            self._WORD_COUNT_WEIGHT = self.profile["deepspeech"][
                "word_count_weight"]
        except KeyError:
            self._WORD_COUNT_WEIGHT = 1.00
        # Valid word insertion weight. This is used to lessen the word
        # insertion penalty when the inserted word is part of the vocabulary
        try:
            self._VALID_WORD_COUNT_WEIGHT = self.profile["deepspeech"][
                "valid_word_count_weight"]
        except KeyError:
            self._VALID_WORD_COUNT_WEIGHT = 1.00
        # These constants are tied to the shape of the graph used (changing
        # them changes the geometry of the first layer), so make sure you
        # use the same constants that were used during training
        # Number of MFCC features to use
        try:
            self._N_FEATURES = self.profile["deepspeech"]["n_features"]
        except KeyError:
            self._N_FEATURES = 26
        # Size of the context window used for producing timesteps in the
        # input vector
        try:
            self._N_CONTEXT = self.profile["deepspeech"]["n_context"]
        except KeyError:
            self._N_CONTEXT = 9
        # Only 16KHz files are currently supported
        try:
            self._FS = self.profile["deepspeech"]["fs"]
        except KeyError:
            self._FS = 16000
        # These are paths. They are required
        # Path to the model (protocol buffer binary file)
        self._MODEL = self.profile["deepspeech"]["model"]
        if not os.path.exists(self._MODEL):
            msg = ("DeepSpeech model '%s' does not exist! "
                   "Please make sure that you have set the "
                   "correct deepspeech: model in your profile.") % self._MODEL
            self._logger.error(msg)
            raise RuntimeError(msg)
        # Path to the configuration file specifying the alphabet used
        self._ALPHABET = self.profile["deepspeech"]["alphabet"]
        if not os.path.exists(self._ALPHABET):
            msg = ("DeepSpeech alphabet '%s' does not exist! "
                   "Please make sure that you have set the "
                   "correct deepspeech: alphabet in your profile."
                   ) % self._ALPHABET
            self._logger.error(msg)
            raise RuntimeError(msg)
        # Path to the language model binary file
        self._LM = self.profile["deepspeech"]["language_model"]
        if not os.path.exists(self._LM):
            msg = ("DeepSpeech language model '%s' does not exist! "
                   "Please make sure that you have set the correct "
                   "deepspeech: language_model in your profile.") % self._LM
            self._logger.error(msg)
            raise RuntimeError(msg)
        # Path to the language model trie file created with
        # native_client/generate_trie
        self._TRIE = self.profile["deepspeech"]["trie"]
        if not os.path.exists(self._TRIE):
            msg = ("DeepSpeech trie '%s' does not exist! "
                   "Please make sure that you have set the "
                   "correct deepspeech: trie in your profile.") % self._TRIE
            self._logger.error(msg)
            raise RuntimeError(msg)
        self._ds = Model(self._MODEL, self._N_FEATURES, self._N_CONTEXT,
                         self._ALPHABET, self._BEAM_WIDTH)
        self._ds.enableDecoderWithLM(self._ALPHABET, self._LM, self._TRIE,
                                     self._LM_WEIGHT,
                                     self._WORD_COUNT_WEIGHT,
                                     self._VALID_WORD_COUNT_WEIGHT)

    def transcribe(self, fp):
        """
        transcribe given audio file object fp and return the result.
        """
        fp.seek(0)
        fs, audio = wav.read(fp)  # We can assume 16kHz
        # audio_length = len(audio) * (1 / self._FS)
        assert fs == self._FS, (
            "Only %dHz input WAV files are supported for now!" % self._FS)
        text = self._ds.stt(audio, self._FS)
        transcribed = [text.upper()]
        return transcribed
class DeepSpeechSTTPlugin(plugin.STTPlugin):
    """
    Speech-To-Text implementation which relies on the DeepSpeech API.
    """

    def __init__(self, *args, **kwargs):
        """
        Create Plugin Instance
        """
        plugin.STTPlugin.__init__(self, *args, **kwargs)
        self._logger = logging.getLogger(__name__)
        if not deepspeech_available:
            self._logger.warning("DeepSpeech import error!")
            # raise ImportError("DeepSpeech not installed!")
        self._logger.warning("This STT plugin doesn't have multilanguage "
                             "support!")
        # Beam width used in the CTC decoder when building candidate
        # transcriptions
        try:
            self._BEAM_WIDTH = self.profile["deepspeech"]["beam_width"]
        except KeyError:
            self._BEAM_WIDTH = 500
        # The alpha hyperparameter of the CTC decoder. Language Model weight
        try:
            self._LM_WEIGHT = self.profile["deepspeech"]["lm_weight"]
        except KeyError:
            self._LM_WEIGHT = 1.75
        # The beta hyperparameter of the CTC decoder. Word insertion weight
        # (penalty)
        try:
            self._WORD_COUNT_WEIGHT = self.profile["deepspeech"][
                "word_count_weight"]
        except KeyError:
            self._WORD_COUNT_WEIGHT = 1.00
        # Valid word insertion weight. This is used to lessen the word
        # insertion penalty when the inserted word is part of the vocabulary
        try:
            self._VALID_WORD_COUNT_WEIGHT = self.profile["deepspeech"][
                "valid_word_count_weight"]
        except KeyError:
            self._VALID_WORD_COUNT_WEIGHT = 1.00
        # These constants are tied to the shape of the graph used (changing
        # them changes the geometry of the first layer), so make sure you use
        # the same constants that were used during training
        # Number of MFCC features to use
        try:
            self._N_FEATURES = self.profile["deepspeech"]["n_features"]
        except KeyError:
            self._N_FEATURES = 26
        # Size of the context window used for producing timesteps in the
        # input vector
        try:
            self._N_CONTEXT = self.profile["deepspeech"]["n_context"]
        except KeyError:
            self._N_CONTEXT = 9
        # Only 16KHz files are currently supported
        try:
            self._FS = self.profile["deepspeech"]["fs"]
        except KeyError:
            self._FS = 16000
        # Save the input for inspection? (The original assigned to a local
        # `_save_input` in the try branch; assign the attribute instead.)
        try:
            self._save_input = self.profile["deepspeech"]["save_input"]
        except KeyError:
            self._save_input = False
        # These are paths. They are required
        # Path to the model (protocol buffer binary file)
        self._MODEL = self.profile["deepspeech"]["model"]
        if not os.path.exists(self._MODEL):
            msg = ("DeepSpeech model '%s' does not exist! Please make sure "
                   "that you have set the correct deepspeech: model in your "
                   "profile.") % self._MODEL
            self._logger.error(msg)
            raise RuntimeError(msg)
        # Path to the configuration file specifying the alphabet used
        self._ALPHABET = self.profile["deepspeech"]["alphabet"]
        if not os.path.exists(self._ALPHABET):
            msg = ("DeepSpeech alphabet '%s' does not exist! Please make "
                   "sure that you have set the correct deepspeech: alphabet "
                   "in your profile.") % self._ALPHABET
            self._logger.error(msg)
            raise RuntimeError(msg)
        # Path to the language model binary file
        self._LM = self.profile["deepspeech"]["language_model"]
        if not os.path.exists(self._LM):
            msg = ("DeepSpeech language model '%s' does not exist! Please "
                   "make sure that you have set the correct deepspeech: "
                   "language_model in your profile.") % self._LM
            self._logger.error(msg)
            raise RuntimeError(msg)
        # Path to the language model trie file created with
        # native_client/generate_trie
        self._TRIE = self.profile["deepspeech"]["trie"]
        if not os.path.exists(self._TRIE):
            msg = ("DeepSpeech trie '%s' does not exist! Please make sure "
                   "that you have set the correct deepspeech: trie in your "
                   "profile.") % self._TRIE
            self._logger.error(msg)
            raise RuntimeError(msg)
        self._ds = Model(self._MODEL, self._N_FEATURES, self._N_CONTEXT,
                         self._ALPHABET, self._BEAM_WIDTH)
        self._ds.enableDecoderWithLM(self._ALPHABET, self._LM, self._TRIE,
                                     self._LM_WEIGHT,
                                     self._WORD_COUNT_WEIGHT,
                                     self._VALID_WORD_COUNT_WEIGHT)
        # Create the audiolog if it does not exist
        self._audiolog = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "audiolog")
        if not os.path.exists(self._audiolog):
            os.makedirs(self._audiolog)
        # Clear the audiolog
        files = os.listdir(self._audiolog)
        for file in files:
            if file.endswith(".wav"):
                self._logger.info("to delete: %s" %
                                  os.path.join(self._audiolog, file))
                os.remove(os.path.join(self._audiolog, file))
        self._filecount = 0

    def transcribe(self, fp):
        """
        transcribe given audio file object fp and return the result.
        """
        fs, audio = wav.read(fp)  # We can assume 16kHz
        audio_length = len(audio) * (1 / self._FS)
        assert fs == self._FS, \
            "Only %dHz input WAV files are supported for now!" % self._FS
        text = self._ds.stt(audio, self._FS)
        transcribed = [text.upper()]
        print('>> %r' % transcribed)
        # write the input audio to a log file
        if self._save_input and not transcribed == ['']:
            self._filecount += 1
            fp.seek(0)  # wav.read consumed the stream; rewind before copying
            with open(os.path.join(self._audiolog,
                                   "%d_%s.wav" % (self._filecount, text)),
                      "wb") as f:  # binary mode; we copy raw WAV bytes
                f.write(fp.read())
        return transcribed