def main():
    """Run DeepSpeech inference on two WAV files through interleaved streams.

    Command-line options supply the model, alphabet, optional language model
    and trie, and the two audio files. Each audio buffer is split into 10
    parts that are fed alternately into two streams created on the same
    model; both final transcripts are printed.
    """
    parser = argparse.ArgumentParser(
        description='Running DeepSpeech inference.')
    parser.add_argument(
        '--model', required=True,
        help='Path to the model (protocol buffer binary file)')
    parser.add_argument(
        '--alphabet', required=True,
        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument(
        '--lm', nargs='?',
        help='Path to the language model binary file')
    parser.add_argument(
        '--trie', nargs='?',
        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument(
        '--audio1', required=True,
        help='First audio file to use in interleaved streams')
    parser.add_argument(
        '--audio2', required=True,
        help='Second audio file to use in interleaved streams')
    args = parser.parse_args()

    ds = Model(args.model, args.alphabet, BEAM_WIDTH)

    # The language-model decoder is optional: both files must be supplied.
    if args.lm and args.trie:
        ds.enableDecoderWithLM(args.lm, args.trie, LM_ALPHA, LM_BETA)

    # Context managers close the WAV handles even if reading fails.
    with wave.open(args.audio1, 'rb') as fin:
        audio1 = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    with wave.open(args.audio2, 'rb') as fin:
        audio2 = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    stream1 = ds.createStream()
    stream2 = ds.createStream()

    # Alternate between the two streams, one part at a time.
    splits1 = np.array_split(audio1, 10)
    splits2 = np.array_split(audio2, 10)
    for part1, part2 in zip(splits1, splits2):
        ds.feedAudioContent(stream1, part1)
        ds.feedAudioContent(stream2, part2)

    print(ds.finishStream(stream1))
    print(ds.finishStream(stream2))
def setup_model(model_path, alphabet, lm, trie, features):
    """Create a DeepSpeech ``Model`` and optionally enable its LM decoder.

    Args:
        model_path: Path passed to ``Model``.
        alphabet: Only reported in the log message; not passed to ``Model``.
        lm: Language model path.
        trie: Trie path. The decoder is enabled only when both ``lm`` and
            ``trie`` are truthy.
        features: Object providing ``beam_width``, ``lm_alpha`` and
            ``lm_beta`` attributes.

    Returns:
        The constructed ``Model`` instance.
    """
    announcement = "creating model {} {} with features {}...".format(
        model_path, alphabet, features)
    log(announcement)

    ds_model = Model(model_path, features.beam_width)
    if not (lm and trie):
        log("model is ready.")
        return ds_model

    ds_model.enableDecoderWithLM(
        lm, trie, features.lm_alpha, features.lm_beta)
    log("model is ready.")
    return ds_model
def main(argv):
    """Transcribe the WAV file named by ``argv[0]`` and print the text.

    Prints a notice and returns early when ``argv`` is empty.
    """
    if not argv:
        print("No .wav File given.")
        return

    ds = Model(MODEL_FILE, 500)
    ds.enableDecoderWithLM(LANG_MODEL, TRIE_FILE, 1.50, 2.25)

    # wav.read yields a pair; only the second element (samples) is used.
    _rate, samples = wav.read(argv[0])
    transcript = ds.stt(samples)
    print(transcript)
def transcribe(args, filepath="", verbose=0):
    """Transcribe a WAV file with DeepSpeech and return text plus metadata.

    Args:
        args: Object with ``model``, ``beam_width``, ``lm``, ``trie``,
            ``lm_alpha`` and ``lm_beta`` attributes.
        filepath: Path of the WAV file to transcribe.
        verbose: Progress and timing messages go to stderr when > 0.

    Returns:
        dict with keys ``"sentence"`` (joined characters of the metadata
        items), ``"words"`` (result of ``words_from_metadata``),
        ``"characters"`` (the metadata object itself) and ``"confidence"``.
    """
    chatty = verbose > 0

    if chatty:
        print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, args.beam_width)
    if chatty:
        model_load_end = timer() - model_load_start
        print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    desired_sample_rate = ds.sampleRate()

    if args.lm and args.trie:
        if chatty:
            print('Loading language model from files {} {}'.format(
                args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.lm, args.trie, args.lm_alpha, args.lm_beta)
        if chatty:
            lm_load_end = timer() - lm_load_start
            print('Loaded language model in {:.3}s.'.format(lm_load_end),
                  file=sys.stderr)

    # The context manager closes the file even if reading or resampling fails.
    with wave.open(filepath, 'rb') as fin:
        fs_orig = fin.getframerate()
        n_frames = fin.getnframes()
        if fs_orig != desired_sample_rate:
            if chatty:
                print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(
                    fs_orig, desired_sample_rate), file=sys.stderr)
            _fs_new, audio = convert_samplerate(filepath, desired_sample_rate)
        else:
            audio = np.frombuffer(fin.readframes(n_frames), np.int16)

    # Duration uses the file's own rate: the frame count belongs to the
    # original file, not to the resampled audio.
    audio_length = n_frames / fs_orig

    if chatty:
        print('Running inference.', file=sys.stderr)
    inference_start = timer()
    audio_metadata = ds.sttWithMetadata(audio)
    if chatty:
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' %
              (inference_end, audio_length), file=sys.stderr)

    return {
        "sentence": "".join(item.character for item in audio_metadata.items),
        "words": words_from_metadata(audio_metadata),
        "characters": audio_metadata,
        "confidence": audio_metadata.confidence,
    }
class Tester(BaseTester):
    """Speech tester backed by a DeepSpeech model with an LM decoder.

    The model and language model are loaded once in ``__init__`` from the
    module-level ``args_model``, ``args_lm`` and ``args_trie`` paths.
    """

    name = 'DeepSpeech'
    audio_format = RATE16K_MONO_WAV

    def __init__(self, *args, **kwargs):
        """Check the model files exist, then load model and language model."""
        super(Tester, self).__init__(*args, **kwargs)

        # The alphabet file is not checked: this Model call takes none.
        files = [
            args_lm,
            args_trie,
            args_model,
        ]
        for f in files:
            assert os.path.isfile(f), 'File %s does not exist.' % f

        print('Loading model from file %s' % (args_model), file=sys.stderr)
        model_load_start = timer()
        self.ds = Model(args_model, BEAM_WIDTH)
        model_load_end = timer() - model_load_start
        print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

        print('Loading language model from files %s %s' % (args_lm, args_trie),
              file=sys.stderr)
        lm_load_start = timer()
        self.ds.enableDecoderWithLM(args_lm, args_trie, LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end),
              file=sys.stderr)

    def audio_to_text(self, fn):
        """Transcribe the 16 kHz WAV file ``fn`` and return the text.

        Raises:
            AssertionError: If the file's frame rate is not 16000.
        """
        # The context manager closes the file even when the assert fires.
        with wave.open(fn, 'rb') as fin:
            fs = fin.getframerate()
            assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"
            n_frames = fin.getnframes()
            audio = np.frombuffer(fin.readframes(n_frames), np.int16)
        audio_length = n_frames * (1. / fs)

        print('Running inference.', file=sys.stderr)
        inference_start = timer()
        text = self.ds.stt(audio)
        print('text:', text)
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' %
              (inference_end, audio_length), file=sys.stderr)
        return text
def main():
    """Command-line entry point: transcribe one WAV file with DeepSpeech.

    Loads the model (and the language model when both ``--lm`` and
    ``--trie`` are given), reads the audio, resamples it through
    ``convert_samplerate`` when its rate differs from ``SAMPLE_RATE``, and
    prints the transcript. Timing information is written to stderr.
    """
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--alphabet', required=True,
                        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument('--lm', nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('--trie', nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(args.lm, args.trie),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    # The context manager closes the file even if reading or resampling fails.
    with wave.open(args.audio, 'rb') as fin:
        fs_orig = fin.getframerate()
        n_frames = fin.getnframes()
        if fs_orig != SAMPLE_RATE:
            print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs_orig, SAMPLE_RATE), file=sys.stderr)
            fs, audio = convert_samplerate(args.audio)
        else:
            fs = fs_orig
            audio = np.frombuffer(fin.readframes(n_frames), np.int16)

    # Duration uses the file's own rate; SAMPLE_RATE is wrong for a
    # resampled file because the frame count belongs to the original.
    audio_length = n_frames / fs_orig

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, fs)))
    else:
        print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length), file=sys.stderr)
def load_models():
    """Build the DeepSpeech model from module-level paths and enable its LM.

    The time spent on each of the two steps is reported via
    ``logging.debug``.

    Returns:
        The ``Model`` instance with the language-model decoder enabled.
    """
    started = timer()
    ds = Model(MODEL_FILE, BEAM_WIDTH)
    model_seconds = timer() - started
    logging.debug('Loaded model in %0.3fs.' % (model_seconds))

    started = timer()
    ds.enableDecoderWithLM(LANGUAGE_MODEL, TRIE_FILE, LM_ALPHA, LM_BETA)
    lm_seconds = timer() - started
    logging.debug('Loaded language model in %0.3fs.' % (lm_seconds))

    return ds
def load_model(model_dir):
    """Load a DeepSpeech model and its language model from ``model_dir``.

    The directory is expected to hold ``output_graph.pbmm``, ``lm.binary``
    and ``trie``; those file names are joined onto ``model_dir``.

    Returns:
        The ``Model`` instance with the language-model decoder enabled.
    """
    beam_width = 500
    lm_alpha, lm_beta = 0.75, 1.85

    graph_file, trie_file, lm_file = (
        os.path.join(model_dir, leaf)
        for leaf in ('output_graph.pbmm', 'trie', 'lm.binary')
    )

    model = Model(graph_file, beam_width)
    model.enableDecoderWithLM(lm_file, trie_file, lm_alpha, lm_beta)
    return model
def main():
    """Command-line entry point: transcribe one WAV file with DeepSpeech.

    Loads the model (and the language model when both ``--lm`` and
    ``--trie`` are given), reads the audio, resamples it through
    ``convert_samplerate`` when its rate is not 16000, and prints the
    transcript. Timing information is written to stderr.
    """
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--alphabet', required=True,
                        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument('--lm', nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('--trie', nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(args.lm, args.trie),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    # The context manager closes the file even if reading or resampling fails.
    with wave.open(args.audio, 'rb') as fin:
        fs_orig = fin.getframerate()
        n_frames = fin.getnframes()
        if fs_orig != 16000:
            print('Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'.format(fs_orig), file=sys.stderr)
            fs, audio = convert_samplerate(args.audio)
        else:
            fs = fs_orig
            audio = np.frombuffer(fin.readframes(n_frames), np.int16)

    # Duration uses the file's own rate; a fixed 16000 is wrong for any
    # file that had to be resampled.
    audio_length = n_frames / fs_orig

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, fs)))
    else:
        print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length), file=sys.stderr)
def loadModel():
    """Load the DeepSpeech model and language model named by module globals.

    Reads ``modelFile``, ``alphabetFile``, ``lmFile`` and ``trieFIle`` and
    reports progress and timings on stderr.

    Returns:
        The ``Model`` instance with the language-model decoder enabled.
    """
    err = sys.stderr

    print('Loading model from file {}'.format(modelFile), file=err)
    tick = timer()
    ds = Model(modelFile, N_FEATURES, N_CONTEXT, alphabetFile, BEAM_WIDTH)
    model_elapsed = timer() - tick
    print('Loaded model in {:.3}s.'.format(model_elapsed), file=err)

    print('Loading language model from files {} {}'.format(lmFile, trieFIle),
          file=err)
    tick = timer()
    ds.enableDecoderWithLM(alphabetFile, lmFile, trieFIle,
                           LM_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    lm_elapsed = timer() - tick
    print('Loaded language model in {:.3}s.'.format(lm_elapsed), file=err)

    return ds
def load_deepspeech_model(model='deepspeech-0.5.1-models/output_graph.pb',
                          alphabet='deepspeech-0.5.1-models/alphabet.txt',
                          lm='deepspeech-0.5.1-models/lm.binary',
                          trie='models/trie'):
    """Load a DeepSpeech model and, when given, its language model.

    Args:
        model: Path of the model graph.
        alphabet: Path of the alphabet file.
        lm: Path of the language model binary.
        trie: Path of the trie file. The decoder is enabled only when both
            ``lm`` and ``trie`` are truthy.

    Returns:
        The constructed ``Model`` instance.
    """
    # NOTE(review): the trie default lives under 'models/' while the other
    # defaults live under 'deepspeech-0.5.1-models/'; confirm it is intended.
    err = sys.stderr

    print('Loading model from file {}'.format(model), file=err)
    tick = timer()
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    model_elapsed = timer() - tick
    print('Loaded model in {:.3}s.'.format(model_elapsed), file=err)

    if not (lm and trie):
        return ds

    print('Loading language model from files {} {}'.format(lm, trie), file=err)
    tick = timer()
    ds.enableDecoderWithLM(alphabet, lm, trie, LM_ALPHA, LM_BETA)
    lm_elapsed = timer() - tick
    print('Loaded language model in {:.3}s.'.format(lm_elapsed), file=err)
    return ds
def load_model(models, lm, trie):
    """Load a DeepSpeech model with its language model.

    Args:
        models: Path of the model graph.
        lm: Path of the language model binary.
        trie: Path of the trie file.

    Returns:
        list: ``[model, sample_rate]``, where ``sample_rate`` is what the
        model's ``sampleRate()`` reports.
    """
    beam_width = 500
    lm_alpha, lm_beta = 0.75, 1.85

    engine = Model(models, beam_width)
    engine.enableDecoderWithLM(lm, trie, lm_alpha, lm_beta)
    return [engine, engine.sampleRate()]
class DeepSpeech:
    """Wrapper around a DeepSpeech model loaded from one directory.

    The directory must contain ``output_graph.pbmm``, ``alphabet.txt``,
    ``lm.binary`` and ``trie``.
    """

    def __init__(self, model_path):
        """Load the model and language model found under ``model_path``."""
        self.model = model_path + '/output_graph.pbmm'
        self.alphabet = model_path + '/alphabet.txt'
        self.lm = model_path + '/lm.binary'
        self.trie = model_path + '/trie'

        self.ds = Model(self.model, N_FEATURES, N_CONTEXT, self.alphabet,
                        BEAM_WIDTH)
        # Both paths were just built as non-empty strings, so the decoder is
        # always enabled; no truthiness guard is needed.
        self.ds.enableDecoderWithLM(self.alphabet, self.lm, self.trie,
                                    LM_ALPHA, LM_BETA)

    def recognize(self, wav_file):
        """Return the transcript of ``wav_file``.

        Audio whose frame rate is not 16000 is first passed through
        ``convert_samplerate``; otherwise the frames are read as int16.
        """
        # The context manager closes the file even if reading or resampling
        # fails.
        with wave.open(wav_file, 'rb') as fin:
            fs = fin.getframerate()
            if fs != 16000:
                fs, audio = convert_samplerate(wav_file)
            else:
                audio = np.frombuffer(fin.readframes(fin.getnframes()),
                                      np.int16)
        return self.ds.stt(audio, fs)
def transcribe(self, audio):
    """Transcribe ``audio`` with a model built from ``ConfigDeepSpeech``.

    The model, language model and trie paths are read from the
    configuration on every call; the model and trie paths are printed.

    Returns:
        The text returned by the model's ``stt``.
    """
    settings = config.ConfigDeepSpeech()

    model_path = settings.get_config('model')
    print(model_path)
    lm_path = settings.get_config('lm')
    trie_path = settings.get_config('trie')
    print(trie_path)

    engine = Model(model_path, BEAM_WIDTH)
    # The language-model decoder is optional: both paths must be configured.
    if lm_path and trie_path:
        engine.enableDecoderWithLM(lm_path, trie_path, LM_ALPHA, LM_BETA)

    return engine.stt(audio)
def load_model(models, alphabet, lm, trie):
    """Load a DeepSpeech model and language model, timing both steps.

    Args:
        models: Path of the model graph.
        alphabet: Path of the alphabet file.
        lm: Path of the language model binary.
        trie: Path of the trie file.

    Returns:
        list: ``[model, model_load_seconds, lm_load_seconds]``.
    """
    beam_width = 500
    lm_alpha, lm_beta = 0.75, 1.85

    tick = timer()
    ds = Model(models, alphabet, beam_width)
    model_seconds = timer() - tick
    logging.debug("Loaded model in %0.3fs." % (model_seconds))

    tick = timer()
    ds.enableDecoderWithLM(lm, trie, lm_alpha, lm_beta)
    lm_seconds = timer() - tick
    logging.debug('Loaded language model in %0.3fs.' % (lm_seconds))

    return [ds, model_seconds, lm_seconds]
def build_model(self, model_path):
    """Build the DeepSpeech model used for adversarial sample evaluation.

    Args:
        model_path: Directory containing a ``models`` sub-directory with
            ``output_graph.pb``, ``alphabet.txt``, ``lm.binary`` and
            ``trie``.

    Returns:
        The ``Model`` instance with the language-model decoder enabled.
    """
    beam_width = 500
    lm_alpha, lm_beta = 0.75, 1.85
    n_features, n_context = 26, 9

    graph_file = model_path + '/models/output_graph.pb'
    alphabet_file = model_path + '/models/alphabet.txt'
    lm_file = model_path + '/models/lm.binary'
    trie_file = model_path + '/models/trie'

    engine = Model(graph_file, n_features, n_context, alphabet_file,
                   beam_width)
    engine.enableDecoderWithLM(alphabet_file, lm_file, trie_file,
                               lm_alpha, lm_beta)
    return engine
def load_ds_model(model_path, alphabet_path, lm_path=None, trie_path=None,
                  n_features=26, n_context=9, beam_width=500,
                  lm_weight=1.50, valid_word_count_weight=2.10):
    """Load a DeepSpeech model, optionally with a language model.

    Args:
        model_path: Path of the model graph.
        alphabet_path: Path of the alphabet file.
        lm_path: Optional language model path.
        trie_path: Optional trie path. The decoder is enabled only when
            both ``lm_path`` and ``trie_path`` are truthy.
        n_features, n_context, beam_width: Forwarded to ``Model``.
        lm_weight, valid_word_count_weight: Forwarded to
            ``enableDecoderWithLM``.

    Returns:
        The constructed ``Model`` instance.
    """
    banner = (
        f'loading DeepSpeech model from {model_path}, using alphabet at {alphabet_path}, '
        f'LM at {lm_path} and trie at {trie_path}'
    )
    print(banner)

    ds = Model(model_path, n_features, n_context, alphabet_path, beam_width)
    if not (lm_path and trie_path):
        return ds

    ds.enableDecoderWithLM(alphabet_path, lm_path, trie_path, lm_weight,
                           valid_word_count_weight)
    return ds
def load_model():
    """Load the TFLite DeepSpeech model and language model from ``models/``.

    Returns:
        list: ``[model, sample_rate]``, where ``sample_rate`` is what the
        model's ``sampleRate()`` reports.
    """
    graph_file = "models/output_graph.tflite"
    lm_file = "models/lm.binary"
    trie_file = "models/trie"

    beam_width = 500
    lm_alpha, lm_beta = 0.75, 1.85

    engine = Model(graph_file, beam_width)
    engine.enableDecoderWithLM(lm_file, trie_file, lm_alpha, lm_beta)
    return [engine, engine.sampleRate()]
def load_model():
    """Populate the module globals ``model`` and ``ds``.

    ``model`` is built by ``predict_model`` from a hard-coded checkpoint
    path; ``ds`` is a DeepSpeech ``Model`` with its language-model decoder
    enabled, loaded from hard-coded paths.
    """
    global model, ds

    # Alternative checkpoints that were used previously:
    #   /home/mwang/Development/deep-learning/SincNet/exp/SincNet_lifesize/model_raw.pkl.lifesize
    #   /home/mwang/Development/deep-learning/SincNet/exp/SincNet_libri/model_raw.pkl.amazon
    #   /home/mwang/Development/deep-learning/SincNet/exp/SincNet_libri/model_raw.pkl.my_desktop
    model = predict_model(
        "/home/mwang/Development/deep-learning/SincNet/model_raw.pkl.lifesize")

    graph_file = "/home/mwang/Development/deep-learning/stt/models/output_graph.pbmm"
    alphabet_file = "/home/mwang/Development/deep-learning/stt/models/alphabet.txt"
    lm_file = "/home/mwang/Development/deep-learning/stt/models/lm.binary"
    trie_file = "/home/mwang/Development/deep-learning/stt/models/trie"

    ds = Model(graph_file, N_FEATURES, N_CONTEXT, alphabet_file, BEAM_WIDTH)
    ds.enableDecoderWithLM(alphabet_file, lm_file, trie_file,
                           LM_ALPHA, LM_BETA)
def load_model(models, alphabet, lm, trie):
    """Load a DeepSpeech model and language model, timing both steps.

    Args:
        models: Path of the model graph.
        alphabet: Path of the alphabet file.
        lm: Path of the language model binary.
        trie: Path of the trie file.

    Returns:
        list: ``[model, model_load_seconds, lm_load_seconds]``.
    """
    n_features, n_context = 26, 9
    beam_width = 500
    lm_weight = 1.50
    valid_word_count_weight = 2.10

    tick = timer()
    ds = Model(models, n_features, n_context, alphabet, beam_width)
    model_seconds = timer() - tick
    logging.debug("Loaded model in %0.3fs." % (model_seconds))

    tick = timer()
    ds.enableDecoderWithLM(alphabet, lm, trie, lm_weight,
                           valid_word_count_weight)
    lm_seconds = timer() - tick
    logging.debug('Loaded language model in %0.3fs.' % (lm_seconds))

    return [ds, model_seconds, lm_seconds]
def tflite_worker(model, alphabet, lm, trie, queue_in, queue_out, gpu_mask):
    """Worker loop: transcribe WAV files taken from ``queue_in``.

    Args:
        model, alphabet, lm, trie: Paths used to build the DeepSpeech model
            and enable its language-model decoder.
        queue_in: Queue of dicts with ``'filename'`` and ``'transcript'``
            keys; ``task_done()`` is called after each item.
        queue_out: Queue receiving dicts with ``'prediction'`` and
            ``'ground_truth'`` keys.
        gpu_mask: Value written to ``CUDA_VISIBLE_DEVICES`` before the model
            is created.

    The loop never returns.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_mask)
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    ds.enableDecoderWithLM(alphabet, lm, trie, LM_ALPHA, LM_BETA)

    while True:
        msg = queue_in.get()

        # The context manager closes the file even if reading fails.
        with wave.open(msg['filename'], 'rb') as fin:
            fs = fin.getframerate()
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

        decoded = ds.stt(audio, fs)

        queue_out.put({'prediction': decoded,
                       'ground_truth': msg['transcript']})
        queue_in.task_done()
def load_model(models, alphabet, lm, trie):
    """Load a DeepSpeech model and language model, timing both steps.

    Args:
        models: Path of the model graph.
        alphabet: Path of the alphabet file.
        lm: Path of the language model binary.
        trie: Path of the trie file.

    Returns:
        list: ``[model, model_load_seconds, lm_load_seconds]``.
    """
    n_features, n_context = 26, 9
    beam_width = 500
    lm_alpha, lm_beta = 0.75, 1.85

    tick = timer()
    ds = Model(models, n_features, n_context, alphabet, beam_width)
    model_seconds = timer() - tick
    logging.debug("Loaded model in %0.3fs." % (model_seconds))

    tick = timer()
    ds.enableDecoderWithLM(alphabet, lm, trie, lm_alpha, lm_beta)
    lm_seconds = timer() - tick
    logging.debug('Loaded language model in %0.3fs.' % (lm_seconds))

    return [ds, model_seconds, lm_seconds]
def main():
    """Flask view: transcribe an uploaded WAV file and score its sentiment.

    For a POST request carrying an ``audio`` file, the response JSON has
    ``success`` set to True, ``results`` holding the transcript in a
    one-element list, and ``sentiment`` holding
    ``sentimentanalysis.get_score(text)``. Any other request gets
    ``{"success": False}``.
    """
    # Response payload; stays as-is unless a transcription succeeds.
    data = {"success": False}

    if flask.request.method != "POST":
        return flask.jsonify(data)

    upload = flask.request.files.get("audio")
    if not upload:
        return flask.jsonify(data)

    # The context manager closes the reader even if decoding fails.
    with wave.open(upload, 'rb') as fin:
        fs = fin.getframerate()
        n_frames = fin.getnframes()
        audio = np.frombuffer(fin.readframes(n_frames), np.int16)
    # Duration uses the file's own rate rather than an assumed 16000.
    audio_length = n_frames / fs

    # NOTE(review): the model is rebuilt on every request; consider loading
    # it once at start-up if request latency matters.
    print('Loading model from file', file=sys.stderr)
    model_load_start = timer()
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    lm_load_start = timer()
    ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                           VALID_WORD_COUNT_WEIGHT)
    lm_load_end = timer() - lm_load_start
    # This timing is for the language model, so the message says so.
    print('Loaded language model in {:.3}s.'.format(lm_load_end),
          file=sys.stderr)

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    text = ds.stt(audio, fs)
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length), file=sys.stderr)

    data["results"] = [text]
    data["success"] = True
    data["sentiment"] = sentimentanalysis.get_score(text)

    return flask.jsonify(data)
def load_model(models, lm, trie):
    """Load a DeepSpeech model and language model, timing both steps.

    Args:
        models: Path of the model graph.
        lm: Path of the language model binary.
        trie: Path of the trie file.

    Returns:
        list: ``[model, model_load_seconds, lm_load_seconds, sample_rate]``,
        where ``sample_rate`` is what the model's ``sampleRate()`` reports.
    """
    beam_width = 500
    lm_alpha, lm_beta = 0.75, 1.85

    tick = timer()
    ds = Model(models, beam_width)
    model_seconds = timer() - tick
    logging.debug("Loaded model in %0.3fs." % (model_seconds))

    tick = timer()
    ds.enableDecoderWithLM(lm, trie, lm_alpha, lm_beta)
    lm_seconds = timer() - tick
    logging.debug('Loaded language model in %0.3fs.' % (lm_seconds))

    rate = ds.sampleRate()
    logging.debug('Loaded model sample rate: %dHz.' % (rate))

    return [ds, model_seconds, lm_seconds, rate]
def speechRec(audio_data):
    """Transcribe in-memory WAV bytes with the DeepSpeech 0.6.1 model.

    Args:
        audio_data: Bytes of a complete WAV file.

    Returns:
        The text returned by the model's ``stt``.

    The model files are read from the ``deepspeech-0.6.1-models`` directory
    and the model is rebuilt on every call.
    """
    import io  # local import: only needed to wrap the bytes for ``wave``

    beam_width = 500
    lm_alpha = 0.75
    lm_beta = 1.85

    data_folder = Path('deepspeech-0.6.1-models')
    model_name = str(data_folder / "output_graph.pbmm")
    language_model = str(data_folder / "lm.binary")
    trie = str(data_folder / "trie")

    # Decode straight from memory. A fixed 'temp.wav' in the working
    # directory would be shared by concurrent calls and has to be cleaned up
    # afterwards. Decoding first also rejects malformed input before the
    # expensive model load.
    with wave.open(io.BytesIO(audio_data), 'rb') as fin:
        fs = fin.getframerate()
        print("Framerate: ", fs)
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    ds = Model(model_name, beam_width)
    ds.enableDecoderWithLM(language_model, trie, lm_alpha, lm_beta)

    return ds.stt(audio)
def DeepSpeech(Window, SpeechToNLPQueue):
    """Record from the microphone, transcribe it, and publish the result.

    Args:
        Window: GUI object whose ``UpdateSpeechBox`` and ``UpdateMsgBox``
            slots are connected to newly created signals, and which is
            handed to ``MicrophoneStream``.
        SpeechToNLPQueue: Queue that receives the resulting
            ``SpeechNLPItem``.
    """
    # Signal objects wired to the GUI.
    SpeechSignal = GUISignal()
    SpeechSignal.signal.connect(Window.UpdateSpeechBox)

    MsgSignal = GUISignal()
    MsgSignal.signal.connect(Window.UpdateMsgBox)

    # Model file locations.
    model = 'DeepSpeech_Models/output_graph.pbmm'
    alphabet = 'DeepSpeech_Models/alphabet.txt'
    lm = 'DeepSpeech_Models/lm.binary'
    trie = 'DeepSpeech_Models/trie'

    print('Loading model from file {}'.format(model), file=sys.stderr)
    tick = timer()
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    model_elapsed = timer() - tick
    print('Loaded model in {:.3}s.'.format(model_elapsed), file=sys.stderr)

    if lm and trie:
        print('Loading language model from files {} {}'.format(lm, trie),
              file=sys.stderr)
        tick = timer()
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_ALPHA, LM_BETA)
        lm_elapsed = timer() - tick
        print('Loaded language model in {:.3}s.'.format(lm_elapsed),
              file=sys.stderr)

    # NOTE(review): assumes transcription runs once, after the generator is
    # exhausted; confirm against the original indentation.
    audio = []
    with MicrophoneStream(Window, RATE, CHUNK) as stream:
        for content in stream.generator():
            audio.extend(content)

        result = ds.stt(audio, 16000)
        QueueItem = SpeechNLPItem(result, True, 0, 0, 'Speech')
        SpeechToNLPQueue.put(QueueItem)
        SpeechSignal.signal.emit([QueueItem])
def tflite_worker(args, queue_in, queue_out, gpu_mask):
    """Worker loop: transcribe WAV files taken from ``queue_in``.

    Args:
        args: Object with ``model``, ``alphabet``, ``beam_width``, ``lm``,
            ``trie``, ``lm_alpha`` and ``lm_beta`` attributes.
        queue_in: Queue of dicts with ``'filename'`` and ``'transcript'``
            keys; ``task_done()`` is called after each item.
        queue_out: Queue receiving dicts with ``'wav'`` (file name without
            directory or extension), ``'prediction'`` and ``'ground_truth'``.
        gpu_mask: Value written to ``CUDA_VISIBLE_DEVICES`` before the model
            is created.

    The loop never returns.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_mask)
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet,
               args.beam_width)
    ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie,
                           args.lm_alpha, args.lm_beta)

    while True:
        msg = queue_in.get()

        filename = msg['filename']
        wavname = os.path.splitext(os.path.basename(filename))[0]

        # The context manager closes the file even if reading fails.
        with wave.open(filename, 'rb') as fin:
            fs = fin.getframerate()
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

        decoded = ds.stt(audio, fs)

        queue_out.put({'wav': wavname, 'prediction': decoded,
                       'ground_truth': msg['transcript']})
        print(queue_out.qsize(), end='\r')  # Update the current progress
        queue_in.task_done()
class Tester(BaseTester):
    """Speech tester backed by a DeepSpeech model with an LM decoder.

    The model and language model are loaded once in ``__init__`` from the
    module-level ``args_model``, ``args_alphabet``, ``args_lm`` and
    ``args_trie`` paths.
    """

    name = 'DeepSpeech'
    audio_format = RATE16K_MONO_WAV

    def __init__(self, *args, **kwargs):
        """Check the model files exist, then load model and language model."""
        super(Tester, self).__init__(*args, **kwargs)

        files = [args_lm, args_trie, args_model, args_alphabet]
        for f in files:
            assert os.path.isfile(f)

        print('Loading model from file %s' % (args_model), file=sys.stderr)
        model_load_start = timer()
        self.ds = Model(args_model, N_FEATURES, N_CONTEXT, args_alphabet,
                        BEAM_WIDTH)
        model_load_end = timer() - model_load_start
        print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

        if args_lm and args_trie:
            print('Loading language model from files %s %s' %
                  (args_lm, args_trie), file=sys.stderr)
            lm_load_start = timer()
            self.ds.enableDecoderWithLM(args_alphabet, args_lm, args_trie,
                                        LM_ALPHA, LM_BETA)
            lm_load_end = timer() - lm_load_start
            print('Loaded language model in %0.3fs.' % (lm_load_end),
                  file=sys.stderr)

    def audio_to_text(self, fn):
        """Transcribe the 16 kHz WAV file ``fn`` and return the text.

        Raises:
            AssertionError: If the file's frame rate is not 16000.
        """
        # The context manager closes the file even when the assert fires.
        with wave.open(fn, 'rb') as fin:
            fs = fin.getframerate()
            assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"
            n_frames = fin.getnframes()
            audio = np.frombuffer(fin.readframes(n_frames), np.int16)
        audio_length = n_frames * (1/16000)

        print('Running inference.', file=sys.stderr)
        inference_start = timer()
        text = self.ds.stt(audio, fs)
        print('text:', text)
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' %
              (inference_end, audio_length), file=sys.stderr)
        return text
class Speech_Deepspeech:
    """Callable wrapper around a DeepSpeech model.

    Constructor arguments:
        model: model path
        alphabet: alphabet file
        lm: language model file
        trie: trie file

    Calling the instance with a WAV file path returns the transcript.
    """

    def __init__(self, model, alphabet, lm, trie):
        # Imported lazily so the module loads without deepspeech installed.
        from deepspeech import Model as DSModel

        n_features, n_context, beam_width = 26, 9, 500
        lm_alpha, lm_beta = 0.75, 1.85

        self.ds_model = DSModel(model, n_features, n_context, alphabet,
                                beam_width)
        self.ds_model.enableDecoderWithLM(alphabet, lm, trie, lm_alpha,
                                          lm_beta)

    def __call__(self, wavfile):
        """Read ``wavfile`` as int16 samples and return the model's text."""
        with wave.open(wavfile, "rb") as reader:
            rate = reader.getframerate()
            raw = reader.readframes(reader.getnframes())
            samples = numpy.frombuffer(raw, numpy.int16)
        return self.ds_model.stt(samples, rate)
def tflite_worker(model, alphabet, lm, trie, queue_in, queue_out, gpu_mask):
    """Worker loop: transcribe WAV files taken from ``queue_in``.

    Args:
        model, alphabet, lm, trie: Paths used to build the DeepSpeech model
            and enable its language-model decoder.
        queue_in: Queue of dicts with a ``'filename'`` key; ``task_done()``
            is called after each item.
        queue_out: Queue receiving dicts with ``'wav'`` (file name without
            directory or extension) and ``'prediction'``.
        gpu_mask: Value written to ``CUDA_VISIBLE_DEVICES`` before the model
            is created.

    The loop never returns.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_mask)
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    ds.enableDecoderWithLM(alphabet, lm, trie, LM_ALPHA, LM_BETA)

    while True:
        msg = queue_in.get()

        filename = msg['filename']
        wavname = os.path.splitext(os.path.basename(filename))[0]

        # The context manager closes the file even if reading fails.
        with wave.open(filename, 'rb') as fin:
            fs = fin.getframerate()
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

        decoded = ds.stt(audio, fs)

        queue_out.put({'wav': wavname, 'prediction': decoded})
        print(queue_out.qsize(), end='\r')  # Update the current progress
        queue_in.task_done()
def load_model(models, alphabet, lm, trie):
    """Load the pre-trained model and its language model into memory.

    :param models: Output graph protocol buffer file
    :param alphabet: Alphabet file; accepted but not used by this function
    :param lm: Language model file
    :param trie: Trie file
    :return: the DeepSpeech ``Model`` object with the LM decoder enabled
    """
    beam_width = 500
    # LM_ALPHA was 0.75 previously; it is now 1.
    lm_alpha = 1
    lm_beta = 1.85

    engine = Model(models, beam_width)
    engine.enableDecoderWithLM(lm, trie, lm_alpha, lm_beta)
    return engine
def create_model(path, config):
    """Create a DeepSpeech model with its language-model decoder enabled.

    Args:
        path: Mapping with ``'model'`` (prefix to which the graph file name
            is appended directly, so it should end with a path separator)
            and ``'lm_path'`` (directory holding ``alphabet.txt``,
            ``lm.binary`` and ``trie``).
        config: Mapping with ``'beam_width'``, ``'lm_weight'`` and
            ``'w_weight'``.

    Returns:
        The constructed ``Model`` instance.
    """
    # Extract config
    model_prefix = path.get('model')
    lm_path = path.get('lm_path')
    beam_width = config.get('beam_width')
    lm_weight = config.get('lm_weight')
    w_weight = config.get('w_weight')
    n_features = 26
    n_context = 9

    # Prefer the .pbmm graph and fall back to the plain .pb graph only when
    # the .pbmm file is missing and the .pb file exists. An `or` between two
    # non-empty strings can never reach its second operand, so the fallback
    # has to be an explicit file check.
    pbmm_graph = model_prefix + 'output_graph.pbmm'
    pb_graph = model_prefix + 'output_graph.pb'
    if os.path.isfile(pbmm_graph) or not os.path.isfile(pb_graph):
        model = pbmm_graph
    else:
        model = pb_graph

    # Build the language-model paths
    alphabet = os.path.join(lm_path, 'alphabet.txt')
    lm = os.path.join(lm_path, 'lm.binary')
    trie = os.path.join(lm_path, 'trie')

    # Build the model
    ds = Model(model, n_features, n_context, alphabet, beam_width)
    ds.enableDecoderWithLM(alphabet, lm, trie, lm_weight, w_weight)
    return ds
def get_model(modeldir):
    """Load the DeepSpeech model and language model found under ``modeldir``.

    The files are expected at ``models/output_graph.pbmm``,
    ``models/alphabet.txt``, ``models/lm.binary`` and ``models/trie``
    relative to ``modeldir``. Progress and timings go to stderr.

    Returns:
        The constructed ``Model`` instance.
    """
    def under(relative):
        # Join with a literal '/' regardless of platform.
        return '/'.join((modeldir, relative))

    args = AttrDict({
        'model': under("models/output_graph.pbmm"),
        'alphabet': under("models/alphabet.txt"),
        'lm': under("models/lm.binary"),
        'trie': under("models/trie"),
    })
    err = sys.stderr

    print('Loading model from file {}'.format(args.model), file=err)
    tick = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_elapsed = timer() - tick
    print('Loaded model in {:.3}s.'.format(model_elapsed), file=err)

    if not (args.lm and args.trie):
        return ds

    print('Loading language model from files {} {}'.format(args.lm, args.trie),
          file=err)
    tick = timer()
    ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie,
                           LM_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    lm_elapsed = timer() - tick
    print('Loaded language model in {:.3}s.'.format(lm_elapsed), file=err)
    return ds