def ModelInitiate(model_file_path, lm_file_path, lm_alpha, lm_beta, beam_width):
    model = Model(model_file_path)
    model.enableExternalScorer(lm_file_path)
    model.setScorerAlphaBeta(lm_alpha, lm_beta)
    model.setBeamWidth(beam_width)
    return model
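A minimal usage sketch for the helper above, assuming `from deepspeech import Model` is in scope; the file names are placeholders, not paths from the original source:

import wave
import numpy as np

# Placeholder paths; substitute your own model/scorer files.
model = ModelInitiate("deepspeech-0.9.3-models.pbmm",
                      "deepspeech-0.9.3-models.scorer",
                      lm_alpha=0.93, lm_beta=1.18, beam_width=500)

# The model expects 16 kHz mono 16-bit PCM; read a matching WAV file.
with wave.open("audio.wav", "rb") as fin:
    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
print(model.stt(audio))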
def transcribe_many_parallel(args, filepaths):
    for index, filepath in enumerate(filepaths):
        ds = Model(args.model)
        if args.beam_width:
            ds.setBeamWidth(args.beam_width)
        if args.scorer:
            print('Loading scorer from files {}'.format(args.scorer), file=sys.stderr)
            scorer_load_start = timer()
            ds.enableExternalScorer(args.scorer)
            scorer_load_end = timer() - scorer_load_start
            print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

            if args.lm_alpha and args.lm_beta:
                ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

        if args.hot_words:
            print('Adding hot-words', file=sys.stderr)
            for word_boost in args.hot_words.split(','):
                word, boost = word_boost.split(':')
                ds.addHotWord(word, float(boost))

        # Note: joining immediately after start() runs the transcriptions
        # one after another in separate processes rather than concurrently.
        p = Process(target=transcribe_file, args=(args, ds, filepath, index))
        p.start()
        p.join()
        print('{}: Transcribed file {} of {} from "{}"'.format(
            time.strftime("%H:%M:%S", time.localtime()),
            index + 1, len(filepaths), filepath))
def recognize_DS(audio1, data):
    beam_width = 500  # how many alternative word sequences the decoder keeps while searching
    model_name = data['wake']['model name']
    ds = Model(model_name)
    ds.setBeamWidth(beam_width)
    audio1 = np.frombuffer(audio1.frame_data, np.int16)  # convert raw frames into a numpy array
    return ds.stt(audio1)  # return the predicted transcript
def create_deepspeech_model(args):
    ds = Model(args.model)
    if args.beam_width:
        ds.setBeamWidth(args.beam_width)
    if args.scorer:
        print('Loading scorer from files {}'.format(args.scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(args.scorer)
        scorer_load_end = timer() - scorer_load_start
        print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    if args.hot_words:
        print('Adding hot-words', file=sys.stderr)
        for word_boost in args.hot_words.split(','):
            word, boost = word_boost.split(':')
            ds.addHotWord(word, float(boost))
    return ds
def run():
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser(description='DeepSpeech Server')
    parser.add_argument('--port', default=3337, type=int,
                        help='Port to listen on')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer', required=False,
                        help='Path to the external scorer file')
    parser.add_argument('--beam_width', type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha', type=float,
                        help='Language model weight (lm_alpha). If not specified, use default from the scorer package.')
    parser.add_argument('--lm_beta', type=float,
                        help='Word insertion bonus (lm_beta). If not specified, use default from the scorer package.')
    parser.add_argument('--google_key',
                        help='Google Speech-Recognition API key.')
    args = parser.parse_args()

    ds = Model(args.model)
    if args.beam_width:
        ds.setBeamWidth(args.beam_width)
    # --scorer is optional, so only enable it when a path was given.
    if args.scorer:
        ds.enableExternalScorer(args.scorer)
        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    handler_class = ReqHandlerFactory(ds, args.google_key)
    server_address = ('', args.port)
    httpd = HTTPServer(server_address, handler_class)
    logging.info('Starting httpd...\n')
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        pass
    httpd.server_close()
    logging.info('Stopping httpd...\n')
def speech_to_text(input_file, file_length, return_speed_per_chunk=False, chunk_size=10):
    """
    Compute the words pronounced in the input_file.

    :param input_file: sound file path
    :param file_length: time length of the input file (in seconds)
    :param return_speed_per_chunk: if True, return a list of words per chunk;
        if False, return all the words in the extract as a single string
    :param chunk_size: chunk length in seconds
    :return: words as a string (or a list of strings, one per chunk)
    """
    # set up the model
    if return_speed_per_chunk:
        result = []
    else:
        result = ""
    recognizer = Model("models/deepspeech-0.8.2-models.pbmm")
    recognizer.setBeamWidth(2000)
    recognizer.enableExternalScorer("models/deepspeech-0.8.2-models.scorer")
    desired_sample_rate = recognizer.sampleRate()

    # split the input file into smaller audio chunks (apparently works better)
    CHUNK_SIZE = chunk_size
    n_chunks = int(file_length // CHUNK_SIZE)
    for i in range(n_chunks):
        tfm = sox.Transformer()
        tfm.trim(i * CHUNK_SIZE, (i + 1) * CHUNK_SIZE)
        tfm.set_output_format(channels=1)
        tfm.build(input_file, "temp_folder/chunked_file{}.wav".format(i))

        # pad each chunk with silence on both sides before decoding
        # cmb = sox.Combiner()
        input_list = [
            "audio-files/silence.wav",
            "temp_folder/chunked_file{}.wav".format(i),
            "audio-files/silence.wav"
        ]
        input_list_correct_sample_rate = list(
            map(lambda file: convert_samplerate(file, desired_sample_rate)[1], input_list))
        audio = np.concatenate(input_list_correct_sample_rate)
        # cmb.build(input_list, "temp_folder/chunked_file_with_silence{}.wav".format(i), combine_type="concatenate")
        # fs, audio = convert_samplerate("temp_folder/chunked_file_with_silence{}.wav".format(i), desired_sample_rate)

        if return_speed_per_chunk:
            result.append(recognizer.stt(audio))
        else:
            result += recognizer.stt(audio)
        os.remove("temp_folder/chunked_file{}.wav".format(i))
        # os.remove("temp_folder/chunked_file_with_silence{}.wav".format(i))

    print(result)
    return result
def get_model(lang):
    ds_model = Model(DS_PARAM[lang]['model'])
    if DS_PARAM[lang].get('beam_width'):
        ds_model.setBeamWidth(DS_PARAM[lang]['beam_width'])
    if DS_PARAM[lang].get('scorer'):
        print('Loading scorer from files {}'.format(DS_PARAM[lang]['scorer']),
              file=sys.stderr)
        scorer_load_start = timer()
        ds_model.enableExternalScorer(DS_PARAM[lang]['scorer'])
        scorer_load_end = timer() - scorer_load_start
        print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

        if DS_PARAM[lang].get('lm_alpha') and DS_PARAM[lang].get('lm_beta'):
            ds_model.setScorerAlphaBeta(DS_PARAM[lang]['lm_alpha'],
                                        DS_PARAM[lang]['lm_beta'])
    return ds_model
def get_model(lang):
    ds_model = Model(DS_PARAM[lang]["model"])
    if DS_PARAM[lang].get("beam_width"):
        ds_model.setBeamWidth(DS_PARAM[lang]["beam_width"])
    if DS_PARAM[lang].get("scorer"):
        print(
            "Loading scorer from files {}".format(DS_PARAM[lang]["scorer"]),
            file=sys.stderr,
        )
        scorer_load_start = timer()
        ds_model.enableExternalScorer(DS_PARAM[lang]["scorer"])
        scorer_load_end = timer() - scorer_load_start
        print("Loaded scorer in {:.3}s.".format(scorer_load_end), file=sys.stderr)

        if DS_PARAM[lang].get("lm_alpha") and DS_PARAM[lang].get("lm_beta"):
            ds_model.setScorerAlphaBeta(DS_PARAM[lang]["lm_alpha"],
                                        DS_PARAM[lang]["lm_beta"])
    return ds_model
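Both get_model variants assume a module-level DS_PARAM registry keyed by language code. A hedged sketch of what that configuration might look like; the keys match what get_model() reads, but the paths and values are placeholders, not from the original source:

# Hypothetical configuration; keys match what get_model() reads.
DS_PARAM = {
    "en": {
        "model": "models/en/deepspeech-0.9.3-models.pbmm",
        "scorer": "models/en/deepspeech-0.9.3-models.scorer",
        "beam_width": 500,
        "lm_alpha": 0.93,
        "lm_beta": 1.18,
    },
}

en_model = get_model("en")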
def stt(model_path, audio, beam_width=None, scorer_path=None,
        lm_alpha=None, lm_beta=None, hot_words=None):
    ds = Model(model_path)
    if beam_width:
        ds.setBeamWidth(beam_width)
    desired_sample_rate = ds.sampleRate()
    if scorer_path:
        ds.enableExternalScorer(scorer_path)
        if lm_alpha and lm_beta:
            ds.setScorerAlphaBeta(lm_alpha, lm_beta)
    # TODO
    # if hot_words:
    #     print('Adding hot-words', file=sys.stderr)
    #     for w in hot_words:
    #         ds.addHotWord(w, 6.2)

    fin = wave.open(audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        print(f'ERROR: original sample rate ({fs_orig}) is different than {desired_sample_rate}hz.',
              file=sys.stderr)
        exit(1)
    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    fin.close()

    print('Running inference.', file=sys.stderr)
    res = ds.sttWithMetadata(audio, 1)
    res = postprocess_metadata(res)
    return res
def predict_speech_to_text(stream_file):
    alpha = 0.85
    beta = 1.75
    beam_width = 500

    # Initialize the model
    speech_model = Model(MODEL_PATH)
    # Enable the external language scorer to improve accuracy
    speech_model.enableExternalScorer(SCORER_PATH)
    # Set the beam width. A larger beam width gives better results at the cost of decoding time.
    speech_model.setBeamWidth(beam_width)
    # Set the scorer's language model weight (alpha) and word insertion weight (beta)
    speech_model.setScorerAlphaBeta(alpha, beta)

    # Use scipy to convert the wav file into a numpy array
    _, audio = wav.read(stream_file)
    return speech_model.stt(audio)
def process_input_file(conn, options, out_queue, background=True):
    """Given a socket/pipe, process audio input and push results to out_queue"""
    log.info("Starting recognition on %s", conn)
    model = Model(options.model)
    if options.beam_width:
        model.setBeamWidth(options.beam_width)
    desired_sample_rate = model.sampleRate()
    if desired_sample_rate != defaults.SAMPLE_RATE:
        log.error("Model expects rate of %s", desired_sample_rate)
    # if options.scorer:
    #     model.enableExternalScorer(options.scorer)
    # else:
    log.info("Disabling the built-in scorer")
    model.disableExternalScorer()
    out_queue.put({'partial': False, 'final': False, 'message': ['Connected']})
    if background:
        thread = threading.Thread(target=run_recognition, args=(model, conn, out_queue))
        thread.setDaemon(background)
        thread.start()
    else:
        run_recognition(model, conn, out_queue)
def process_input_file(conn, options, out_queue, background=True):
    # TODO: allow socket connections from *clients* to choose
    # the model rather than setting it in the daemon...
    # to be clear, *output* clients, not audio sinks
    log.info("Starting recognition on %s", conn)
    model = Model(options.model)
    if options.beam_width:
        model.setBeamWidth(options.beam_width)
    desired_sample_rate = model.sampleRate()
    if desired_sample_rate != defaults.SAMPLE_RATE:
        log.error("Model expects rate of %s", desired_sample_rate)
    if options.scorer:
        model.enableExternalScorer(options.scorer)
    else:
        log.info("Disabling the scorer")
        model.disableExternalScorer()
    if background:
        t = threading.Thread(target=run_recognition, args=(model, conn, out_queue))
        t.setDaemon(background)
        t.start()
    else:
        run_recognition(model, conn, out_queue)
def load_deepspeech_model(self):
    model = os.path.join(self.deepspeech_models_folder, "deepspeech-0.9.3-models.pbmm")
    scorer = os.path.join(self.deepspeech_models_folder, "deepspeech-0.9.3-models.scorer")
    lm_alpha = 0.93
    lm_beta = 1.18
    beam_width = 100

    model_load_start = timer()
    deepspeech_model = Model(model)
    model_load_end = timer() - model_load_start
    logger.debug("Loaded model in %0.3fs." % (model_load_end))

    scorer_load_start = timer()
    deepspeech_model.enableExternalScorer(scorer)
    deepspeech_model.setScorerAlphaBeta(lm_alpha, lm_beta)
    deepspeech_model.setBeamWidth(beam_width)
    scorer_load_end = timer() - scorer_load_start
    logger.debug("Loaded external scorer in %0.3fs." % (scorer_load_end))

    return deepspeech_model
def load(model, scorer, verbose=True, beam_width="", lm_alpha="", lm_beta="", hot_words=""):
    """Load model and scorer, returning the model and its expected sample rate."""
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    if verbose:
        print('\nLoading model from files {}'.format(model), file=sys.stderr)
        print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if beam_width:
        ds.setBeamWidth(beam_width)

    desired_sample_rate = ds.sampleRate()

    if scorer:
        if verbose:
            print('Loading scorer from files {}'.format(scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(scorer)
        scorer_load_end = timer() - scorer_load_start
        if verbose:
            print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

        if lm_alpha and lm_beta:
            ds.setScorerAlphaBeta(lm_alpha, lm_beta)

    if hot_words:
        if verbose:
            print('Adding hot-words', file=sys.stderr)
        for word_boost in hot_words.split(','):
            word, boost = word_boost.split(':')
            ds.addHotWord(word, float(boost))

    return ds, desired_sample_rate
def __init__(self):
    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    model_path = os.path.dirname(os.path.abspath(__file__))
    ds = Model(os.path.join(model_path, args.model))
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.beam_width:
        ds.setBeamWidth(args.beam_width)

    self.desired_sample_rate = ds.sampleRate()

    if args.scorer:
        print('Loading scorer from files {}'.format(args.scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(os.path.join(model_path, args.scorer))
        scorer_load_end = timer() - scorer_load_start
        print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    if args.hot_words:
        print('Adding hot-words', file=sys.stderr)
        for word_boost in args.hot_words.split(','):
            word, boost = word_boost.split(':')
            ds.addHotWord(word, float(boost))

    self.ds = ds
def setup_model(model_path, scorer, beam_width):
    log("creating model {} with scorer {}...".format(model_path, scorer))
    model = Model(model_path)
    if scorer.scorer is not None:
        model.enableExternalScorer(scorer.scorer)
        if scorer.lm_alpha is not None and scorer.lm_beta is not None:
            if model.setScorerAlphaBeta(scorer.lm_alpha, scorer.lm_beta) != 0:
                raise RuntimeError("Unable to set scorer parameters")
    if beam_width is not None:
        if model.setBeamWidth(beam_width) != 0:
            raise RuntimeError("Unable to set beam width")
    log("model is ready.")
    return model
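setup_model() expects a scorer argument exposing .scorer, .lm_alpha, and .lm_beta attributes, plus a log helper from the surrounding module. A minimal sketch of a compatible configuration object; the namedtuple and all values here are hypothetical, not from the original source:

from collections import namedtuple

# Hypothetical container; field names match what setup_model() reads.
ScorerConfig = namedtuple("ScorerConfig", ["scorer", "lm_alpha", "lm_beta"])

config = ScorerConfig(scorer="deepspeech-0.9.3-models.scorer",
                      lm_alpha=0.93, lm_beta=1.18)
model = setup_model("deepspeech-0.9.3-models.pbmm", config, beam_width=500)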
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer', required=False,
                        help='Path to the external scorer file')
    parser.add_argument('--prediction_in', required=True,
                        help='Path to the directory with sound files (mp3/ogg/wav) to analyze')
    parser.add_argument('--prediction_out', required=True,
                        help='Path to the directory for moving the processed sound files to')
    parser.add_argument('--prediction_tmp', required=False,
                        help='Path to the temp directory for storing the predictions initially before moving them to "--prediction_out"')
    parser.add_argument('--continuous', action='store_true', required=False, default=False,
                        help='Whether to continuously load test images and perform prediction')
    parser.add_argument('--delete_input', action='store_true', required=False, default=False,
                        help='Whether to delete the input files rather than move them to "--prediction_out" directory')
    parser.add_argument('--beam_width', type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha', type=float,
                        help='Language model weight (lm_alpha). If not specified, use default from the scorer package.')
    parser.add_argument('--lm_beta', type=float,
                        help='Word insertion bonus (lm_beta). If not specified, use default from the scorer package.')
    parser.add_argument('--json', required=False, action='store_true',
                        help='Output json from metadata with timestamp of each word')
    parser.add_argument('--candidate_transcripts', type=int, default=3,
                        help='Number of candidate transcripts to include in JSON output')
    parser.add_argument('--normalize', required=False, action='store_true',
                        help='Whether to apply standard amplitude normalization')
    parsed = parser.parse_args()

    print('Loading model from file {}'.format(parsed.model))
    ds = Model(parsed.model)
    if parsed.beam_width:
        ds.setBeamWidth(parsed.beam_width)
    if parsed.scorer:
        print('Loading scorer from file {}'.format(parsed.scorer))
        ds.enableExternalScorer(parsed.scorer)
        if parsed.lm_alpha and parsed.lm_beta:
            ds.setScorerAlphaBeta(parsed.lm_alpha, parsed.lm_beta)

    process(model=ds, prediction_in=parsed.prediction_in,
            prediction_out=parsed.prediction_out,
            prediction_tmp=parsed.prediction_tmp,
            continuous=parsed.continuous,
            delete_input=parsed.delete_input,
            json=parsed.json,
            candidate_transcripts=parsed.candidate_transcripts,
            normalize=parsed.normalize)
parser.add_argument('--model', required=True,
                    help='Path to the .pbmm file')
parser.add_argument('--scorer', required=False,
                    help='Path to the .scorer file')
parser.add_argument('--beam_width', type=int, default=500,
                    help='Beam width for the CTC decoder')
parser.add_argument('--port', type=int, default=8008,
                    help='The port number to listen on')
args = parser.parse_args()

# Load in the model
logging.info("Loading model from %s" % args.model)
model = Model(args.model)

# Configure it
model.setBeamWidth(args.beam_width)
if args.scorer:
    logging.info("Loading scorer from %s" % (args.scorer,))
    model.enableExternalScorer(args.scorer)

# Set up the server socket
logging.info("Opening socket on port %d" % (args.port,))
sckt = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sckt.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
sckt.bind(('0.0.0.0', args.port))
sckt.listen(5)

# Do this forever
while True:
    try:
        # Get a connection
def app_sst(model_path: str, lm_path: str, lm_alpha: float, lm_beta: float, beam: int):
    webrtc_ctx = webrtc_streamer(
        key="speech-to-text",
        mode=WebRtcMode.SENDONLY,
        audio_receiver_size=1024,
        rtc_configuration={"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]},
        media_stream_constraints={"video": False, "audio": True},
    )

    status_indicator = st.empty()

    if not webrtc_ctx.state.playing:
        return

    status_indicator.write("Loading...")
    text_output = st.empty()
    stream = None

    while True:
        if webrtc_ctx.audio_receiver:
            if stream is None:
                from deepspeech import Model

                model = Model(model_path)
                model.enableExternalScorer(lm_path)
                model.setScorerAlphaBeta(lm_alpha, lm_beta)
                model.setBeamWidth(beam)

                stream = model.createStream()

                status_indicator.write("Model loaded.")

            sound_chunk = pydub.AudioSegment.empty()
            try:
                audio_frames = webrtc_ctx.audio_receiver.get_frames(timeout=1)
            except queue.Empty:
                time.sleep(0.1)
                status_indicator.write("No frame arrived.")
                continue

            status_indicator.write("Running. Say something!")

            for audio_frame in audio_frames:
                sound = pydub.AudioSegment(
                    data=audio_frame.to_ndarray().tobytes(),
                    sample_width=audio_frame.format.bytes,
                    frame_rate=audio_frame.sample_rate,
                    channels=len(audio_frame.layout.channels),
                )
                sound_chunk += sound

            if len(sound_chunk) > 0:
                sound_chunk = sound_chunk.set_channels(1).set_frame_rate(model.sampleRate())
                buffer = np.array(sound_chunk.get_array_of_samples())
                stream.feedAudioContent(buffer)
                text = stream.intermediateDecode()
                text_output.markdown(f"**Text:** {text}")
        else:
            status_indicator.write("AudioReceiver is not set. Aborting.")
            break
def app_sst_with_video(model_path: str, lm_path: str, lm_alpha: float, lm_beta: float, beam: int):
    class AudioProcessor(AudioProcessorBase):
        frames_lock: threading.Lock
        frames: deque

        def __init__(self) -> None:
            self.frames_lock = threading.Lock()
            self.frames = deque([])

        async def recv_queued(self, frames: List[av.AudioFrame]) -> av.AudioFrame:
            with self.frames_lock:
                self.frames.extend(frames)

            # Return empty frames to be silent.
            new_frames = []
            for frame in frames:
                input_array = frame.to_ndarray()
                new_frame = av.AudioFrame.from_ndarray(
                    np.zeros(input_array.shape, dtype=input_array.dtype),
                    layout=frame.layout.name,
                )
                new_frame.sample_rate = frame.sample_rate
                new_frames.append(new_frame)
            return new_frames

    webrtc_ctx = webrtc_streamer(
        key="speech-to-text-w-video",
        mode=WebRtcMode.SENDRECV,
        audio_processor_factory=AudioProcessor,
        rtc_configuration={"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]},
        media_stream_constraints={"video": True, "audio": True},
    )

    status_indicator = st.empty()

    if not webrtc_ctx.state.playing:
        return

    status_indicator.write("Loading...")
    text_output = st.empty()
    stream = None

    while True:
        if webrtc_ctx.audio_processor:
            if stream is None:
                from deepspeech import Model

                model = Model(model_path)
                model.enableExternalScorer(lm_path)
                model.setScorerAlphaBeta(lm_alpha, lm_beta)
                model.setBeamWidth(beam)

                stream = model.createStream()

                status_indicator.write("Model loaded.")

            sound_chunk = pydub.AudioSegment.empty()

            audio_frames = []
            with webrtc_ctx.audio_processor.frames_lock:
                while len(webrtc_ctx.audio_processor.frames) > 0:
                    frame = webrtc_ctx.audio_processor.frames.popleft()
                    audio_frames.append(frame)

            if len(audio_frames) == 0:
                time.sleep(0.1)
                status_indicator.write("No frame arrived.")
                continue

            status_indicator.write("Running. Say something!")

            for audio_frame in audio_frames:
                sound = pydub.AudioSegment(
                    data=audio_frame.to_ndarray().tobytes(),
                    sample_width=audio_frame.format.bytes,
                    frame_rate=audio_frame.sample_rate,
                    channels=len(audio_frame.layout.channels),
                )
                sound_chunk += sound

            if len(sound_chunk) > 0:
                sound_chunk = sound_chunk.set_channels(1).set_frame_rate(model.sampleRate())
                buffer = np.array(sound_chunk.get_array_of_samples())
                stream.feedAudioContent(buffer)
                text = stream.intermediateDecode()
                text_output.markdown(f"**Text:** {text}")
        else:
            status_indicator.write("AudioProcessor is not set. Aborting.")
            break
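Both Streamlit functions above use DeepSpeech's streaming API. Stripped of the WebRTC plumbing, the pattern reduces to createStream / feedAudioContent / intermediateDecode / finishStream. A minimal sketch, assuming a 16 kHz mono 16-bit WAV file and a placeholder model path:

import wave
import numpy as np
from deepspeech import Model

model = Model("deepspeech-0.9.3-models.pbmm")   # placeholder path

stream = model.createStream()
with wave.open("audio.wav", "rb") as fin:       # assumed 16 kHz mono 16-bit WAV
    while True:
        data = fin.readframes(4096)
        if not data:
            break
        stream.feedAudioContent(np.frombuffer(data, np.int16))
        print(stream.intermediateDecode())      # partial transcript so far
print(stream.finishStream())                    # final transcript; the stream is consumed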
# Word insertion bonus (lm_beta). If not specified, use default from the scorer package.
lm_beta = None  # float

# Hot-words and their boosts.
hot_words = None  # str

print('Loading model from file {}'.format(model))
model_load_start = timer()
# sphinx-doc: python_ref_model_start
ds = Model(model)
# sphinx-doc: python_ref_model_stop
model_load_end = timer() - model_load_start
print('Loaded model in {:.3}s.'.format(model_load_end))

if beam_width:
    ds.setBeamWidth(beam_width)

desired_sample_rate = ds.sampleRate()

if scorer:
    print('Loading scorer from files {}'.format(scorer))
    scorer_load_start = timer()
    ds.enableExternalScorer(scorer)
    scorer_load_end = timer() - scorer_load_start
    print('Loaded scorer in {:.3}s.'.format(scorer_load_end))

if lm_alpha and lm_beta:
    print("Set Scorer Alpha and Beta")
    ds.setScorerAlphaBeta(lm_alpha, lm_beta)

if hot_words:
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer', required=False,
                        help='Path to the external scorer file')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--beam_width', type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha', type=float,
                        help='Language model weight (lm_alpha). If not specified, use default from the scorer package.')
    parser.add_argument('--lm_beta', type=float,
                        help='Word insertion bonus (lm_beta). If not specified, use default from the scorer package.')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    parser.add_argument('--json', required=False, action='store_true',
                        help='Output json from metadata with timestamp of each word')
    parser.add_argument('--candidate_transcripts', type=int, default=3,
                        help='Number of candidate transcripts to include in JSON output')
    args = parser.parse_args()

    # print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(args.model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    # print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.beam_width:
        ds.setBeamWidth(args.beam_width)

    desired_sample_rate = ds.sampleRate()

    if args.scorer:
        # print('Loading scorer from files {}'.format(args.scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(args.scorer)
        scorer_load_end = timer() - scorer_load_start
        # print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    fin = wave.open(args.audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        # print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs_orig, desired_sample_rate), file=sys.stderr)
        fs_new, audio = convert_samplerate(args.audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1 / fs_orig)
    fin.close()

    # print('Running inference.', file=sys.stderr)
    inference_start = timer()
    # sphinx-doc: python_ref_inference_start
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
    elif args.json:
        print(metadata_json_output(ds.sttWithMetadata(audio, args.candidate_transcripts)))
    else:
        print("Translation: " + ds.stt(audio))
    # sphinx-doc: python_ref_inference_stop
    inference_end = timer() - inference_start
def record_voice_and_predict_text(self):
    """Record speech from the microphone and predict its text."""
    # Record the speech
    stream_file_name = 'AudioFile/speech_stream.wav'
    stream_format = pyaudio.paInt16  # Sampling size and format
    no_of_channels = 1               # Number of audio channels
    sampling_rate = 16000            # Sampling rate in Hertz
    frames_count = 1024              # Number of frames per buffer
    record_seconds = 5

    stream = pyaudio.PyAudio()
    stream_data = stream.open(format=stream_format,
                              channels=no_of_channels,
                              rate=sampling_rate,
                              input=True,
                              frames_per_buffer=frames_count)
    frames = [
        stream_data.read(frames_count)
        for i in range(0, int(sampling_rate / frames_count * record_seconds))
    ]
    stream_data.stop_stream()
    stream_data.close()
    stream.terminate()

    wave_file = wave.open(stream_file_name, 'wb')
    wave_file.setnchannels(no_of_channels)
    wave_file.setsampwidth(stream.get_sample_size(stream_format))
    wave_file.setframerate(sampling_rate)
    wave_file.writeframes(b''.join(frames))
    wave_file.close()
    try:
        self.label_info.setText('Recording completed.')
    except Exception:
        pass

    # Text prediction part
    alpha = 0.75
    beta = 1.85
    beam_width = 500

    # Initialize the model
    speech_model = Model(MODEL_PATH)
    # Set the beam width. A larger beam width gives better results at the cost of decoding time.
    speech_model.setBeamWidth(beam_width)
    # Enable the external language scorer to improve accuracy
    speech_model.enableExternalScorer(SCORER_PATH)
    # Set hyperparameters alpha and beta of the external scorer:
    # alpha is the language model weight, beta the word insertion weight.
    speech_model.setScorerAlphaBeta(alpha, beta)

    # Use scipy to convert the wav file into a numpy array
    _, audio = wav.read(stream_file_name)
    text = speech_model.stt(audio)
    try:
        self.text_pred.setText(text)
    except Exception:
        pass
    show_images(text)
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer', required=False,
                        help='Path to the external scorer file')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--beam_width', type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha', type=float,
                        help='Language model weight (lm_alpha). If not specified, use default from the scorer package.')
    parser.add_argument('--lm_beta', type=float,
                        help='Word insertion bonus (lm_beta). If not specified, use default from the scorer package.')
    # parser.add_argument('--version', action=VersionAction,
    #                     help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    parser.add_argument('--json', required=False, action='store_true',
                        help='Output json from metadata with timestamp of each word')
    parser.add_argument('--candidate_transcripts', type=int, default=3,
                        help='Number of candidate transcripts to include in JSON output')
    parser.add_argument('--hot_words', type=str,
                        help='Hot-words and their boosts.')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(args.model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.beam_width:
        ds.setBeamWidth(args.beam_width)

    desired_sample_rate = ds.sampleRate()

    if args.scorer:
        print('Loading scorer from files {}'.format(args.scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(args.scorer)
        scorer_load_end = timer() - scorer_load_start
        print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    if args.hot_words:
        print('Adding hot-words', file=sys.stderr)
        for word_boost in args.hot_words.split(','):
            word, boost = word_boost.split(':')
            ds.addHotWord(word, float(boost))

    fin = wave.open(args.audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs_orig, desired_sample_rate), file=sys.stderr)
        fs_new, audio = convert_samplerate(args.audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1 / fs_orig)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    # sphinx-doc: python_ref_inference_start
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
    elif args.json:
        print(metadata_json_output(ds.sttWithMetadata(audio, args.candidate_transcripts)))
    else:
        print(ds.stt(audio))

    test = ds.createStream().sentencefit(audio, "ka arohia katoatia te hāhi me ōna whakapono e te hapū o ōtākou")
    for t in test.tokens:
        print(f"letter: {t.letter}, confidence: {t.confidence}, timestep: {t.timestep}")
    # sphinx-doc: python_ref_inference_stop
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
class DeepSpeech():
    def __init__(self, model_path, scorer_path, result_json_path, result_txt_path,
                 candidate_transcripts=3, beam_width=None):
        # Path to the Speech-To-Text model
        self.MODEL_PATH = model_path
        # Path to the scorer language model
        self.SCORER_PATH = scorer_path
        # The number of candidate transcripts to produce
        self.CANDIDATE_TRANSCRIPTS = candidate_transcripts
        self.result_json_path = result_json_path
        self.result_txt_path = result_txt_path
        self.beam_width = beam_width
        self._setup()

    def _setup(self):
        self.ds = Model(self.MODEL_PATH)  # Instantiate the model object
        # Set desired sample rate for the STT model.
        self.sample_rate = '16000'
        if self.beam_width:
            self.ds.setBeamWidth(self.beam_width)
        if self.SCORER_PATH:
            self.ds.enableExternalScorer(self.SCORER_PATH)

    def convert_samplerate(self, audio_path, desired_sample_rate):
        sox_cmd = ('sox {} --type raw --bits 16 --channels 1 --rate {} '
                   '--encoding signed-integer --endian little '
                   '--compression 0.0 --no-dither - ').format(
                       quote(audio_path), desired_sample_rate)
        try:
            output = subprocess.check_output(shlex.split(sox_cmd),
                                             stderr=subprocess.PIPE)
        except subprocess.CalledProcessError as e:
            raise RuntimeError('SoX returned non-zero status: {}'.format(e.stderr))
        except OSError as e:
            raise OSError(e.errno,
                          'SoX not found, use {}hz files or install it: {}'.format(
                              desired_sample_rate, e.strerror))
        return desired_sample_rate, np.frombuffer(output, np.int16)

    def words_from_candidate_transcript(self, metadata):
        word = ""
        word_list = []
        word_start_time = 0
        # Loop through each character
        for i, token in enumerate(metadata.tokens):
            # Append character to word if it's not a space
            if token.text != " ":
                if len(word) == 0:
                    # Log the start time of the new word
                    word_start_time = token.start_time
                word = word + token.text
            # Word boundary is either a space or the last character in the array
            if token.text == " " or i == len(metadata.tokens) - 1:
                word_duration = token.start_time - word_start_time
                if word_duration < 0:
                    word_duration = 0
                each_word = dict()
                each_word["word"] = word
                each_word["start_time"] = round(word_start_time, 4)
                each_word["duration"] = round(word_duration, 4)
                word_list.append(each_word)
                # Reset
                word = ""
                word_start_time = 0
        return word_list

    def metadata_json_output(self, metadata):
        json_result = dict()
        json_result["transcripts"] = [{
            "confidence": transcript.confidence,
            "words": self.words_from_candidate_transcript(transcript),
        } for transcript in metadata.transcripts]
        return json.dumps(json_result, indent=4)

    def take_audio_info(self):
        probe = ffmpeg.probe(self.FILE_PATH)
        self.audio_info = next(
            (stream for stream in probe['streams'] if stream['codec_type'] == 'audio'),
            None)
        print(self.audio_info)
        return self.audio_info

    def take_audio(self):
        out, err = (
            ffmpeg
            .input(self.FILE_PATH)
            .output('-', format='s16le', acodec='pcm_s16le', ac=1, ar=self.sample_rate)
            .run(capture_stdout=True, capture_stderr=True)
        )
        self.audio = np.frombuffer(out, np.int16)
        return self.audio

    def speech2text(self):
        metadata = self.ds.sttWithMetadata(self.audio, self.CANDIDATE_TRANSCRIPTS)
        json_result = self.metadata_json_output(metadata)
        with open(self.result_json_path, 'w') as outfile:
            outfile.write(json_result)
        dict_result = json.loads(json_result)
        word_list = [item["word"] for item in dict_result["transcripts"][0]["words"]]
        sentence = " ".join(word_list)
        self.export2textfile(sentence)
        return sentence

    def export2textfile(self, sentence):
        with open(self.result_txt_path, "w") as txt_file:
            txt_file.writelines(sentence)

    def set_file(self, filepath):
        self.FILE_PATH = filepath
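A minimal usage sketch for this class; the file names are placeholders, and any media file ffmpeg can decode should work as input:

# Placeholder paths; substitute your own model, scorer, and media files.
ds = DeepSpeech(model_path="deepspeech-0.9.3-models.pbmm",
                scorer_path="deepspeech-0.9.3-models.scorer",
                result_json_path="result.json",
                result_txt_path="result.txt")
ds.set_file("speech.mp4")
ds.take_audio()          # decode to 16 kHz mono int16 via ffmpeg
print(ds.speech2text())  # writes result.json / result.txt and returns the sentence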
    p.terminate()
    print(" Recording complete.")
    audio_data = (np.frombuffer(b''.join(frames), dtype=np.int16) / 32767)
    bg_data = (np.frombuffer(b''.join(frames_bg), dtype=np.int16) / 32767)
    # denoised_data = removeNoise(audio_data, bg_data)  # .astype('float32')
    return audio_data  # denoised_data


####### DeepSpeech Voice-To-Text Parameters ########
DS_FOLDER = 'deepspeech_data'
if not os.path.exists(DS_FOLDER):
    os.mkdir(DS_FOLDER)

DS_model_file_path = 'deepspeech_data/deepspeech-0.7.4-models.pbmm'
beam_width = 500
DS_model = Model(DS_model_file_path)
DS_model.setBeamWidth(beam_width)
DS_model.enableExternalScorer('deepspeech_data/deepspeech-0.7.4-models.scorer')


def get_text(data, model=DS_model):
    """
    Transcribe text from audio.

    data: audio data as an array read from librosa with sampling rate 16000.
    model: DeepSpeech ASR model.
    """
    # y, s = librosa.load(fpath, sr=16000)
    y = (data * 32767).astype('int16')
    text = model.stt(y)
    return text
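Per its docstring, get_text() expects float audio as loaded by librosa at 16 kHz and scales it back to int16 internally. A hedged usage sketch; the file name is a placeholder:

import librosa

y, sr = librosa.load("speech.wav", sr=16000)  # float32 samples in [-1, 1]
print(get_text(y))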
from deepspeech import Model
import gradio as gr
import numpy as np

model_file_path = "deepspeech-0.8.2-models.pbmm"
lm_file_path = "deepspeech-0.8.2-models.scorer"
beam_width = 100
lm_alpha = 0.93
lm_beta = 1.18

model = Model(model_file_path)
model.enableExternalScorer(lm_file_path)
model.setScorerAlphaBeta(lm_alpha, lm_beta)
model.setBeamWidth(beam_width)


def reformat_freq(sr, y):
    if sr not in (
        48000,
        16000,
    ):  # DeepSpeech only supports 16k (we convert 48k -> 16k)
        raise ValueError("Unsupported rate", sr)
    if sr == 48000:
        y = (
            ((y / max(np.max(y), 1)) * 32767)
            .reshape((-1, 3))
            .mean(axis=1)
            .astype("int16")
        )
        sr = 16000
    return sr, y


def transcribe(speech, stream):
    _, y = reformat_freq(*speech)