def modelResult(self, dirName):
    # Fetch and resolve all the paths of model files
    output_graph, alphabet, lm, trie = wavTranscriber.resolve_models(dirName)
    # Load output_graph, alphabet, lm and trie
    self.model = wavTranscriber.load_model(output_graph, alphabet, lm, trie)

def test_ds_inst(model_path):
    # threadName = threading.currentThread().name
    processName = multiprocessing.current_process().name
    sample_rate = 16000

    # Point to a path containing the pre-trained models & resolve ~ if used
    dirName = os.path.expanduser(model_path)

    # Resolve all the paths of model files
    output_graph, alphabet, lm, trie = wavTranscriber.resolve_models(dirName)

    # Load output_graph, alphabet, lm and trie
    model_retval = wavTranscriber.load_model(output_graph, alphabet, lm, trie)
    # print(threadName + ': Model loaded . . . ')
    _model_loded.put(processName)

    run_ds = False
    with _read_write_i_audio_sema:
        run_ds = check_i_audio()

    while run_ds:
        # audio, sample_rate, audio_length = wavTranscriber.read_wave('audio.wav')
        with _fetch_audio_sema:
            audio = fetch_audio()
        # print(threadName + ':' + 'audio_queue_after_get:' + str(audio_queue.qsize()))
        audio = np.frombuffer(audio, dtype=np.int16)
        output = wavTranscriber.stt(model_retval[0], audio, sample_rate)
        # print(threadName + ':' + output[0])
        print('Inference took %0.3fs for %0.3fs audio file.' % (output[2], output[1]))
        print(i_audio, n_audio)
        with _read_write_audio_length_sema:
            audio_length_queue.put(output[1])
        with _read_write_inference_time_sema:
            inference_time_queue.put(output[2])
        with _read_write_i_audio_sema:
            run_ds = check_i_audio()

    print(processName + ' end')

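# A minimal launcher sketch for test_ds_inst (an illustration, not part of the
# original): it assumes the module-level queues and semaphores the worker uses
# (_model_loded, _fetch_audio_sema, and the rest) are created before the
# processes start. The worker count and model path below are placeholders.
import multiprocessing

if __name__ == '__main__':
    placeholder_model_path = '~/deepspeech-models'  # hypothetical path
    workers = [multiprocessing.Process(target=test_ds_inst,
                                       args=(placeholder_model_path,))
               for _ in range(2)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
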
def modelResult(self, dirName):
    # Fetch and resolve all the paths of model files
    output_graph, scorer = wavTranscriber.resolve_models(dirName)
    # Load output_graph and scorer
    self.model = wavTranscriber.load_model(output_graph, scorer)

def transcriptionProcess(aggressive, audioLoc, modelLoc):
    logging.debug("Transcribing audio file: %s" % audioLoc)

    # Point to a path containing the pre-trained models & resolve ~ if used
    dirName = os.path.expanduser(modelLoc)
    logging.debug("dirName: %s" % dirName)

    # Resolve all the paths of model files
    output_graph, scorer = wavTranscriber.resolve_models(dirName)

    # Load output_graph and scorer
    model_retval = wavTranscriber.load_model(output_graph, scorer)

    transcripts = ""
    if audioLoc is not None:
        title_names = ['Filename', 'Duration(s)', 'Inference Time(s)',
                       'Model Load Time(s)', 'Scorer Load Time(s)']
        print("\n%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1],
                                                title_names[2], title_names[3],
                                                title_names[4]))

        inference_time = 0.0

        # Run VAD on the input file
        waveFile = audioLoc
        segments, sample_rate, audio_length = wavTranscriber.vad_segment_generator(waveFile, aggressive)
        transcript_path = os.path.splitext(waveFile)[0] + ".txt"
        f = open(transcript_path, 'w')
        logging.debug("Saving Transcript @: %s" % transcript_path)

        for i, segment in enumerate(segments):
            # Run deepspeech on the chunk that just completed VAD
            logging.debug("Processing chunk %002d" % (i,))
            audio = np.frombuffer(segment, dtype=np.int16)
            output = wavTranscriber.stt(model_retval[0], audio, sample_rate)
            inference_time += output[1]
            logging.debug("Transcript: %s" % output[0])
            transcripts += output[0]
            f.write(output[0] + " ")

        # Summary of the files processed
        f.close()

        # Extract filename from the full file path
        filename, ext = os.path.split(os.path.basename(waveFile))
        logging.debug("************************************************************************************************************")
        logging.debug("%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1],
                                                      title_names[2], title_names[3],
                                                      title_names[4]))
        logging.debug("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length,
                                                                inference_time, model_retval[1],
                                                                model_retval[2]))
        logging.debug("************************************************************************************************************")
        print("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length,
                                                        inference_time, model_retval[1],
                                                        model_retval[2]))
    return transcripts

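# Hedged usage example for transcriptionProcess (the paths and aggressiveness
# value are placeholders, not from the original):
transcript = transcriptionProcess(aggressive=1,
                                  audioLoc='recording.wav',
                                  modelLoc='~/deepspeech-models')
print(transcript)
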
def main(args):
    parser = argparse.ArgumentParser(description='Transcribe long audio files using webRTC VAD or use the streaming interface')
    parser.add_argument('--aggressive', type=int, choices=range(4), required=False,
                        help='Determines how aggressively non-speech is filtered out. (Integer between 0-3)')
    parser.add_argument('--audio', required=False,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--model', required=True,
                        help='Path to directory that contains all model files (output_graph, lm and trie)')
    parser.add_argument('--stream', required=False, action='store_true',
                        help='To use deepspeech streaming interface')
    args = parser.parse_args()

    if args.stream is True:
        print("Opening mic for streaming")
    elif args.audio is not None:
        logging.debug("Transcribing audio file @ %s" % args.audio)
    else:
        parser.print_help()
        parser.exit()

    # Point to a path containing the pre-trained models & resolve ~ if used
    dirName = os.path.expanduser(args.model)

    # Resolve all the paths of model files
    output_graph, lm, trie = wavTranscriber.resolve_models(dirName)

    # Load output_graph, lm and trie
    model_retval = wavTranscriber.load_model(output_graph, lm, trie)

    if args.audio is not None:
        title_names = ['Filename', 'Duration(s)', 'Inference Time(s)',
                       'Model Load Time(s)', 'LM Load Time(s)']
        print("\n%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1],
                                                title_names[2], title_names[3],
                                                title_names[4]))

        inference_time = 0.0

        # Run VAD on the input file
        waveFile = args.audio
        segments, sample_rate, audio_length = wavTranscriber.vad_segment_generator(waveFile, args.aggressive)
        transcript_path = os.path.splitext(waveFile)[0] + ".txt"
        f = open(transcript_path, 'w')
        logging.debug("Saving Transcript @: %s" % transcript_path)

        for i, segment in enumerate(segments):
            # Run deepspeech on the chunk that just completed VAD
            logging.debug("Processing chunk %002d" % (i,))
            audio = np.frombuffer(segment, dtype=np.int16)
            output = wavTranscriber.stt(model_retval[0], audio, sample_rate)
            inference_time += output[1]
            logging.debug("Transcript: %s" % output[0])
            f.write(output[0] + " ")

        # Summary of the files processed
        f.close()

        # Extract filename from the full file path
        filename, ext = os.path.split(os.path.basename(waveFile))
        logging.debug("************************************************************************************************************")
        logging.debug("%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1],
                                                      title_names[2], title_names[3],
                                                      title_names[4]))
        logging.debug("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length,
                                                                inference_time, model_retval[1],
                                                                model_retval[2]))
        logging.debug("************************************************************************************************************")
        print("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length,
                                                        inference_time, model_retval[1],
                                                        model_retval[2]))
    else:
        sctx = model_retval[0].createStream()
        subproc = subprocess.Popen(shlex.split('rec -q -V0 -e signed -L -c 1 -b 16 -r 16k -t raw - gain -2'),
                                   stdout=subprocess.PIPE,
                                   bufsize=0)
        print('You can start speaking now. Press Control-C to stop recording.')

        try:
            while True:
                data = subproc.stdout.read(512)
                model_retval[0].feedAudioContent(sctx, np.frombuffer(data, np.int16))
        except KeyboardInterrupt:
            print('Transcription: ', model_retval[0].finishStream(sctx))
            subproc.terminate()
            subproc.wait()

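# The streaming branch above pipes raw 16-bit, 16 kHz mono samples from SoX's
# `rec` into the model in 512-byte chunks. The same stream API should work
# against a file of raw samples; a sketch under that assumption ('audio.raw'
# is a placeholder, ds_model is the object returned in model_retval[0]):
import numpy as np

def stream_from_raw_file(ds_model, path='audio.raw', chunk=512):
    sctx = ds_model.createStream()
    with open(path, 'rb') as fin:
        while True:
            data = fin.read(chunk)
            if not data:
                break
            ds_model.feedAudioContent(sctx, np.frombuffer(data, np.int16))
    return ds_model.finishStream(sctx)
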
def main(args):
    parser = argparse.ArgumentParser(description='Transcribe long audio files using webRTC VAD or use the streaming interface')
    parser.add_argument('--aggressive', type=int, choices=range(4), required=False,
                        help='Determines how aggressively non-speech is filtered out. (Integer between 0-3)')
    parser.add_argument('--audio', required=False,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--model', required=True,
                        help='Path to directory that contains all model files (output_graph, lm, trie and alphabet)')
    parser.add_argument('--stream', required=False, action='store_true',
                        help='To use deepspeech streaming interface')
    args = parser.parse_args()

    if args.stream is True and len(sys.argv[1:]) == 3:
        print("Opening mic for streaming")
    elif args.audio is not None and len(sys.argv[1:]) == 6:
        logging.debug("Transcribing audio file @ %s" % args.audio)
    else:
        parser.print_help()
        parser.exit()

    # Point to a path containing the pre-trained models & resolve ~ if used
    dirName = os.path.expanduser(args.model)

    # Resolve all the paths of model files
    output_graph, alphabet, lm, trie = wavTranscriber.resolve_models(dirName)

    # Load output_graph, alphabet, lm and trie
    model_retval = wavTranscriber.load_model(output_graph, alphabet, lm, trie)

    if args.audio is not None:
        title_names = ['Filename', 'Duration(s)', 'Inference Time(s)',
                       'Model Load Time(s)', 'LM Load Time(s)']
        print("\n%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1],
                                                title_names[2], title_names[3],
                                                title_names[4]))

        inference_time = 0.0

        # Run VAD on the input file
        waveFile = args.audio
        segments, sample_rate, audio_length = wavTranscriber.vad_segment_generator(waveFile, args.aggressive)
        transcript_path = os.path.splitext(waveFile)[0] + ".txt"
        f = open(transcript_path, 'w')
        logging.debug("Saving Transcript @: %s" % transcript_path)

        for i, segment in enumerate(segments):
            # Run deepspeech on the chunk that just completed VAD
            logging.debug("Processing chunk %002d" % (i,))
            audio = np.frombuffer(segment, dtype=np.int16)
            output = wavTranscriber.stt(model_retval[0], audio, sample_rate)
            inference_time += output[1]
            logging.debug("Transcript: %s" % output[0])
            f.write(output[0] + " ")

        # Summary of the files processed
        f.close()

        # Extract filename from the full file path
        filename, ext = os.path.split(os.path.basename(waveFile))
        logging.debug("************************************************************************************************************")
        logging.debug("%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1],
                                                      title_names[2], title_names[3],
                                                      title_names[4]))
        logging.debug("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length,
                                                                inference_time, model_retval[1],
                                                                model_retval[2]))
        logging.debug("************************************************************************************************************")
        print("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length,
                                                        inference_time, model_retval[1],
                                                        model_retval[2]))
    else:
        sctx = model_retval[0].setupStream()
        subproc = subprocess.Popen(shlex.split('rec -q -V0 -e signed -L -c 1 -b 16 -r 16k -t raw - gain -2'),
                                   stdout=subprocess.PIPE,
                                   bufsize=0)
        print('You can start speaking now. Press Control-C to stop recording.')

        try:
            while True:
                data = subproc.stdout.read(512)
                model_retval[0].feedAudioContent(sctx, np.frombuffer(data, np.int16))
        except KeyboardInterrupt:
            print('Transcription: ', model_retval[0].finishStream(sctx))
            subproc.terminate()
            subproc.wait()

def main(args):
    parser = argparse.ArgumentParser(description='Transcribe long audio files using webRTC VAD or use the streaming interface')
    parser.add_argument('audio', type=str,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--aggressive', type=int, choices=range(4), required=False,
                        help='Determines how aggressively non-speech is filtered out. (Integer between 0-3)')
    parser.add_argument('--model', required=False,
                        help='Path to directory that contains all model files (output_graph, lm, trie and alphabet)')
    args = parser.parse_args()

    # Loading model
    model_dir = os.path.expanduser(args.model if args.model else 'models/en')
    output_graph, alphabet, lm, trie = wavTranscriber.resolve_models(model_dir)
    model = wavTranscriber.load_model(output_graph, alphabet, lm, trie)

    title_names = ['Filename', 'Duration(s)', 'Inference Time(s)',
                   'Model Load Time(s)', 'LM Load Time(s)']
    print("\n%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1],
                                            title_names[2], title_names[3],
                                            title_names[4]))

    inference_time = 0.0

    # Run VAD on the input file
    wave_file = args.audio
    aggressiveness = args.aggressive if args.aggressive is not None else 3
    segments, sample_rate, audio_length = wavTranscriber.vad_segment_generator(wave_file, aggressiveness)
    transcript_path = os.path.splitext(wave_file)[0] + ".txt"
    f = open(transcript_path, 'w')
    logging.debug("Saving Transcript @: %s" % transcript_path)

    for i, segment in enumerate(segments):
        # Run deepspeech on the chunk that just completed VAD
        logging.debug("Processing chunk %002d" % (i,))
        audio = np.frombuffer(segment, dtype=np.int16)
        output = wavTranscriber.stt(model[0], audio, sample_rate)
        inference_time += output[1]
        logging.debug("Transcript: %s" % output[0])
        f.write(output[0] + " ")

    # Summary of the files processed
    f.close()

    # Extract filename from the full file path
    filename, ext = os.path.split(os.path.basename(wave_file))
    logging.debug("************************************************************************************************************")
    logging.debug("%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1],
                                                  title_names[2], title_names[3],
                                                  title_names[4]))
    logging.debug("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length,
                                                            inference_time, model[1], model[2]))
    logging.debug("************************************************************************************************************")
    print("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length,
                                                    inference_time, model[1], model[2]))

def main(args):
    parser = argparse.ArgumentParser()
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    args = parser.parse_args()

    # Run VAD on the input file
    waveFile = args.audio
    logger.debug("Loading wav file %s" % waveFile)
    segments, sample_rate, audio_length = wavTranscriber.vad_segment_generator(waveFile, 3)

    logger.debug("Processing speaker diarization")
    segments = speaker_diarization.diarize(segments)

    f = open(waveFile.replace(".wav", ".txt"), 'w')

    logger.debug("Processing speech recognition")
    # Point to a path containing the pre-trained models & resolve ~ if used
    model_path = os.path.expanduser("deepspeech/models/")
    # Resolve all the paths of model files
    output_graph, alphabet, lm, trie = wavTranscriber.resolve_models(model_path)
    # Load output_graph, alphabet, lm and trie
    model_retval = wavTranscriber.load_model(output_graph, alphabet, lm, trie)

    inference_time = 0.0
    for i, segment in enumerate(segments):
        # Run deepspeech on the chunk that just completed VAD
        logger.debug("[Speech recognition] Processing chunk %002d" % (i,))
        audio = np.frombuffer(segment.bytes, dtype=np.int16)
        output = wavTranscriber.stt(model_retval[0], audio, sample_rate)
        inference_time += output[1]
        logger.debug("[Speech recognition] Transcript: %s" % output[0])
        f.write("%s - %s Speaker %s: %s\n" %
                (str(datetime.timedelta(seconds=round(segment.begin, 3)))[:-3],
                 str(datetime.timedelta(seconds=round(segment.end, 3)))[:-3],
                 segment.speaker, output[0]))

    # Summary of the files processed
    f.close()
    logger.debug("Saved transcript @: %s" % waveFile.replace(".wav", ".txt"))

    # Extract filename from the full file path
    filename, ext = os.path.split(os.path.basename(waveFile))
    title_names = ['Filename', 'Duration(s)', 'Inference Time(s)',
                   'Model Load Time(s)', 'LM Load Time(s)']
    logger.debug("************************************************************************************************************")
    logger.debug("%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1],
                                                 title_names[2], title_names[3],
                                                 title_names[4]))
    logger.debug("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length,
                                                           inference_time, model_retval[1],
                                                           model_retval[2]))
    logger.debug("************************************************************************************************************")
    print("\n%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1],
                                            title_names[2], title_names[3],
                                            title_names[4]))
    print("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length,
                                                    inference_time, model_retval[1],
                                                    model_retval[2]))

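# The diarization loop above reads segment.bytes, segment.begin, segment.end
# and segment.speaker from whatever speaker_diarization.diarize() returns. A
# hypothetical stand-in with that shape (an assumption, not the library's
# actual type), handy for testing the timestamp formatting without a diarizer:
from dataclasses import dataclass

@dataclass
class FakeDiarizedSegment:
    bytes: bytes     # raw 16-bit PCM for the segment
    begin: float     # segment start, in seconds
    end: float       # segment end, in seconds
    speaker: int     # speaker label assigned by the diarizer
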
def init_stt(output_graph_path, alphabet_path, lm_path, trie_path, rate):
    global model, sample_rate
    sample_rate = rate
    model = wavTranscriber.load_model(output_graph_path, alphabet_path, lm_path, trie_path)
    logging.debug('Process {}: Loaded models'.format(os.getpid()))

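# init_stt looks like a multiprocessing.Pool initializer: each worker process
# loads its own model copy once and then reuses it for every task. A sketch of
# that wiring (an assumption, with placeholder file names):
import multiprocessing

pool = multiprocessing.Pool(processes=4,
                            initializer=init_stt,
                            initargs=('output_graph.pbmm', 'alphabet.txt',
                                      'lm.binary', 'trie', 16000))
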
def runDeepspeech(self, waveFile, progress_callback):
    # Deepspeech will be run from this method
    logging.debug("Preparing for transcription...")

    # Go and fetch the models from the directory specified
    if self.dirName:
        # Resolve all the paths of model files
        output_graph, alphabet, lm, trie = wavTranscriber.resolve_models(self.dirName)
    else:
        logging.critical("*****************************************************")
        logging.critical("Model path not specified..")
        logging.critical("Are you sure of what you're doing?")
        logging.critical("Aborting transcription.")
        logging.critical("*****************************************************")
        return "Transcription Failed, models path not specified"

    # Load output_graph, alphabet, lm and trie
    model_retval = wavTranscriber.load_model(output_graph, alphabet, lm, trie)
    inference_time = 0.0

    # Run VAD on the input file
    segments, sample_rate, audio_length = wavTranscriber.vad_segment_generator(waveFile, 1)
    transcript_path = os.path.splitext(waveFile)[0] + ".txt"
    f = open(transcript_path, 'w')
    logging.debug("Saving Transcript @: %s" % transcript_path)

    for i, segment in enumerate(segments):
        # Run deepspeech on the chunk that just completed VAD
        logging.debug("Processing chunk %002d" % (i,))
        audio = np.frombuffer(segment, dtype=np.int16)
        output = wavTranscriber.stt(model_retval[0], audio, sample_rate)
        inference_time += output[1]
        f.write(output[0] + " ")
        progress_callback.emit(output[0] + " ")

    # Summary of the files processed
    f.close()

    # Format pretty, extract filename from the full file path
    filename, ext = os.path.split(os.path.basename(waveFile))
    title_names = ['Filename', 'Duration(s)', 'Inference Time(s)',
                   'Model Load Time(s)', 'LM Load Time(s)']
    logging.debug("************************************************************************************************************")
    logging.debug("%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1],
                                                  title_names[2], title_names[3],
                                                  title_names[4]))
    logging.debug("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length,
                                                            inference_time, model_retval[1],
                                                            model_retval[2]))
    logging.debug("************************************************************************************************************")
    print("\n%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1],
                                            title_names[2], title_names[3],
                                            title_names[4]))
    print("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length,
                                                    inference_time, model_retval[1],
                                                    model_retval[2]))

    return "\n*********************\nTranscription Done..."

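# runDeepspeech emits partial transcripts through progress_callback.emit(),
# which suggests a Qt signal object. A minimal stand-in for exercising the
# method outside the GUI (assumption: only .emit() is required):
class PrintSignal:
    def emit(self, text):
        print(text, end='', flush=True)
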