class MozillaDeepSpeechEngine(Engine):
    def __init__(self, pbmm_path: str, scorer_path: str):
        self._model = Model(pbmm_path)
        self._model.enableExternalScorer(scorer_path)
        self._audio_sec = 0.
        self._proc_sec = 0.

    def transcribe(self, path: str) -> str:
        audio, sample_rate = soundfile.read(path, dtype='int16')
        assert sample_rate == self._model.sampleRate()
        self._audio_sec += audio.size / sample_rate

        start_sec = time.time()
        res = self._model.stt(audio)
        self._proc_sec += time.time() - start_sec

        return res

    def rtf(self) -> float:
        return self._proc_sec / self._audio_sec

    def delete(self) -> None:
        pass

    def __str__(self) -> str:
        return 'Mozilla DeepSpeech'
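# A minimal usage sketch for the engine above; the model/scorer file names are
# placeholders and `test.wav` is assumed to be a 16 kHz mono WAV file.
engine = MozillaDeepSpeechEngine(
    pbmm_path='deepspeech-0.9.3-models.pbmm',
    scorer_path='deepspeech-0.9.3-models.scorer')
print(engine.transcribe('test.wav'))  # transcript string
print('RTF: %.2f' % engine.rtf())     # real-time factor: processing time / audio duration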
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--lm', nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('--trie', nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--beam_width', type=int, default=500,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha', type=float, default=0.75,
                        help='Language model weight (lm_alpha)')
    parser.add_argument('--lm_beta', type=float, default=1.85,
                        help='Word insertion bonus (lm_beta)')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    parser.add_argument('--json', required=False, action='store_true',
                        help='Output json from metadata with timestamp of each word')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, args.beam_width)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    desired_sample_rate = ds.sampleRate()

    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.lm, args.trie, args.lm_alpha, args.lm_beta)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    fin = wave.open(args.audio, 'rb')
    fs = fin.getframerate()
    if fs != desired_sample_rate:
        print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs, desired_sample_rate), file=sys.stderr)
        fs, audio = convert_samplerate(args.audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1/fs)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio)))
    elif args.json:
        print(metadata_json_output(ds.sttWithMetadata(audio)))
    else:
        print(ds.stt(audio))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
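# The convert_samplerate() helper used above (and in several later snippets) is
# omitted there; this is the standard SoX-based version, identical to the one
# defined inside the DeepSpeechRecognizer class further below. It assumes the
# `sox` binary is installed and on PATH.
import shlex
import subprocess

import numpy as np

try:
    from shhlex import quote
except ImportError:
    from pipes import quote


def convert_samplerate(audio_path, desired_sample_rate):
    # Ask SoX for raw 16-bit mono PCM at the model's sample rate.
    sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - '.format(
        quote(audio_path), desired_sample_rate)
    try:
        output = subprocess.check_output(shlex.split(sox_cmd), stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        raise RuntimeError('SoX returned non-zero status: {}'.format(e.stderr))
    except OSError as e:
        raise OSError(e.errno, 'SoX not found, use {}hz files or install it: {}'.format(
            desired_sample_rate, e.strerror))
    return desired_sample_rate, np.frombuffer(output, np.int16)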
def client(audio_file, lang="uk"):
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    model = "./uk.tflite"
    ds = Model(model)
    # ds.enableExternalScorer("kenlm.scorer")
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    desired_sample_rate = ds.sampleRate()

    fin = wave.open(audio_file, 'rb')
    fs_orig = fin.getframerate()
    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    audio_length = fin.getnframes() * (1 / fs_orig)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    # sphinx-doc: python_ref_inference_start
    result = ds.stt(audio)
    print(result)
    # sphinx-doc: python_ref_inference_stop
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
    return result
def transcribe(audio_path):
    ds = Model(model_path="deepspeech-0.7.0-models.pbmm")
    desired_sample_rate = ds.sampleRate()
    print(desired_sample_rate)
    ds.enableExternalScorer("deepspeech-0.7.0-models.scorer")

    fin = wave.open(audio_path, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        # Note: the original mixed %-interpolation with a {}-style format
        # string; str.format() is what was intended here.
        print("Converting from {}hz to {}hz".format(fs_orig, desired_sample_rate))
        fs_new, audio = convert_samplerate(audio_path, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    audio_length = fin.getnframes() * (1 / fs_orig)
    fin.close()

    inference_start = timer()
    transcript = ds.sttWithMetadata(audio, 1).transcripts[0]
    json_result = metadata_json_output(transcript)
    string_result = metadata_to_string(transcript)
    inference_end = timer() - inference_start
    print(json_result)
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
    return json_result, string_result
class DeepSpeechRecognizer:
    def __init__(self):
        self.file_path = Path(__file__).parent
        self.model = Model(
            '/Users/shihangyu/Scripts/python/stt_server/model/deepspeech-0.6.1-models/output_graph.pbmm',
            aBeamWidth=500)
        self.desired_sample_rate = self.model.sampleRate()
        self.logger = getLogger(self.__module__)
        self.tmp_path = self.file_path / 'tmp.wav'

    def __convert_samplerate(self, audio_path):
        sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - '.format(
            quote(audio_path), self.desired_sample_rate)
        try:
            output = subprocess.check_output(shlex.split(sox_cmd), stderr=subprocess.PIPE)
        except subprocess.CalledProcessError as e:
            raise RuntimeError('SoX returned non-zero status: {}'.format(e.stderr))
        except OSError as e:
            raise OSError(e.errno, 'SoX not found, use {}hz files or install it: {}'.format(
                self.desired_sample_rate, e.strerror))
        return self.desired_sample_rate, np.frombuffer(output, np.int16)

    def inference(self, audio_path):
        try:
            fin = wave.open(audio_path, 'rb')
        except Exception:
            # Fall back to librosa for formats the wave module cannot read,
            # writing a temporary 16 kHz WAV first.
            x, _ = librosa.load(str(audio_path), sr=16000)
            sf.write(str(self.tmp_path), x, 16000)
            fin = wave.open(str(self.tmp_path), 'rb')
        fs = fin.getframerate()
        if fs != self.desired_sample_rate:
            # self.logger.warning(f'Warning: original sample rate ({fs}) is different than {self.desired_sample_rate}hz. '
            #                     f'Resampling might produce erratic speech recognition.')
            fs, audio = self.__convert_samplerate(audio_path)
        else:
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        fin.close()
        output = self.model.stt(audio)
        self.logger.debug(f"DeepSpeechRecognizer inference output: {output}")
        return output
def transcribe(args, filepath="", verbose=0):
    if verbose > 0:
        print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, args.beam_width)
    if verbose > 0:
        model_load_end = timer() - model_load_start
        print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    desired_sample_rate = ds.sampleRate()

    if args.lm and args.trie:
        if verbose > 0:
            print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.lm, args.trie, args.lm_alpha, args.lm_beta)
        if verbose > 0:
            lm_load_end = timer() - lm_load_start
            print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    fin = wave.open(filepath, 'rb')
    fs = fin.getframerate()
    if fs != desired_sample_rate:
        if verbose > 0:
            print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs, desired_sample_rate), file=sys.stderr)
        fs, audio = convert_samplerate(filepath, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    audio_length = fin.getnframes() * (1/fs)
    fin.close()

    if verbose > 0:
        print('Running inference.', file=sys.stderr)
    inference_start = timer()
    audio_metadata = ds.sttWithMetadata(audio)
    if verbose > 0:
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)

    dict_result = dict()
    dict_result["sentence"] = "".join(item.character for item in audio_metadata.items)
    dict_result["words"] = words_from_metadata(audio_metadata)
    dict_result["characters"] = audio_metadata
    dict_result["confidence"] = audio_metadata.confidence
    return dict_result
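# words_from_metadata() is not defined in the snippet above. A sketch in the
# style of Mozilla's DeepSpeech examples for the 0.6-era metadata API, where
# each metadata item carries a single `character` and a `start_time`:
def words_from_metadata(metadata):
    word = ""
    word_list = []
    word_start_time = 0
    for i, item in enumerate(metadata.items):
        if item.character != " ":
            word += item.character
            if len(word) == 1:
                # First character of a new word: remember when it started.
                word_start_time = item.start_time
        if item.character == " " or i == len(metadata.items) - 1:
            # Word boundary (space or end of transcript): flush the word.
            if word:
                word_list.append({
                    "word": word,
                    "start_time": round(word_start_time, 4),
                    "duration": round(item.start_time - word_start_time, 4),
                })
            word = ""
            word_start_time = 0
    return word_list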
def load_transcribe_model():
    model_load_start = timer()
    global ds
    ds = Model(os.path.join(home_dir, "models", "deepspeech-0.9.3-models.pbmm"))
    ds.enableExternalScorer(
        os.path.join(home_dir, "models", "deepspeech-0.9.3-models.scorer"))
    model_load_end = timer() - model_load_start
    logging.info('Loaded model in {:.3}s.'.format(model_load_end))

    global desired_sample_rate
    desired_sample_rate = ds.sampleRate()
    logging.info('Model optimized for a sample rate of ' + str(desired_sample_rate))
def load_model(models, lm, trie):
    BEAM_WIDTH = 500
    LM_ALPHA = 0.75
    LM_BETA = 1.85
    ds = Model(models, BEAM_WIDTH)
    ds.enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA)
    sample_rate = ds.sampleRate()
    return [ds, sample_rate]
def speech_to_text(input_file, file_length, return_speed_per_chunk=False, chunk_size=10):
    """
    Compute the words pronounced in the input_file

    :param input_file: sound file path
    :param file_length: time length of the input file (in seconds)
    :param return_speed_per_chunk: if True, the function returns a list of words per chunk;
        if False, it returns all the words in the extract
    :param chunk_size: length in seconds of each audio chunk
    :return: words as string
    """
    # set up the model
    if return_speed_per_chunk:
        result = []
    else:
        result = ""
    recognizer = Model("models/deepspeech-0.8.2-models.pbmm")
    recognizer.setBeamWidth(2000)
    recognizer.enableExternalScorer("models/deepspeech-0.8.2-models.scorer")
    desired_sample_rate = recognizer.sampleRate()

    # split the input file into smaller audio chunks (apparently works better)
    CHUNK_SIZE = chunk_size
    n_chunks = int(file_length // CHUNK_SIZE)
    for i in range(n_chunks):
        tfm = sox.Transformer()
        tfm.trim(i * CHUNK_SIZE, (i + 1) * CHUNK_SIZE)
        tfm.set_output_format(channels=1)
        tfm.build(input_file, "temp_folder/chunked_file{}.wav".format(i))
        # cmb = sox.Combiner()
        input_list = [
            "audio-files/silence.wav",
            "temp_folder/chunked_file{}.wav".format(i),
            "audio-files/silence.wav"
        ]
        input_list_correct_sample_rate = list(
            map(lambda file: convert_samplerate(file, desired_sample_rate)[1], input_list))
        audio = np.concatenate(input_list_correct_sample_rate)
        # cmb.build(input_list, "temp_folder/chunked_file_with_silence{}.wav".format(i), combine_type="concatenate")
        # fs, audio = convert_samplerate("temp_folder/chunked_file_with_silence{}.wav".format(i), desired_sample_rate)
        if return_speed_per_chunk:
            result.append(recognizer.stt(audio))
        else:
            result += recognizer.stt(audio)
        os.remove("temp_folder/chunked_file{}.wav".format(i))
        # os.remove("temp_folder/chunked_file_with_silence{}.wav".format(i))
    print(result)
    return result
def main():
    audio_files = glob.glob("uploads/*.wav")
    speech_model = "deepspeech-0.9.3-models.pbmm"
    speech_scorer = "deepspeech-0.9.3-models.scorer"
    speech_audio = audio_files[0]

    print('Loading model from file', file=sys.stderr)
    model_load_start = timer()
    ds = Model(speech_model)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    desired_sample_rate = ds.sampleRate()

    print('Loading scorer from files', file=sys.stderr)
    scorer_load_start = timer()
    ds.enableExternalScorer(speech_scorer)
    scorer_load_end = timer() - scorer_load_start
    print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

    fin = wave.open(speech_audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs_orig, desired_sample_rate), file=sys.stderr)
        fs_new, audio = convert_samplerate(speech_audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    audio_length = fin.getnframes() * (1 / fs_orig)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    audio_transcription = metadata_json_output(ds.sttWithMetadata(audio, 3))
    # Measure elapsed time only after inference has actually run; the original
    # computed inference_end before calling sttWithMetadata().
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
    print('Candidate transcripts:', 3)
    return audio_transcription
def load_model():
    models = "models/output_graph.tflite"  # .tflite
    lm = "models/lm.binary"  # lm.binary
    trie = "models/trie"  # trie
    BEAM_WIDTH = 500
    LM_ALPHA = 0.75
    LM_BETA = 1.85
    ds = Model(models, BEAM_WIDTH)
    ds.enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA)
    sample_rate = ds.sampleRate()
    return [ds, sample_rate]
class DeepLearnModel:
    def __init__(self):
        self.model = None

    def init_app(self, app):
        self.model = Model(str(app.config["DL_MODEL_PATH"]))
        scorer = app.config['DL_SCORER_PATH']
        if scorer:
            self._load_scorer(scorer)

    def _load_scorer(self, scorer):
        logging.info('Loading scorer from files {}'.format(scorer))
        self.model.enableExternalScorer(scorer)

    def infer(self, audio_sample):
        audio = sample_audio(audio_sample, self.model.sampleRate())
        return self.model.stt(audio)
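# sample_audio() is not shown with the class above. One plausible
# implementation, assuming the payload is a complete audio file (e.g. WAV
# bytes from an upload) and that pydub/ffmpeg are available:
import io

import numpy as np
import pydub


def sample_audio(audio_sample, desired_sample_rate):
    # Decode the payload, downmix to mono 16-bit, resample to the model's
    # rate, and return an int16 numpy buffer suitable for Model.stt().
    segment = pydub.AudioSegment.from_file(io.BytesIO(audio_sample))
    segment = segment.set_channels(1).set_sample_width(2).set_frame_rate(desired_sample_rate)
    return np.array(segment.get_array_of_samples(), dtype=np.int16)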
def load_model(models, lm, trie):
    BEAM_WIDTH = 500
    LM_ALPHA = 0.75
    LM_BETA = 1.85

    model_load_start = timer()
    ds = Model(models, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    logging.debug("Loaded model in %0.3fs." % (model_load_end))

    lm_load_start = timer()
    ds.enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA)
    lm_load_end = timer() - lm_load_start
    logging.debug('Loaded language model in %0.3fs.' % (lm_load_end))

    sample_rate = ds.sampleRate()
    logging.debug('Loaded model sample rate: %dHz.' % (sample_rate))

    return [ds, model_load_end, lm_load_end, sample_rate]
def stt(model_path, audio, beam_width=None, scorer_path=None,
        lm_alpha=None, lm_beta=None, hot_words=None):
    ds = Model(model_path)
    if beam_width:
        ds.setBeamWidth(beam_width)
    desired_sample_rate = ds.sampleRate()
    if scorer_path:
        ds.enableExternalScorer(scorer_path)
        if lm_alpha and lm_beta:
            ds.setScorerAlphaBeta(lm_alpha, lm_beta)
    # TODO
    # if hot_words:
    #     print('Adding hot-words', file=sys.stderr)
    #     for w in hot_words:
    #         ds.addHotWord(w, 6.2)

    fin = wave.open(audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        print(f'ERROR: original sample rate ({fs_orig}) is different than {desired_sample_rate}hz.',
              file=sys.stderr)
        exit(1)
    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    fin.close()

    print('Running inference.', file=sys.stderr)
    res = ds.sttWithMetadata(audio, 1)
    res = postprocess_metadata(res)
    return res
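# postprocess_metadata() is defined elsewhere in that project. A hypothetical
# version that flattens the 0.9.x Metadata object into a serializable dict:
def postprocess_metadata(metadata):
    transcript = metadata.transcripts[0]
    return {
        "text": "".join(token.text for token in transcript.tokens),
        "confidence": transcript.confidence,
        "tokens": [{"text": token.text, "start_time": token.start_time}
                   for token in transcript.tokens],
    }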
def process_input_file(conn, options, out_queue, background=True):
    """Given socket/pipe process audio input and push to out_queue"""
    log.info("Starting recognition on %s", conn)
    model = Model(options.model)
    if options.beam_width:
        model.setBeamWidth(options.beam_width)
    desired_sample_rate = model.sampleRate()
    if desired_sample_rate != defaults.SAMPLE_RATE:
        log.error("Model expects rate of %s", desired_sample_rate)
    # if options.scorer:
    #     model.enableExternalScorer(options.scorer)
    # else:
    log.info("Disabling the built-in scorer")
    model.disableExternalScorer()
    out_queue.put({'partial': False, 'final': False, 'message': ['Connected']})
    if background:
        thread = threading.Thread(target=run_recognition, args=(model, conn, out_queue))
        thread.setDaemon(background)
        thread.start()
    else:
        run_recognition(model, conn, out_queue)
def process_input_file(conn, options, out_queue, background=True):
    # TODO: allow socket connections from *clients* to choose
    # the model rather than setting it in the daemon...
    # to be clear, *output* clients, not audio sinks
    log.info("Starting recognition on %s", conn)
    model = Model(options.model)
    if options.beam_width:
        model.setBeamWidth(options.beam_width)
    desired_sample_rate = model.sampleRate()
    if desired_sample_rate != defaults.SAMPLE_RATE:
        log.error("Model expects rate of %s", desired_sample_rate)
    if options.scorer:
        model.enableExternalScorer(options.scorer)
    else:
        log.info("Disabling the scorer")
        model.disableExternalScorer()
    if background:
        t = threading.Thread(target=run_recognition, args=(model, conn, out_queue))
        t.setDaemon(background)
        t.start()
    else:
        run_recognition(model, conn, out_queue)
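# run_recognition() lives elsewhere in this daemon. A minimal sketch of the
# streaming loop it presumably implements, assuming `conn` yields raw 16-bit
# PCM chunks at the model's sample rate:
import numpy as np


def run_recognition(model, conn, out_queue, chunk_size=1024):
    stream = model.createStream()
    while True:
        data = conn.recv(chunk_size)
        if not data:
            break
        stream.feedAudioContent(np.frombuffer(data, np.int16))
        out_queue.put({'partial': True, 'final': False,
                       'message': [stream.intermediateDecode()]})
    out_queue.put({'partial': False, 'final': True,
                   'message': [stream.finishStream()]})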
def MozillaSTT(audio_path):
    # TODO: handle different rates (not implemented)
    fin = wave.open(audio_path, 'rb')
    output = ""
    # print("SS")
    ds = Model(model_file_path)
    # print("SS")
    ds.enableExternalScorer(scorer_file_path)
    # print("SS")
    lm_alpha = 0.75  # ??
    lm_beta = 1.85
    desired_sample_rate = ds.sampleRate()
    ds.setScorerAlphaBeta(lm_alpha, lm_beta)
    fs_orig = fin.getframerate()
    # print("Desired Sampling Rate: %d", desired_sample_rate)
    if fs_orig != desired_sample_rate:
        print('Warning: original sample rate ({}) is different than {}hz. '
              'Resampling might produce erratic speech recognition.'.format(
                  fs_orig, desired_sample_rate), file=sys.stderr)
        fs_new, audio = convert_samplerate(audio_path, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    # audio_length = fin.getnframes() * (1/fs_orig)
    fin.close()

    print('Running inference.', file=sys.stderr)
    # print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
    # print(metadata_json_output(ds.sttWithMetadata(audio, 3)))
    # print(ds.stt(audio))
    output += ds.stt(audio)
    output += '\n'
    output += metadata_json_output(ds.sttWithMetadata(audio, 3))
    return output
def load(model, scorer, verbose=True, beam_width="", lm_alpha="", lm_beta="", hot_words=""):
    """Load models"""
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    if verbose:
        print('\nLoading model from files {}'.format(model), file=sys.stderr)
        print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if beam_width:
        ds.setBeamWidth(beam_width)

    desired_sample_rate = ds.sampleRate()

    if scorer:
        if verbose:
            print('Loading scorer from files {}'.format(scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(scorer)
        scorer_load_end = timer() - scorer_load_start
        if verbose:
            print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

        if lm_alpha and lm_beta:
            ds.setScorerAlphaBeta(lm_alpha, lm_beta)

    if hot_words:
        if verbose:
            print('Adding hot-words', file=sys.stderr)
        for word_boost in hot_words.split(','):
            word, boost = word_boost.split(':')
            ds.addHotWord(word, float(boost))

    return ds, desired_sample_rate
def __init__(self):
    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    model_path = os.path.dirname(os.path.abspath(__file__))
    ds = Model(os.path.join(model_path, args.model))
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.beam_width:
        ds.setBeamWidth(args.beam_width)

    self.desired_sample_rate = ds.sampleRate()

    if args.scorer:
        print('Loading scorer from files {}'.format(args.scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(os.path.join(model_path, args.scorer))
        scorer_load_end = timer() - scorer_load_start
        print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)
        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    if args.hot_words:
        print('Adding hot-words', file=sys.stderr)
        for word_boost in args.hot_words.split(','):
            word, boost = word_boost.split(':')
            ds.addHotWord(word, float(boost))

    self.ds = ds
# -*- coding: utf-8 -*-
"""
Created on Sat Mar  6 15:41:29 2021

@author: Marshall.McDougall
"""
# import argparse
# import shlex
# import subprocess
# import sys
# import json
import wave

import numpy as np
from deepspeech import Model  # , version
# from timeit import default_timer as timer
# try:
#     from shhlex import quote
# except ImportError:
#     from pipes import quote

# deepspeech --model deepspeech-0.9.3-models.pbmm --scorer deepspeech-0.9.3-models.scorer --audio audio/SimpleTest3.wav --json

ds = Model("deepspeech-0.9.3-models.pbmm")
desired_sample_rate = ds.sampleRate()
ds.enableExternalScorer("deepspeech-0.9.3-models.scorer")

# sttWithMetadata() expects an int16 audio buffer and an integer transcript
# count, not a file path and a string, so the WAV file must be read first
# (the wave and numpy imports above are uncommented for this).
fin = wave.open("audio/SimpleTest3.wav", 'rb')
audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
fin.close()
metadata = ds.sttWithMetadata(audio, 3)
def create_data(X: dt.Frame = None) -> dt.Frame:
    if X is None:
        return []

    from deepspeech import Model

    try:
        logger = logging.getLogger(__name__)
        hdlr = logging.FileHandler(LOG_FILE)
        formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        hdlr.setFormatter(formatter)
        logger.addHandler(hdlr)
        logger.setLevel(logging.INFO)
    except:
        logger = False

    X = X.to_pandas()
    if WAV_COLNAME in X.columns:
        model = os.path.join(MODEL_PATH, "output_graph.pbmm")
        lm = os.path.join(MODEL_PATH, "lm.binary")
        trie = os.path.join(MODEL_PATH, "trie")

        if logger:
            logger.info('Loading model from file {}'.format(model))
        model_load_start = timer()
        ds = Model(model, beam_width)
        model_load_end = timer() - model_load_start
        if logger:
            logger.info('Loaded model in {:.3}s.'.format(model_load_end))

        desired_sample_rate = ds.sampleRate()

        if logger:
            logger.info('Loading language model from files {} {}'.format(lm, trie))
        lm_load_start = timer()
        ds.enableDecoderWithLM(lm, trie, lm_alpha, lm_beta)
        lm_load_end = timer() - lm_load_start
        if logger:
            logger.info('Loaded language model in {:.3}s.'.format(lm_load_end))
            logger.info('Running inference.')

        results = []
        ds_len = len(X[WAV_COLNAME])
        for i, audio_fn in enumerate(X[WAV_COLNAME].values.tolist()):
            inference_start = timer()
            audio_length = 0
            fin = wave.open(audio_fn, 'rb')
            fs = fin.getframerate()
            if fs != desired_sample_rate:
                if logger:
                    err_msg = 'Original sample rate ({}) is different than {}hz. ' \
                              'Resampling might produce erratic speech recognition.'
                    logger.warning(err_msg.format(fs, desired_sample_rate))
                fs, audio = convert_samplerate(audio_fn, desired_sample_rate)
            else:
                audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
            if MAX_SEC > 0:
                audio = audio[:int(fs * MAX_SEC)]
            audio_length = len(audio) * (1 / fs)
            fin.close()

            try:
                text = ds.stt(audio)
            except Exception as e:
                text = ''
                # Guard the logger here: it may be False if the file handler
                # could not be created above.
                if logger:
                    logger.error(e)
            results.append(text)

            inference_end = timer() - inference_start
            if logger:
                logger.info('Record {:d} of {:d}. Inference took {:0.3f}s for {:0.3f}s audio file.'
                            .format(i, ds_len, inference_end, audio_length))

        X[WAV_COLNAME + "_txt"] = results

    return dt.Frame(X)
def create_app(args):
    logging.basicConfig(level=logging.DEBUG)
    sys.stdout = LoggerWriter(logging.debug)
    sys.stderr = LoggerWriter(logging.warning)

    if not args.offline:
        from app.init import boot
        boot()

    from app.language import languages
    app = Flask(__name__)

    project_directory = args.project_directory
    if not os.path.exists(project_directory):
        os.makedirs(project_directory)

    # For faster access
    language_map = {}
    for l in languages:
        language_map[l.code] = l.name

    if args.debug:
        app.config['TEMPLATES_AUTO_RELOAD'] = True

    app.config['MAX_CONTENT_LENGTH'] = 64 * 1024 * 1024

    # Map user-defined frontend languages to argos language objects.
    if args.frontend_language_source == "auto":
        frontend_argos_language_source = type('obj', (object,), {
            'code': 'auto',
            'name': 'Auto Detect'
        })
    else:
        frontend_argos_language_source = next(
            iter([l for l in languages if l.code == args.frontend_language_source]),
            None)

    frontend_argos_language_target = next(
        iter([l for l in languages if l.code == args.frontend_language_target]),
        None)

    # Raise AttributeError to prevent app startup if user input is not valid.
    if frontend_argos_language_source is None:
        raise AttributeError(
            f"{args.frontend_language_source} as frontend source language is not supported."
        )
    if frontend_argos_language_target is None:
        raise AttributeError(
            f"{args.frontend_language_target} as frontend target language is not supported."
        )

    if args.req_limit > 0 or args.api_keys:
        from flask_limiter import Limiter
        limiter = Limiter(app,
                          key_func=get_remote_address,
                          default_limits=get_routes_limits(
                              args.req_limit,
                              Database() if args.api_keys else None))

    model_load_start = timer()
    ds = Model(os.path.join(home_dir, "models", "deepspeech-0.9.3-models.pbmm"))
    ds.enableExternalScorer(
        os.path.join(home_dir, "models", "deepspeech-0.9.3-models.scorer"))
    model_load_end = timer() - model_load_start
    logging.info('Loaded model in {:.3}s.'.format(model_load_end))
    desired_sample_rate = ds.sampleRate()
    logging.info('Model optimized for a sample rate of ' + str(desired_sample_rate))

    uuid4hex = re.compile(
        r'[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\Z', re.I)

    @app.errorhandler(400)
    def invalid_api(e):
        return jsonify({"error": str(e.description)}), 400

    @app.errorhandler(500)
    def server_error(e):
        return jsonify({"error": str(e.description)}), 500

    @app.errorhandler(429)
    def slow_down_error(e):
        return jsonify({"error": "Slowdown: " + str(e.description)}), 429

    @app.route("/")
    @limiter.exempt
    def index():
        return render_template('index.html',
                               gaId=args.ga_id,
                               frontendTimeout=args.frontend_timeout,
                               offline=args.offline,
                               api_keys=args.api_keys,
                               web_version=os.environ.get('LT_WEB') is not None)

    @app.route("/projects")
    @limiter.exempt
    def projects():
        return render_template('projects.html',
                               gaId=args.ga_id,
                               frontendTimeout=args.frontend_timeout,
                               offline=args.offline,
                               api_keys=args.api_keys,
                               projects=loadAllProjects(),
                               web_version=os.environ.get('LT_WEB') is not None)

    @app.route("/project/<id>")
    @limiter.exempt
    def project(id):
        if not uuid4hex.match(id):
            logging.error("Invalid project id")
            return redirect("/projects")
        return render_template('project.html',
                               gaId=args.ga_id,
                               frontendTimeout=args.frontend_timeout,
                               offline=args.offline,
                               api_keys=args.api_keys,
                               project=loadProjectDetails(id),
                               web_version=os.environ.get('LT_WEB') is not None)

    @app.route("/project/<id>/delete")
    @limiter.exempt
    def projectDelete(id):
        delete_project(id)
        return redirect("/projects")

    @app.route("/project/<id>/transcription")
    @limiter.exempt
    def projectTranscribe(id):
        if not uuid4hex.match(id):
            flash("Invalid project id")
            return redirect("/projects")
        logging.info("Starting the transcription job for project ID " + id)
        # Note: the original wrapped "--target-dir" and its value inside
        # os.path.join(), producing a single bogus path; they must be
        # separate argv elements.
        cmd = [
            sys.executable,
            os.path.join(home_dir, 'scripts', 'batch.py'),
            "--target-dir",
            os.path.join(project_directory, id)
        ]
        subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        return redirect("/project/" + id)

    @app.route("/project/<id>/download/<file>")
    def download(id, file):
        # todo validate the file part
        metadata = loadProjectDetails(id)
        if metadata is None:
            logging.info("Unable to find metadata for project ID: " + id)
            return redirect("/projects")
        return send_from_directory(directory=metadata['project_dir'],
                                   filename=file,
                                   as_attachment=True)

    @app.route("/create-project")
    @limiter.exempt
    def createProject():
        return render_template('create-project.html',
                               gaId=args.ga_id,
                               frontendTimeout=args.frontend_timeout,
                               offline=args.offline,
                               api_keys=args.api_keys,
                               web_version=os.environ.get('LT_WEB') is not None)

    def allowed_file(filename):
        return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

    @app.route('/new-project-upload', methods=['GET', 'POST'])
    def uploadProject():
        if request.method == 'POST':
            # check if the post request has the file part
            if 'file' not in request.files:
                return redirect(request.url)
            file = request.files['file']
            # if user does not select file, browser also
            # submit an empty part without filename
            if file.filename == '':
                return redirect(request.url)
            if file and allowed_file(file.filename):
                project_id = str(uuid.uuid4())
                if not os.path.exists(os.path.join(project_directory, project_id)):
                    os.makedirs(os.path.join(project_directory, project_id))
                fileending = file.filename.rsplit('.', 1)[1].lower()
                file.save(os.path.join(project_directory, project_id,
                                       "rawMedia." + fileending))
                # TODO store original file name
                metadata = createMetadata(project_id, request.form['name'], fileending)
                with open(os.path.join(project_directory, project_id,
                                       "metadata.json"), 'w') as f:
                    json.dump(metadata, f)
                return redirect("./project/" + project_id)

    @timeit
    def createMetadata(project_id, name, ending):
        metadata = {"name": name, "fileEnding": ending}
        in_filename = os.path.join(project_directory, project_id, "rawMedia." + ending)
        probe = ffmpeg.probe(in_filename)
        video_stream = next((stream for stream in probe['streams']
                             if stream['codec_type'] == 'video'), None)
        logging.debug(str(video_stream))
        metadata['width'] = int(video_stream['width'])
        metadata['height'] = int(video_stream['height'])
        metadata['durationSeconds'] = float(video_stream['duration'])
        (ffmpeg
         .input(in_filename, ss=3)
         .filter('scale', 512, -1)
         .output(os.path.join(project_directory, project_id, "thumbnail.png"),
                 vframes=1)
         .run())
        return metadata

    def delete_project(project_id):
        logging.info("Deleting a project with ID: " + project_id)
        # TODO make sure the ID is a valid ID and not just some bad path
        shutil.rmtree(os.path.join(project_directory, project_id))

    @app.route("/languages", methods=['GET', 'POST'])
    @limiter.exempt
    def langs():
        """
        Retrieve list of supported languages
        ---
        tags:
          - translate
        responses:
          200:
            description: List of languages
            schema:
              id: languages
              type: array
              items:
                type: object
                properties:
                  code:
                    type: string
                    description: Language code
                  name:
                    type: string
                    description: Human-readable language name (in English)
          429:
            description: Slow down
            schema:
              id: error-slow-down
              type: object
              properties:
                error:
                  type: string
                  description: Reason for slow down
        """
        return jsonify([{'code': l.code, 'name': l.name} for l in languages])

    # Add cors
    @app.after_request
    def after_request(response):
        response.headers.add('Access-Control-Allow-Origin', '*')
        response.headers.add('Access-Control-Allow-Headers',
                             "Authorization, Content-Type")
        response.headers.add('Access-Control-Expose-Headers', "Authorization")
        response.headers.add('Access-Control-Allow-Methods', "GET, POST")
        response.headers.add('Access-Control-Allow-Credentials', "true")
        response.headers.add('Access-Control-Max-Age', 60 * 60 * 24 * 20)
        return response

    @app.route("/project", methods=['GET'])
    def list_projects():
        """
        List available projects
        ---
        tags:
          - list
        """
        return jsonify({"projects": loadAllProjects()})

    def loadAllProjects():
        output = []
        for project_id in os.listdir(project_directory):
            project_details = loadProjectDetails(project_id)
            if project_details is not None:
                output.append(project_details)
        return output

    def loadProjectDetails(project_id):
        metadata_path = os.path.join(project_directory, project_id, "metadata.json")
        if not os.path.exists(metadata_path):
            return None
        metadata = json.loads(Path(metadata_path).read_text())
        metadata["id"] = project_id
        metadata['project_dir'] = os.path.join(project_directory, project_id)
        # TODO rely on this data for everything
        metadata['subtitles'] = []
        for file in os.listdir(metadata['project_dir']):
            if file.endswith(".srt"):
                metadata['subtitles'].append(file)
        if os.path.exists(os.path.join(project_directory, "subtitles.zip")):
            metadata['subtitles'].insert(0, 'subtitles.zip')
        metadata['inputVideo'] = "rawMedia." + metadata['fileEnding']
        metadata['audio'] = "audio.wav"
        return metadata

    @app.route("/translate", methods=['POST'])
    def translate():
        """
        Translate text from a language to another
        ---
        tags:
          - translate
        parameters:
          - in: formData
            name: q
            schema:
              oneOf:
                - type: string
                  example: Hello world!
                - type: array
                  example: ['Hello world!']
            required: true
            description: Text(s) to translate
          - in: formData
            name: source
            schema:
              type: string
              example: en
            required: true
            description: Source language code
          - in: formData
            name: target
            schema:
              type: string
              example: es
            required: true
            description: Target language code
          - in: formData
            name: api_key
            schema:
              type: string
              example: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
            required: false
            description: API key
        responses:
          200:
            description: Translated text
            schema:
              id: translate
              type: object
              properties:
                translatedText:
                  oneOf:
                    - type: string
                    - type: array
                  description: Translated text(s)
          400:
            description: Invalid request
            schema:
              id: error-response
              type: object
              properties:
                error:
                  type: string
                  description: Error message
          500:
            description: Translation error
            schema:
              id: error-response
              type: object
              properties:
                error:
                  type: string
                  description: Error message
          429:
            description: Slow down
            schema:
              id: error-slow-down
              type: object
              properties:
                error:
                  type: string
                  description: Reason for slow down
        """
        if request.is_json:
            json = request.get_json()
            q = json.get('q')
            source_lang = json.get('source')
            target_lang = json.get('target')
        else:
            q = request.values.get("q")
            source_lang = request.values.get("source")
            target_lang = request.values.get("target")

        if not q:
            abort(400, description="Invalid request: missing q parameter")
        if not source_lang:
            abort(400, description="Invalid request: missing source parameter")
        if not target_lang:
            abort(400, description="Invalid request: missing target parameter")

        batch = isinstance(q, list)

        if batch and args.batch_limit != -1:
            batch_size = len(q)
            if args.batch_limit < batch_size:
                abort(400,
                      description="Invalid request: Request (%d) exceeds text limit (%d)"
                      % (batch_size, args.batch_limit))

        if args.char_limit != -1:
            if batch:
                chars = sum([len(text) for text in q])
            else:
                chars = len(q)
            if args.char_limit < chars:
                abort(400,
                      description="Invalid request: Request (%d) exceeds character limit (%d)"
                      % (chars, args.char_limit))

        if source_lang == 'auto':
            candidate_langs = list(
                filter(lambda l: l.lang in language_map, detect_langs(q)))
            if len(candidate_langs) > 0:
                candidate_langs.sort(key=lambda l: l.prob, reverse=True)
                if args.debug:
                    print(candidate_langs)
                source_lang = next(
                    iter([l.code for l in languages
                          if l.code == candidate_langs[0].lang]),
                    None)
                if not source_lang:
                    source_lang = 'en'
            else:
                source_lang = 'en'
            if args.debug:
                print("Auto detected: %s" % source_lang)

        src_lang = next(iter([l for l in languages if l.code == source_lang]), None)
        tgt_lang = next(iter([l for l in languages if l.code == target_lang]), None)

        if src_lang is None:
            abort(400, description="%s is not supported" % source_lang)
        if tgt_lang is None:
            abort(400, description="%s is not supported" % target_lang)

        translator = src_lang.get_translation(tgt_lang)

        try:
            if batch:
                return jsonify({
                    "translatedText": [translator.translate(text) for text in q]
                })
            else:
                return jsonify({"translatedText": translator.translate(q)})
        except Exception as e:
            abort(500, description="Cannot translate text: %s" % str(e))

    @app.route("/detect", methods=['POST'])
    def detect():
        """
        Detect the language of a single text
        ---
        tags:
          - translate
        parameters:
          - in: formData
            name: q
            schema:
              type: string
              example: Hello world!
            required: true
            description: Text to detect
          - in: formData
            name: api_key
            schema:
              type: string
              example: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
            required: false
            description: API key
        responses:
          200:
            description: Detections
            schema:
              id: detections
              type: array
              items:
                type: object
                properties:
                  confidence:
                    type: number
                    format: float
                    minimum: 0
                    maximum: 1
                    description: Confidence value
                    example: 0.6
                  language:
                    type: string
                    description: Language code
                    example: en
          400:
            description: Invalid request
            schema:
              id: error-response
              type: object
              properties:
                error:
                  type: string
                  description: Error message
          500:
            description: Detection error
            schema:
              id: error-response
              type: object
              properties:
                error:
                  type: string
                  description: Error message
          429:
            description: Slow down
            schema:
              id: error-slow-down
              type: object
              properties:
                error:
                  type: string
                  description: Reason for slow down
        """
        if request.is_json:
            json = request.get_json()
            q = json.get('q')
        else:
            q = request.values.get("q")

        if not q:
            abort(400, description="Invalid request: missing q parameter")

        candidate_langs = list(
            filter(lambda l: l.lang in language_map, detect_langs(q)))
        candidate_langs.sort(key=lambda l: l.prob, reverse=True)
        return jsonify([{
            'confidence': l.prob,
            'language': l.lang
        } for l in candidate_langs])

    @app.route("/frontend/settings")
    @limiter.exempt
    def frontend_settings():
        """
        Retrieve frontend specific settings
        ---
        tags:
          - frontend
        responses:
          200:
            description: frontend settings
            schema:
              id: frontend-settings
              type: object
              properties:
                charLimit:
                  type: integer
                  description: Character input limit for this language (-1 indicates no limit)
                frontendTimeout:
                  type: integer
                  description: Frontend translation timeout
                language:
                  type: object
                  properties:
                    source:
                      type: object
                      properties:
                        code:
                          type: string
                          description: Language code
                        name:
                          type: string
                          description: Human-readable language name (in English)
                    target:
                      type: object
                      properties:
                        code:
                          type: string
                          description: Language code
                        name:
                          type: string
                          description: Human-readable language name (in English)
        """
        return jsonify({
            'charLimit': args.char_limit,
            'frontendTimeout': args.frontend_timeout,
            'language': {
                'source': {
                    'code': frontend_argos_language_source.code,
                    'name': frontend_argos_language_source.name
                },
                'target': {
                    'code': frontend_argos_language_target.code,
                    'name': frontend_argos_language_target.name
                }
            }
        })

    swag = swagger(app)
    swag['info']['version'] = "1.2"
    swag['info']['title'] = "LibreTranslate"

    @app.route("/spec")
    @limiter.exempt
    def spec():
        return jsonify(swag)

    SWAGGER_URL = '/docs'  # URL for exposing Swagger UI (without trailing '/')
    API_URL = '/spec'

    # Call factory function to create our blueprint
    swaggerui_blueprint = get_swaggerui_blueprint(SWAGGER_URL, API_URL)
    app.register_blueprint(swaggerui_blueprint)

    return app
class DeepSpeechInput(AudioInput):
    """
    Input from DeepSpeech using the US English language model.
    """
    def __init__(self,
                 notifier,
                 rate=None,
                 wav_dir=None,
                 model=os.path.join(_MODEL_DIR, 'models.pbmm'),
                 scorer=os.path.join(_MODEL_DIR, 'models.scorer')):
        """
        @see AudioInput.__init__()

        :type  rate:
        :param rate:
            The override for the rate, if not the model's one.
        :type  wav_dir:
        :param wav_dir:
            Where to save the wave files, if anywhere.
        :type  model:
        :param model:
            The path to the DeepSpeech model file.
        :type  scorer:
        :param scorer:
            The path to the DeepSpeech scorer file.
        """
        # If these don't exist then DeepSpeech will segfault when inferring!
        if not os.path.exists(model):
            raise IOError("Not found: %s" % (model,))

        # Load in and configure the model.
        LOG.info("Loading model from %s" % (model,))
        self._model = Model(model)

        if os.path.exists(scorer):
            LOG.info("Loading scorer from %s" % (scorer,))
            self._model.enableExternalScorer(scorer)

        # Handle any rate override
        if rate is None:
            rate = self._model.sampleRate()

        # We can now init the superclass
        super(DeepSpeechInput, self).__init__(notifier,
                                              format=pyaudio.paInt16,
                                              channels=1,
                                              rate=rate,
                                              wav_dir=wav_dir)

        # Where we put the stream context
        self._context = None

    def _feed_raw(self, data):
        """
        @see AudioInput._feed_raw()
        """
        if self._context is None:
            self._context = self._model.createStream()
        audio = numpy.frombuffer(data, numpy.int16)
        self._context.feedAudioContent(audio)

    def _decode(self):
        """
        @see AudioInput._decode()
        """
        if self._context is None:
            # No context means no tokens
            LOG.warning("Had no stream context to close")
            tokens = []
        else:
            # Finish up by finishing the decoding
            words = self._context.finishStream()
            LOG.info("Got: %s" % (words,))
            self._context = None

            # And tokenize
            tokens = [Token(word.strip(), 1.0, True)
                      for word in words.split(' ')
                      if len(word.strip()) > 0]
        return tokens
def app_sst_with_video(model_path: str, lm_path: str, lm_alpha: float,
                       lm_beta: float, beam: int):
    class AudioProcessor(AudioProcessorBase):
        frames_lock: threading.Lock
        frames: deque

        def __init__(self) -> None:
            self.frames_lock = threading.Lock()
            self.frames = deque([])

        async def recv_queued(self, frames: List[av.AudioFrame]) -> av.AudioFrame:
            with self.frames_lock:
                self.frames.extend(frames)

            # Return empty frames to be silent.
            new_frames = []
            for frame in frames:
                input_array = frame.to_ndarray()
                new_frame = av.AudioFrame.from_ndarray(
                    np.zeros(input_array.shape, dtype=input_array.dtype),
                    layout=frame.layout.name,
                )
                new_frame.sample_rate = frame.sample_rate
                new_frames.append(new_frame)
            return new_frames

    webrtc_ctx = webrtc_streamer(
        key="speech-to-text-w-video",
        mode=WebRtcMode.SENDRECV,
        audio_processor_factory=AudioProcessor,
        rtc_configuration={
            "iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]
        },
        media_stream_constraints={"video": True, "audio": True},
    )

    status_indicator = st.empty()

    if not webrtc_ctx.state.playing:
        return

    status_indicator.write("Loading...")
    text_output = st.empty()
    stream = None

    while True:
        if webrtc_ctx.audio_processor:
            if stream is None:
                from deepspeech import Model

                model = Model(model_path)
                model.enableExternalScorer(lm_path)
                model.setScorerAlphaBeta(lm_alpha, lm_beta)
                model.setBeamWidth(beam)

                stream = model.createStream()

                status_indicator.write("Model loaded.")

            sound_chunk = pydub.AudioSegment.empty()

            audio_frames = []
            with webrtc_ctx.audio_processor.frames_lock:
                while len(webrtc_ctx.audio_processor.frames) > 0:
                    frame = webrtc_ctx.audio_processor.frames.popleft()
                    audio_frames.append(frame)

            if len(audio_frames) == 0:
                time.sleep(0.1)
                status_indicator.write("No frame arrived.")
                continue

            status_indicator.write("Running. Say something!")

            for audio_frame in audio_frames:
                sound = pydub.AudioSegment(
                    data=audio_frame.to_ndarray().tobytes(),
                    sample_width=audio_frame.format.bytes,
                    frame_rate=audio_frame.sample_rate,
                    channels=len(audio_frame.layout.channels),
                )
                sound_chunk += sound

            if len(sound_chunk) > 0:
                sound_chunk = sound_chunk.set_channels(1).set_frame_rate(
                    model.sampleRate())
                buffer = np.array(sound_chunk.get_array_of_samples())
                stream.feedAudioContent(buffer)
                text = stream.intermediateDecode()
                text_output.markdown(f"**Text:** {text}")
        else:
            status_indicator.write("AudioReceiver is not set. Abort.")
            break
import os
import wave
import numpy as np
import sys
import shlex
import subprocess

from deepspeech import Model
from tqdm import tqdm

try:
    from shhlex import quote
except ImportError:
    from pipes import quote

model = Model("deepspeech-0.9.3-models.pbmm")
model.enableExternalScorer("deepspeech-0.9.3-models.scorer")
desired_sample_rate = model.sampleRate()

PATH = os.path.join("LJSpeech-1.1", "wavs")
TOTAL_SAMPLES = 100


def convert_samplerate(audio_path, desired_sample_rate):
    sox_cmd = "sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - ".format(
        quote(audio_path), desired_sample_rate)
    try:
        output = subprocess.check_output(shlex.split(sox_cmd), stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        raise RuntimeError("SoX returned non-zero status: {}".format(e.stderr))
    except OSError as e:
        raise OSError(
            e.errno,
            "SoX not found, use {}hz files or install it: {}".format(
                desired_sample_rate, e.strerror))
    return desired_sample_rate, np.frombuffer(output, np.int16)
def app_sst(model_path: str, lm_path: str, lm_alpha: float, lm_beta: float, beam: int):
    webrtc_ctx = webrtc_streamer(
        key="speech-to-text",
        mode=WebRtcMode.SENDONLY,
        audio_receiver_size=1024,
        rtc_configuration={
            "iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]
        },
        media_stream_constraints={"video": False, "audio": True},
    )

    status_indicator = st.empty()

    if not webrtc_ctx.state.playing:
        return

    status_indicator.write("Loading...")
    text_output = st.empty()
    stream = None

    while True:
        if webrtc_ctx.audio_receiver:
            if stream is None:
                from deepspeech import Model

                model = Model(model_path)
                model.enableExternalScorer(lm_path)
                model.setScorerAlphaBeta(lm_alpha, lm_beta)
                model.setBeamWidth(beam)

                stream = model.createStream()

                status_indicator.write("Model loaded.")

            sound_chunk = pydub.AudioSegment.empty()
            try:
                audio_frames = webrtc_ctx.audio_receiver.get_frames(timeout=1)
            except queue.Empty:
                time.sleep(0.1)
                status_indicator.write("No frame arrived.")
                continue

            status_indicator.write("Running. Say something!")

            for audio_frame in audio_frames:
                sound = pydub.AudioSegment(
                    data=audio_frame.to_ndarray().tobytes(),
                    sample_width=audio_frame.format.bytes,
                    frame_rate=audio_frame.sample_rate,
                    channels=len(audio_frame.layout.channels),
                )
                sound_chunk += sound

            if len(sound_chunk) > 0:
                sound_chunk = sound_chunk.set_channels(1).set_frame_rate(
                    model.sampleRate())
                buffer = np.array(sound_chunk.get_array_of_samples())
                stream.feedAudioContent(buffer)
                text = stream.intermediateDecode()
                text_output.markdown(f"**Text:** {text}")
        else:
            status_indicator.write("AudioReceiver is not set. Abort.")
            break
class SpeechToTextEngine:
    """ Class to perform speech-to-text transcription and related functionality """

    def __init__(self, scorer='deepspeech_model.scorer') -> None:
        """ Initializing the DeepSpeech model """
        self.model = Model(model_path=Path(__file__).parents[2].joinpath(
            'deepspeech_model.pbmm').absolute().as_posix())
        self.model.enableExternalScorer(scorer_path=Path(
            __file__).parents[2].joinpath(scorer).absolute().as_posix())

    def run(self, audio) -> str:
        """ Receives the audio, normalizes it and sends it to the model to be
        transcribed. Returns the transcribed audio as a string. """
        normalized_audio = normalize_audio_input(audio)
        audio_streams = BytesIO(normalized_audio)
        with wave.Wave_read(audio_streams) as wav:
            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()), np.int16)
        results = self.model.stt(audio_buffer=audio_streams)
        return results

    def run_with_metadata(self, audio) -> Metadata:
        normalized_audio = normalize_audio_input(audio)
        audio_streams = BytesIO(normalized_audio)
        with wave.Wave_read(audio_streams) as wav:
            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()), np.int16)
        results = self.model.sttWithMetadata(audio_buffer=audio_streams)
        return results

    def add_hot_words(self, data) -> list:
        """ Receives data in the form of hot-words and boosts, adds them to the
        language model and returns the list of added hot-words """
        all_hot_words = []
        try:
            logger.info('----------------------------------------------------')
            for hot_word in data:
                # Change all the characters of the hot-word to lower case
                word = hot_word.lower()

                # Get numeric value of the boost
                boost = float(data.get(hot_word))

                # Adding the hot-word and its boost to the language model
                self.model.addHotWord(hot_word, boost)

                # Printing on the prompt the activity
                logger.info(f"`{word}` hot-word with boost `{boost}` was added.")
                all_hot_words.append(word)
            return all_hot_words
        except RuntimeError:
            return []

    def erase_hot_word(self, hot_words) -> None:
        try:
            for hot_word in hot_words:
                self.model.eraseHotWord(hot_word)
                logger.info(f"`{hot_word}` hot-word is erased.")
            logger.info('----------------------------------------------------')
        except RuntimeError:
            return

    def clear_hot_words(self) -> str:
        try:
            self.model.clearHotWords()
            return "All hot-words were erased."
        except RuntimeError:
            return "No more hot-words are left."

    def sample_rate(self):
        return self.model.sampleRate()
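# normalize_audio_input() is not defined here. Mozilla's DeepSpeech examples
# include a similar helper that shells out to ffmpeg; a sketch along those
# lines, assuming ffmpeg is installed:
import subprocess


def normalize_audio_input(audio):
    # Re-encode whatever the client sent as 16 kHz mono 16-bit WAV bytes,
    # which wave.Wave_read can then parse from a BytesIO buffer.
    return subprocess.run(
        ['ffmpeg', '-i', '-', '-f', 'wav', '-acodec', 'pcm_s16le',
         '-ac', '1', '-ar', '16000', '-nostats', '-loglevel', '0', '-'],
        input=audio, stdout=subprocess.PIPE, check=True).stdout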
def main_transcript(video_to_encode):
    msg = ""
    mp3file = video_to_encode.get_video_mp3().source_file \
        if video_to_encode.get_video_mp3() else None
    lang = video_to_encode.main_lang

    # check if DS_PARAM[lang] exists
    if not DS_PARAM.get(lang):
        msg += "\n no deepspeech model found for lang:%s." % lang
        msg += "Please add it in DS_PARAM."
        return msg

    ds_model = Model(DS_PARAM[lang]['model'], DS_PARAM[lang]['beam_width'])
    if all([cond in DS_PARAM[lang]
            for cond in ['alphabet', 'lm', 'trie', 'lm_alpha', 'lm_beta']]):
        ds_model.enableDecoderWithLM(DS_PARAM[lang]['lm'],
                                     DS_PARAM[lang]['trie'],
                                     DS_PARAM[lang]['lm_alpha'],
                                     DS_PARAM[lang]['lm_beta'])

    desired_sample_rate = ds_model.sampleRate()

    webvtt = WebVTT()
    inference_start = timer()
    last_item = None
    sentences = []
    sentence = []
    metadata = None

    for start_trim in range(0, video_to_encode.duration, AUDIO_SPLIT_TIME):
        end_trim = video_to_encode.duration \
            if start_trim + AUDIO_SPLIT_TIME > video_to_encode.duration \
            else (start_trim + AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH)
        duration = (AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH) \
            if start_trim + AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH < video_to_encode.duration \
            else (video_to_encode.duration - start_trim)

        msg += "\ntake audio from %s to %s - %s" % (start_trim, end_trim, duration)
        audio = convert_samplerate(mp3file.path, desired_sample_rate,
                                   start_trim, duration)

        msg += '\nRunning inference.'
        metadata = ds_model.sttWithMetadata(audio)
        msg += '\nConfidence : %s' % metadata.confidence

        sentences[:] = []  # empty list
        sentence[:] = []  # empty list

        refItem = metadata.items[0]
        index = get_index(metadata, last_item, start_trim) if last_item else 0

        # nb of characters in AUDIO_SPLIT_TIME
        msg += "METADATA ITEMS : %d " % len(metadata.items)

        sentences = get_sentences(metadata, refItem, index)
        last_item = (sentences[-1][-1].character, sentences[-1][-1].start_time) \
            if len(sentences) > 0 else ()

        for sent in sentences:
            if len(sent) > 0:
                start_time = sent[0].start_time + start_trim
                end_time = sent[-1].start_time + start_trim
                str_sentence = ''.join(item.character for item in sent)
                # print(start_time, end_time, str_sentence)
                caption = Caption(
                    '%s.%s' % (timedelta(seconds=int(str(start_time).split('.')[0])),
                               str('%.3f' % start_time).split('.')[1]),
                    '%s.%s' % (timedelta(seconds=int(str(end_time).split('.')[0])),
                               str('%.3f' % end_time).split('.')[1]),
                    ['%s' % str_sentence])
                webvtt.captions.append(caption)
    # print(webvtt)
    msg += saveVTT(video_to_encode, webvtt)
    inference_end = timer() - inference_start
    msg += '\nInference took %0.3fs.' % inference_end
    # print(msg)
    return msg
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer', required=False,
                        help='Path to the external scorer file')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--beam_width', type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha', type=float,
                        help='Language model weight (lm_alpha). If not specified, use default from the scorer package.')
    parser.add_argument('--lm_beta', type=float,
                        help='Word insertion bonus (lm_beta). If not specified, use default from the scorer package.')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    parser.add_argument('--json', required=False, action='store_true',
                        help='Output json from metadata with timestamp of each word')
    parser.add_argument('--candidate_transcripts', type=int, default=3,
                        help='Number of candidate transcripts to include in JSON output')
    args = parser.parse_args()

    # print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(args.model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    # print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.beam_width:
        ds.setBeamWidth(args.beam_width)

    desired_sample_rate = ds.sampleRate()

    if args.scorer:
        # print('Loading scorer from files {}'.format(args.scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(args.scorer)
        scorer_load_end = timer() - scorer_load_start
        # print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    fin = wave.open(args.audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        # print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs_orig, desired_sample_rate), file=sys.stderr)
        fs_new, audio = convert_samplerate(args.audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1/fs_orig)
    fin.close()

    # print('Running inference.', file=sys.stderr)
    inference_start = timer()
    # sphinx-doc: python_ref_inference_start
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
    elif args.json:
        print(metadata_json_output(ds.sttWithMetadata(audio, args.candidate_transcripts)))
    else:
        print("Translation: " + ds.stt(audio))
    # sphinx-doc: python_ref_inference_stop
    inference_end = timer() - inference_start
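# metadata_to_string() and metadata_json_output(), used by several of the
# clients above, follow the helpers in Mozilla's reference client.py for the
# 0.7+ metadata API (CandidateTranscript objects carrying `tokens`); they are
# reproduced here in that style. Note that some snippets pass a single
# transcript rather than the full Metadata object, so signatures vary
# slightly between projects.
import json


def metadata_to_string(metadata):
    return ''.join(token.text for token in metadata.tokens)


def words_from_candidate_transcript(metadata):
    word = ""
    word_list = []
    word_start_time = 0
    # Loop through each character token.
    for i, token in enumerate(metadata.tokens):
        if token.text != " ":
            word = word + token.text
        if token.text == " " or i == len(metadata.tokens) - 1:
            # Word boundary (space or end of transcript): flush the word.
            word_duration = max(token.start_time - word_start_time, 0)
            word_list.append({
                "word": word,
                "start_time": round(word_start_time, 4),
                "duration": round(word_duration, 4),
            })
            word = ""
            word_start_time = 0
        else:
            if len(word) == 1:
                word_start_time = token.start_time
    return word_list


def metadata_json_output(metadata):
    return json.dumps({
        "transcripts": [{
            "confidence": transcript.confidence,
            "words": words_from_candidate_transcript(transcript),
        } for transcript in metadata.transcripts]
    }, indent=2)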
window['lbl_deep'].update("Ascolto...")  # Italian: "Listening..."
window.refresh()
rec = sd.rec(int(duration_of_recording * sample_rate),
             dtype="int16", samplerate=sample_rate, channels=1)
sd.wait()
write('DS/out.wav', sample_rate, rec)

window['lbl_deep'].update("Elaboro...")  # Italian: "Processing..."
window.refresh()

# starts ds recognizer
fin = wave.open("DS/out.wav", 'rb')
fs_orig = fin.getframerate()
if fs_orig != ds.sampleRate():
    print("Your audio has not been correctly recorded. "
          "Please try to fix it and try again! (must be 16000 Hz)")
    exit(1)
audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
stt_text = ds.stt(audio).lower().replace(" ", "")

# check the output
if float(utils.similar(stt_text,
                       random_questions[progressed]["answer"].lower())) >= 0.5:
    progressed += 1
    window['progbar'].update(progressed)
    window['lbl_result'].update("Esatto!", background_color="green")  # Italian: "Correct!"
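# utils.similar() is this project's own helper; a hypothetical stand-in that
# returns a 0..1 similarity ratio between the recognized text and the answer:
from difflib import SequenceMatcher


def similar(a: str, b: str) -> float:
    return SequenceMatcher(None, a, b).ratio()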