def create_model(engine, model, scorer):
    """Instantiate the model and scorer.

    Args:
        engine: "ds" for DeepSpeech, "stt" for Coqui STT
        model: path to the .pbmm model file
        scorer: path to the .scorer file
    """
    try:
        if engine == "ds":
            ds = DModel(model)
        else:
            ds = SModel(model)
    except Exception:
        _logger.error("Invalid model file")
        sys.exit(1)

    try:
        ds.enableExternalScorer(scorer)
    except Exception:
        _logger.warning("Invalid scorer file. Running inference using only the model file")

    return ds
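# A minimal usage sketch for create_model() above. The DModel/SModel aliases
# are assumed to come from the deepspeech and stt (Coqui) packages, and the
# file names are placeholders, not part of the original snippet.
from deepspeech import Model as DModel  # assumed alias
from stt import Model as SModel         # assumed alias (Coqui STT)

ds = create_model("ds",
                  "deepspeech-0.9.3-models.pbmm",
                  "deepspeech-0.9.3-models.scorer")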
def load(model, scorer, verbose=True, beam_width=None,
         lm_alpha=None, lm_beta=None, hot_words=None):
    """Load the model and, optionally, the scorer."""
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    if verbose:
        print('\nLoading model from file {}'.format(model), file=sys.stderr)
        print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if beam_width:
        ds.setBeamWidth(beam_width)

    desired_sample_rate = ds.sampleRate()

    if scorer:
        if verbose:
            print('Loading scorer from file {}'.format(scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(scorer)
        scorer_load_end = timer() - scorer_load_start
        if verbose:
            print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

        if lm_alpha and lm_beta:
            ds.setScorerAlphaBeta(lm_alpha, lm_beta)

    if hot_words:
        if verbose:
            print('Adding hot-words', file=sys.stderr)
        for word_boost in hot_words.split(','):
            word, boost = word_boost.split(':')
            ds.addHotWord(word, float(boost))

    return ds, desired_sample_rate
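# A usage sketch for load(). The model/scorer file names are assumptions;
# hot_words uses the "word:boost[,word:boost...]" format parsed above.
ds, rate = load("deepspeech-0.9.3-models.pbmm",
                "deepspeech-0.9.3-models.scorer",
                beam_width=500,
                hot_words="activate:10.0,stop:5.5")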
def __init__(self):
    # Relies on a module-level `args` namespace produced by argparse.
    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    model_path = os.path.dirname(os.path.abspath(__file__))
    ds = Model(os.path.join(model_path, args.model))
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.beam_width:
        ds.setBeamWidth(args.beam_width)

    self.desired_sample_rate = ds.sampleRate()

    if args.scorer:
        print('Loading scorer from file {}'.format(args.scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(os.path.join(model_path, args.scorer))
        scorer_load_end = timer() - scorer_load_start
        print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)
        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    if args.hot_words:
        print('Adding hot-words', file=sys.stderr)
        for word_boost in args.hot_words.split(','):
            word, boost = word_boost.split(':')
            ds.addHotWord(word, float(boost))

    self.ds = ds
                    help='Path to the .scorer file')
parser.add_argument('--beam_width', type=int, default=500,
                    help='Beam width for the CTC decoder')
parser.add_argument('--port', type=int, default=8008,
                    help='The port number to listen on')
args = parser.parse_args()

# Load in the model
logging.info("Loading model from %s" % args.model)
model = Model(args.model)

# Configure it
model.setBeamWidth(args.beam_width)
if args.scorer:
    logging.info("Loading scorer from %s" % (args.scorer,))
    model.enableExternalScorer(args.scorer)

# Set up the server socket
logging.info("Opening socket on port %d" % (args.port,))
sckt = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sckt.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
sckt.bind(('0.0.0.0', args.port))
sckt.listen(5)

# Do this forever
while True:
    try:
        # Get a connection
        logging.info("Waiting for a connection")
        (conn, addr) = sckt.accept()
        logging.info("Got connection from %s" % (addr,))
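        # (Hedged sketch: the original snippet is truncated here. One
        # plausible continuation: read raw 16-bit little-endian PCM until
        # the client closes its side, transcribe the buffer, and send the
        # text back. The wire format is an assumption, not part of the
        # original; numpy is assumed imported as np, as in other snippets.)
        chunks = []
        while True:
            data = conn.recv(4096)      # raw PCM bytes from the client
            if not data:
                break
            chunks.append(data)
        audio = np.frombuffer(b''.join(chunks), np.int16)
        conn.sendall(model.stt(audio).encode('utf-8'))
        conn.close()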
import os
import shlex
import subprocess
import sys
import wave

import numpy as np
from deepspeech import Model
from tqdm import tqdm

try:
    from shlex import quote
except ImportError:
    from pipes import quote  # Python 2 fallback

model = Model("deepspeech-0.9.3-models.pbmm")
model.enableExternalScorer("deepspeech-0.9.3-models.scorer")
desired_sample_rate = model.sampleRate()

PATH = os.path.join("LJSpeech-1.1", "wavs")
TOTAL_SAMPLES = 100


def convert_samplerate(audio_path, desired_sample_rate):
    sox_cmd = "sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - ".format(
        quote(audio_path), desired_sample_rate)
    try:
        output = subprocess.check_output(shlex.split(sox_cmd),
                                         stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        raise RuntimeError("SoX returned non-zero status: {}".format(e.stderr))
    except OSError as e:
        raise OSError(e.errno,
                      "SoX not found, use {}hz files or install it: {}"
                      .format(desired_sample_rate, e.strerror))
    return desired_sample_rate, np.frombuffer(output, np.int16)
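# A small usage sketch for the helpers above: resample one LJSpeech clip to
# the model's native rate and transcribe it. The clip name is an assumption
# about the LJSpeech-1.1 layout.
fs, audio = convert_samplerate(os.path.join(PATH, "LJ001-0001.wav"),
                               desired_sample_rate)
print(model.stt(audio))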
def app_sst_with_video(model_path: str, lm_path: str, lm_alpha: float,
                       lm_beta: float, beam: int):
    class AudioProcessor(AudioProcessorBase):
        frames_lock: threading.Lock
        frames: deque

        def __init__(self) -> None:
            self.frames_lock = threading.Lock()
            self.frames = deque([])

        async def recv_queued(self, frames: List[av.AudioFrame]) -> av.AudioFrame:
            with self.frames_lock:
                self.frames.extend(frames)

            # Return empty frames to be silent.
            new_frames = []
            for frame in frames:
                input_array = frame.to_ndarray()
                new_frame = av.AudioFrame.from_ndarray(
                    np.zeros(input_array.shape, dtype=input_array.dtype),
                    layout=frame.layout.name,
                )
                new_frame.sample_rate = frame.sample_rate
                new_frames.append(new_frame)

            return new_frames

    webrtc_ctx = webrtc_streamer(
        key="speech-to-text-w-video",
        mode=WebRtcMode.SENDRECV,
        audio_processor_factory=AudioProcessor,
        rtc_configuration={"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]},
        media_stream_constraints={"video": True, "audio": True},
    )

    status_indicator = st.empty()

    if not webrtc_ctx.state.playing:
        return

    status_indicator.write("Loading...")
    text_output = st.empty()
    stream = None

    while True:
        if webrtc_ctx.audio_processor:
            if stream is None:
                from deepspeech import Model

                model = Model(model_path)
                model.enableExternalScorer(lm_path)
                model.setScorerAlphaBeta(lm_alpha, lm_beta)
                model.setBeamWidth(beam)
                stream = model.createStream()
                status_indicator.write("Model loaded.")

            sound_chunk = pydub.AudioSegment.empty()

            audio_frames = []
            with webrtc_ctx.audio_processor.frames_lock:
                while len(webrtc_ctx.audio_processor.frames) > 0:
                    frame = webrtc_ctx.audio_processor.frames.popleft()
                    audio_frames.append(frame)

            if len(audio_frames) == 0:
                time.sleep(0.1)
                status_indicator.write("No frame arrived.")
                continue

            status_indicator.write("Running. Say something!")

            for audio_frame in audio_frames:
                sound = pydub.AudioSegment(
                    data=audio_frame.to_ndarray().tobytes(),
                    sample_width=audio_frame.format.bytes,
                    frame_rate=audio_frame.sample_rate,
                    channels=len(audio_frame.layout.channels),
                )
                sound_chunk += sound

            if len(sound_chunk) > 0:
                sound_chunk = sound_chunk.set_channels(1).set_frame_rate(
                    model.sampleRate())
                buffer = np.array(sound_chunk.get_array_of_samples())
                stream.feedAudioContent(buffer)
                text = stream.intermediateDecode()
                text_output.markdown(f"**Text:** {text}")
        else:
            status_indicator.write("AudioReceiver is not set. Abort.")
            break
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer', required=False,
                        help='Path to the external scorer file')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--beam_width', type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha', type=float,
                        help='Language model weight (lm_alpha). If not specified, use default from the scorer package.')
    parser.add_argument('--lm_beta', type=float,
                        help='Word insertion bonus (lm_beta). If not specified, use default from the scorer package.')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exit')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    parser.add_argument('--json', required=False, action='store_true',
                        help='Output json from metadata with timestamp of each word')
    parser.add_argument('--candidate_transcripts', type=int, default=3,
                        help='Number of candidate transcripts to include in JSON output')
    args = parser.parse_args()

    # print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(args.model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    # print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.beam_width:
        ds.setBeamWidth(args.beam_width)

    desired_sample_rate = ds.sampleRate()

    if args.scorer:
        # print('Loading scorer from file {}'.format(args.scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(args.scorer)
        scorer_load_end = timer() - scorer_load_start
        # print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    fin = wave.open(args.audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        # print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs_orig, desired_sample_rate), file=sys.stderr)
        fs_new, audio = convert_samplerate(args.audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1 / fs_orig)
    fin.close()

    # print('Running inference.', file=sys.stderr)
    inference_start = timer()
    # sphinx-doc: python_ref_inference_start
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
    elif args.json:
        print(metadata_json_output(ds.sttWithMetadata(audio, args.candidate_transcripts)))
    else:
        print("Translation: " + ds.stt(audio))
    # sphinx-doc: python_ref_inference_stop
    inference_end = timer() - inference_start
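# metadata_to_string() is used above but not defined in this snippet. A
# minimal sketch consistent with the DeepSpeech metadata API, where each
# CandidateTranscript holds a sequence of single-character tokens:
def metadata_to_string(transcript):
    return ''.join(token.text for token in transcript.tokens)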
def _get_model():
    ds = Model("data/quran/output_graph.pb")
    ds.enableExternalScorer("data/quran/lm/quran.scorer")
    return ds
class DeepSpeechInput(AudioInput):
    """
    Input from DeepSpeech using the US English language model.
    """
    def __init__(self,
                 notifier,
                 rate=None,
                 wav_dir=None,
                 model=os.path.join(_MODEL_DIR, 'models.pbmm'),
                 scorer=os.path.join(_MODEL_DIR, 'models.scorer')):
        """
        @see AudioInput.__init__()

        :type  rate: int
        :param rate:
            The override for the rate, if not the model's one.
        :type  wav_dir: str
        :param wav_dir:
            Where to save the wave files, if anywhere.
        :type  model: str
        :param model:
            The path to the DeepSpeech model file.
        :type  scorer: str
        :param scorer:
            The path to the DeepSpeech scorer file.
        """
        # If these don't exist then DeepSpeech will segfault when inferring!
        if not os.path.exists(model):
            raise IOError("Not found: %s" % (model,))

        # Load in and configure the model.
        LOG.info("Loading model from %s" % (model,))
        self._model = Model(model)

        if os.path.exists(scorer):
            LOG.info("Loading scorer from %s" % (scorer,))
            self._model.enableExternalScorer(scorer)

        # Handle any rate override
        if rate is None:
            rate = self._model.sampleRate()

        # We can now init the superclass
        super(DeepSpeechInput, self).__init__(notifier,
                                              format=pyaudio.paInt16,
                                              channels=1,
                                              rate=rate,
                                              wav_dir=wav_dir)

        # Where we put the stream context
        self._context = None

    def _feed_raw(self, data):
        """
        @see AudioInput._feed_raw()
        """
        if self._context is None:
            self._context = self._model.createStream()
        audio = numpy.frombuffer(data, numpy.int16)
        self._context.feedAudioContent(audio)

    def _decode(self):
        """
        @see AudioInput._decode()
        """
        if self._context is None:
            # No context means no tokens
            LOG.warning("Had no stream context to close")
            tokens = []
        else:
            # Finish up by finishing the decoding
            words = self._context.finishStream()
            LOG.info("Got: %s" % (words,))
            self._context = None

            # And tokenize
            tokens = [Token(word.strip(), 1.0, True)
                      for word in words.split(' ')
                      if len(word.strip()) > 0]

        return tokens
def main():
    if args.min >= args.max:
        print("Error: min_prio must be smaller than max_prio.")
    else:
        test_file(args.audio, args.hot_words.split(','),
                  args.min, args.max, args.steps)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='DeepSpeech hot-word adjusting.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer', required=True,
                        help='Path to the external scorer file')
    parser.add_argument('--audio', type=str, required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--min', type=float, default=-10.0,
                        help='Minimum boost value.')
    parser.add_argument('--max', type=float, default=10.0,
                        help='Maximum boost value.')
    parser.add_argument('--steps', type=int, default=6,
                        help='Number of tests per hot-word.')
    parser.add_argument('--hot_words', type=str, required=True,
                        help='Hot-words separated by commas.')
    args = parser.parse_args()

    DeepSpeech = Model(args.model)
    DeepSpeech.enableExternalScorer(args.scorer)
    main()
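# test_file() is called above but not defined in this snippet. A hedged
# sketch of what it plausibly does, given the CLI flags: sweep each
# hot-word's boost from min to max in `steps` increments and print the
# transcript at each setting. This is entirely an assumption about the
# original, and in the real script it would sit above the __main__ block.
import wave
import numpy as np

def test_file(audio_path, hot_words, min_boost, max_boost, steps):
    with wave.open(audio_path, 'rb') as fin:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    step = (max_boost - min_boost) / max(steps - 1, 1)
    for word in hot_words:
        for i in range(steps):
            boost = min_boost + i * step
            DeepSpeech.addHotWord(word, boost)    # set the boost under test
            print(f"{word} @ {boost:+.2f}: {DeepSpeech.stt(audio)}")
            DeepSpeech.eraseHotWord(word)         # reset before the next run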
def load_model(model_path, scorer_path):
    model = Model(model_path)
    model.enableExternalScorer(scorer_path)
    return model
def load_model(model_file, scorer_file):
    ds = Model(model_file)
    ds.enableExternalScorer(scorer_file)
    return ds
# Enable logging
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO
)
logger = logging.getLogger("DeepspeechBot")

TOKEN = os.environ.get("TELEGRAM_TOKEN")
BASE_PATH = os.path.abspath(os.path.dirname(sys.argv[0]))
AUDIO_FILE_PATH = BASE_PATH + "/tmp/{}_{}.{}"
MODEL_PATH = os.environ.get("MODEL_PATH", BASE_PATH + "/model/model.pbmm")
SCORER_PATH = os.environ.get("SCORER_PATH", BASE_PATH + "/model/kenlm.scorer")

ds = Model(MODEL_PATH)
ds.enableExternalScorer(SCORER_PATH)


def start(update, context):
    # Reply text translated from Catalan; escapes are required by MarkdownV2.
    update.message.reply_text(
        "Hello\\! I am a bot built to try out the automatic speech recognition "
        "capabilities of Mozilla's DeepSpeech engine with the CommonVoice data\\. "
        "Send me a voice message and I will transcribe it\\. You can find more "
        "information about the Catalan model [here](https://github.com/ccoreilly/deepspeech-catala)\\.",
        parse_mode="MarkdownV2",
    )


def info(update, context):
    update.message.reply_text("Send me a voice message and I will transcribe it")
def load_model(self, model_path, scorer_path):
    ds = Model(model_path)
    self.desired_sample_rate = ds.sampleRate()
    ds.enableExternalScorer(scorer_path)
    self.ds = ds
def load_model(graph_path, scorer):
    ds = Model(graph_path)
    ds.enableExternalScorer(scorer)
    return ds
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer', required=False,
                        help='Path to the external scorer file')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--beam_width', type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha', type=float,
                        help='Language model weight (lm_alpha). If not specified, use default from the scorer package.')
    parser.add_argument('--lm_beta', type=float,
                        help='Word insertion bonus (lm_beta). If not specified, use default from the scorer package.')
    # parser.add_argument('--version', action=VersionAction,
    #                     help='Print version and exit')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    parser.add_argument('--json', required=False, action='store_true',
                        help='Output json from metadata with timestamp of each word')
    parser.add_argument('--candidate_transcripts', type=int, default=3,
                        help='Number of candidate transcripts to include in JSON output')
    parser.add_argument('--hot_words', type=str,
                        help='Hot-words and their boosts.')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(args.model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.beam_width:
        ds.setBeamWidth(args.beam_width)

    desired_sample_rate = ds.sampleRate()

    if args.scorer:
        print('Loading scorer from file {}'.format(args.scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(args.scorer)
        scorer_load_end = timer() - scorer_load_start
        print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    if args.hot_words:
        print('Adding hot-words', file=sys.stderr)
        for word_boost in args.hot_words.split(','):
            word, boost = word_boost.split(':')
            ds.addHotWord(word, float(boost))

    fin = wave.open(args.audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs_orig, desired_sample_rate), file=sys.stderr)
        fs_new, audio = convert_samplerate(args.audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1 / fs_orig)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    # sphinx-doc: python_ref_inference_start
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
    elif args.json:
        print(metadata_json_output(ds.sttWithMetadata(audio, args.candidate_transcripts)))
    else:
        print(ds.stt(audio))

    # sentencefit() is not part of the stock DeepSpeech streaming API; this
    # build extends it to align the audio against a reference sentence
    # (here in te reo Māori).
    test = ds.createStream().sentencefit(audio, "ka arohia katoatia te hāhi me ōna whakapono e te hapū o ōtākou")
    for t in test.tokens:
        print(f"letter: {t.letter}, confidence: {t.confidence}, timestep: {t.timestep}")
    # sphinx-doc: python_ref_inference_stop
    inference_end = timer() - inference_start

    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
def record_voice_and_predict_text(self):
    """Records the speech and predicts its text."""
    # Recording the speech
    stream_file_name = 'AudioFile/speech_stream.wav'
    stream_format = pyaudio.paInt16  # Sampling size and format
    no_of_channels = 1               # Number of audio channels
    sampling_rate = 16000            # Sampling rate in Hertz
    frames_count = 1024              # Number of frames per buffer
    record_seconds = 5

    stream = pyaudio.PyAudio()
    stream_data = stream.open(format=stream_format,
                              channels=no_of_channels,
                              rate=sampling_rate,
                              input=True,
                              frames_per_buffer=frames_count)
    frames = [
        stream_data.read(frames_count)
        for i in range(0, int(sampling_rate / frames_count * record_seconds))
    ]
    stream_data.stop_stream()
    stream_data.close()
    stream.terminate()

    wave_file = wave.open(stream_file_name, 'wb')
    wave_file.setnchannels(no_of_channels)
    wave_file.setsampwidth(stream.get_sample_size(stream_format))
    wave_file.setframerate(sampling_rate)
    wave_file.writeframes(b''.join(frames))
    wave_file.close()

    try:
        self.label_info.setText('Recording completed.')
    except Exception:
        pass

    # Text prediction part
    alpha = 0.75
    beta = 1.85
    beam_width = 500

    # Initialize the model
    speech_model = Model(MODEL_PATH)
    # Set beam width. A larger beam width generates better results at the
    # cost of decoding time.
    speech_model.setBeamWidth(beam_width)
    # Enable the language scorer to improve accuracy
    speech_model.enableExternalScorer(SCORER_PATH)
    # Set hyperparameters alpha and beta of the external scorer:
    #   alpha: language model weight
    #   beta: word insertion weight
    speech_model.setScorerAlphaBeta(alpha, beta)

    # Use scipy to convert the wav file into a numpy array
    _, audio = wav.read(stream_file_name)
    text = speech_model.stt(audio)

    try:
        self.text_pred.setText(text)
    except Exception:
        pass

    show_images(text)
class SpeechToTextEngine:
    """Class to perform speech-to-text transcription and related functionality."""

    FORMAT = pyaudio.paInt16
    SAMPLE_RATE = 16000
    CHANNELS = 1
    BLOCKS_PER_SECOND = 50

    def __init__(self, scorer='deepspeech_model.scorer') -> None:
        """Initialize the DeepSpeech model."""
        self.model = Model(model_path=Path(__file__).parents[2].joinpath(
            'deepspeech_model.pbmm').absolute().as_posix())
        self.model.enableExternalScorer(scorer_path=Path(
            __file__).parents[2].joinpath(scorer).absolute().as_posix())
        self.vad = webrtcvad.Vad(mode=3)
        self.sample_rate = self.SAMPLE_RATE
        self.buffer_queue = queue.Queue()

    def run(self, audio) -> str:
        """Receives the audio, normalizes it and sends it to the model to be
        transcribed. Returns the transcribed audio as a string."""
        normalized_audio = normalize_audio_input(audio)
        audio_streams = BytesIO(normalized_audio)
        with wave.Wave_read(audio_streams) as wav:
            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()), np.int16)
        results = self.model.stt(audio_buffer=audio_streams)
        return results

    def run_with_metadata(self, audio) -> Metadata:
        normalized_audio = normalize_audio_input(audio)
        audio_streams = BytesIO(normalized_audio)
        with wave.Wave_read(audio_streams) as wav:
            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()), np.int16)
        results = self.model.sttWithMetadata(audio_buffer=audio_streams)
        return results

    def add_hot_words(self, data) -> list:
        """Receives data in the form of hot-words and boosts, adds them to the
        language model and returns the list of added hot-words."""
        all_hot_words = []
        try:
            logger.info('----------------------------------------------------')
            for hot_word in data:
                # Change all the characters of the hot-word to lower case
                word = hot_word.lower()
                # Get the numeric value of the boost
                boost = float(data.get(hot_word))
                # Add the hot-word and its boost to the language model
                self.model.addHotWord(hot_word, boost)
                # Log the activity
                logger.info(f"`{word}` hot-word with boost `{boost}` was added.")
                all_hot_words.append(word)
            return all_hot_words
        except RuntimeError:
            return []

    def erase_hot_word(self, hot_words) -> None:
        try:
            for hot_word in hot_words:
                self.model.eraseHotWord(hot_word)
                logger.info(f"`{hot_word}` hot-word is erased.")
            logger.info('----------------------------------------------------')
        except RuntimeError:
            return

    def clear_hot_words(self) -> str:
        try:
            self.model.clearHotWords()
            return "All hot-words were erased."
        except RuntimeError:
            return "No more hot-words are left."

    def deep_stream(self):
        return self.model.createStream()

    def frame_generator(self, audio, sample_rate=16000, frame_duration_ms=30):
        """Takes the desired frame duration in milliseconds, the PCM data,
        and the sample rate. Yields Frames of the requested duration."""
        # audio = np.frombuffer(audio, np.int16)
        n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
        offset = 0
        timestamp = 0.0
        duration = (float(n) / sample_rate) / 2.0
        while offset + n < len(audio):
            yield Frame(audio[offset:offset + n], timestamp, duration)
            timestamp += duration
            offset += n
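# A hedged sketch of how frame_generator() and the webrtcvad instance above
# can work together: feed only voiced 30 ms frames into a DeepSpeech stream.
# It assumes the Frame helper stores the raw PCM slice in a `.bytes`
# attribute (as in the webrtcvad reference example), that `audio` is raw
# 16-bit PCM bytes, and that numpy is imported as np.
def transcribe_voiced(engine: SpeechToTextEngine, audio: bytes) -> str:
    stream = engine.deep_stream()
    for frame in engine.frame_generator(audio, sample_rate=engine.sample_rate):
        # webrtcvad expects the raw bytes of one 10/20/30 ms frame
        if engine.vad.is_speech(frame.bytes, engine.sample_rate):
            stream.feedAudioContent(np.frombuffer(frame.bytes, np.int16))
    return stream.finishStream()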
class SpeechToTextEngine:
    """Class to perform speech-to-text transcription and related functionality."""

    def __init__(self, scorer='deepspeech_model.scorer') -> None:
        """Initialize the DeepSpeech model."""
        self.model = Model(model_path=Path(__file__).parents[2].joinpath(
            'deepspeech_model.pbmm').absolute().as_posix())
        self.model.enableExternalScorer(scorer_path=Path(
            __file__).parents[2].joinpath(scorer).absolute().as_posix())

    def run(self, audio) -> str:
        """Receives the audio, normalizes it and sends it to the model to be
        transcribed. Returns the transcribed audio as a string."""
        normalized_audio = normalize_audio_input(audio)
        audio_streams = BytesIO(normalized_audio)
        with wave.Wave_read(audio_streams) as wav:
            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()), np.int16)
        results = self.model.stt(audio_buffer=audio_streams)
        return results

    def run_with_metadata(self, audio) -> Metadata:
        normalized_audio = normalize_audio_input(audio)
        audio_streams = BytesIO(normalized_audio)
        with wave.Wave_read(audio_streams) as wav:
            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()), np.int16)
        results = self.model.sttWithMetadata(audio_buffer=audio_streams)
        return results

    def add_hot_words(self, data) -> list:
        """Receives data in the form of hot-words and boosts, adds them to the
        language model and returns the list of added hot-words."""
        all_hot_words = []
        try:
            logger.info('----------------------------------------------------')
            for hot_word in data:
                # Change all the characters of the hot-word to lower case
                word = hot_word.lower()
                # Get the numeric value of the boost
                boost = float(data.get(hot_word))
                # Add the hot-word and its boost to the language model
                self.model.addHotWord(hot_word, boost)
                # Log the activity
                logger.info(f"`{word}` hot-word with boost `{boost}` was added.")
                all_hot_words.append(word)
            return all_hot_words
        except RuntimeError:
            return []

    def erase_hot_word(self, hot_words) -> None:
        try:
            for hot_word in hot_words:
                self.model.eraseHotWord(hot_word)
                logger.info(f"`{hot_word}` hot-word is erased.")
            logger.info('----------------------------------------------------')
        except RuntimeError:
            return

    def clear_hot_words(self) -> str:
        try:
            self.model.clearHotWords()
            return "All hot-words were erased."
        except RuntimeError:
            return "No more hot-words are left."

    def sample_rate(self):
        return self.model.sampleRate()
class Brain:
    MODULE_BASE_PATH = 'lucia.tasks.'

    def __init__(self):
        self.model = None
        self.r = sr.Recognizer()
        self.nlp = spacy.load(conf.get_property('spacy')['model'])
        self.espeak = conf.get_property('espeak')

        # Load low-level Duckling model
        self.duckling = Duckling()
        self.duckling.load(languages=conf.get_property('duckling')['languages'])

        # Remember tasks
        self.task_memory = []

    def create_model(self):
        # Create a DeepSpeech model from the configured model path
        self.model = Model(conf.get_property('deepspeech')['model_path'])
        # Enable decoding using an external scorer
        self.model.enableExternalScorer(
            conf.get_property('deepspeech')['scorer_path'])

    def listen(self, debug_mode=False):
        while True:
            with sr.Microphone(sample_rate=conf.get_property(
                    'speech_recognition')['audio_rate']) as source:
                # Listen for a while and adjust the energy threshold for
                # starting and stopping voice recording, to account for
                # ambient noise
                self.r.adjust_for_ambient_noise(
                    source,
                    duration=conf.get_property(
                        'speech_recognition')['energy_threshold'])
                self.r.dynamic_energy_threshold = True

                if debug_mode is False:
                    print("Say something")
                    audio = self.r.listen(source)

                    # Speech to text
                    audio = np.frombuffer(audio.frame_data, np.int16)
                    text = self.model.stt(audio)
                    self.speak(text)
                else:
                    text = input()

                # Wake up on hearing the wake word
                # if any(subtext in text for subtext in conf.get_property('wake_words')):
                #     self.understand(text)
                self.understand(text)

    def speak(self, text):
        subprocess.call('espeak-ng -v {}+{}{} "{}"'.format(
            self.espeak['language'], self.espeak['gender'],
            self.espeak['pitch'], text), shell=True)

    def understand(self, sentence):
        # Break the paragraph into sentences
        tokenized_sentence = sent_tokenize(sentence)
        # Break each sentence into words
        for sent in tokenized_sentence:
            tokenized_word = word_tokenize(sent)
            # Tag corpora with the universal POS tagset. For the tag list,
            # read https://www.nltk.org/book/ch05.html#tab-universal-tagset
            pos_tags = nltk.pos_tag(tokenized_word, tagset='universal')
            # Divide the sentence into noun phrases with a regular expression
            grammar = 'NOUN: {<DET>?<ADJ>*<NOUN>}'
            cp = nltk.RegexpParser(grammar)
            # Find the chunk structure
            cs = cp.parse(pos_tags)
            # B-{tag} beginning, I-{tag} inside, O-{tag} outside
            iob_tags = np.asarray(tree2conlltags(cs)).tolist()
            # Recognize named entities
            doc = self.nlp(sent)
            # Parse words into numerals, ordinals, and times
            parse = lambda ne: dict([
                [_['dim'], _['value']['value']]
                for _ in self.duckling.parse(
                    ne, dim_filter=conf.get_property('duckling')['dimensions'])
            ])
            # [Word, character positions and entity type].
            # For all entity types, read
            # https://spacy.io/api/annotation#named-entities
            ne = list([ent.text, ent.start_char, ent.end_char, ent.label_,
                       parse(ent.text)] for ent in doc.ents)
            ne_tags = [_.ent_type_ for _ in doc]
            # Merge IOB tags and named entity tags
            tagged_sent = [
                list(np.append(iob_tags[i], ne_tags[i]))
                for i in range(len(iob_tags))
            ]
            tagged_sent = ''.join(str(x) for x in tagged_sent)
            self.decide(tagged_sent, ne)

    def think(self, pattern, tagged_sent):
        # Match the tagged sentence against combinations of POS tags and
        # words in any order: (?=.*\bword\b)(?=.*\bADJ\bNOUN\b).*
        r = re.compile(
            '(?=.*\\b' +
            pattern.replace(' ', '\\b.*\\b').replace(' ', '\\b)(?=.*\\b') +
            '\\b).*')
        return r.search(tagged_sent)

    def decide(self, tagged_sent, named_entity):
        for task in conf.get_property('tasks'):
            for pattern in conf.get_property('tasks')[task]:
                # If the sentence matches any pattern, dynamically create the class
                if self.think(pattern, tagged_sent):
                    # Split the module name and class name at the last dot
                    module = importlib.import_module(self.MODULE_BASE_PATH +
                                                     task.rsplit('.', 1)[0])
                    instance = getattr(module, task.rsplit('.', 1)[1])()
                    print(instance)
                    # Search whether task_memory contains the same class instance
                    _run = False
                    for mem in self.task_memory:
                        if type(instance) == type(mem):
                            mem.run(self, tagged_sent, named_entity)
                            _run = True
                            break
                    if not _run:
                        # If it doesn't exist, store the new class instance
                        # in task_memory
                        self.task_memory = [
                            instance.run(self, tagged_sent, named_entity)
                        ] + self.task_memory
                    break
    # (Tail of the loop that cuts each annotation's clip out of the audio.)
    clip = converted_audio[annotation['start']:annotation['end']]
    clip.export(annotation['clip'], format='wav')

converted_audio_file.close()
f.write('\nAudio cut successfully.\n')

print("PROGRESS: 0.7 Starting STT with DeepSpeech", flush=True)
temp_dir = tempfile.TemporaryDirectory()

# Model path has to be taken from ELAN
ds = Model(params['model'])
if params['language_model']:
    ds.enableExternalScorer(params['language_model'])
f.write("\n\nLoaded DeepSpeech model.\n\n")

for annotation in annotations:
    fin = wave.open(annotation['clip'].name, 'rb')
    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    annotation['value'] = ds.stt(audio)

# Then open 'output_tier' for writing, and return all of the new phoneme
# strings produced by Persephone as the contents of <span> elements (see
# below).
print("PROGRESS: 0.95 Preparing output tier", flush=True)
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer', required=False,
                        help='Path to the external scorer file')
    parser.add_argument('--prediction_in', required=True,
                        help='Path to the directory with sound files (mp3/ogg/wav) to analyze')
    parser.add_argument('--prediction_out', required=True,
                        help='Path to the directory for moving the processed sound files to')
    parser.add_argument('--prediction_tmp', required=False,
                        help='Path to the temp directory for storing the predictions initially before moving them to "--prediction_out"')
    parser.add_argument('--continuous', action='store_true', required=False, default=False,
                        help='Whether to continuously load test images and perform prediction')
    parser.add_argument('--delete_input', action='store_true', required=False, default=False,
                        help='Whether to delete the input files rather than move them to "--prediction_out" directory')
    parser.add_argument('--beam_width', type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha', type=float,
                        help='Language model weight (lm_alpha). If not specified, use default from the scorer package.')
    parser.add_argument('--lm_beta', type=float,
                        help='Word insertion bonus (lm_beta). If not specified, use default from the scorer package.')
    parser.add_argument('--json', required=False, action='store_true',
                        help='Output json from metadata with timestamp of each word')
    parser.add_argument('--candidate_transcripts', type=int, default=3,
                        help='Number of candidate transcripts to include in JSON output')
    parser.add_argument('--normalize', required=False, action='store_true',
                        help='Whether to apply standard amplitude normalization')
    parsed = parser.parse_args()

    print('Loading model from file {}'.format(parsed.model))
    ds = Model(parsed.model)

    if parsed.beam_width:
        ds.setBeamWidth(parsed.beam_width)

    if parsed.scorer:
        print('Loading scorer from file {}'.format(parsed.scorer))
        ds.enableExternalScorer(parsed.scorer)
        if parsed.lm_alpha and parsed.lm_beta:
            ds.setScorerAlphaBeta(parsed.lm_alpha, parsed.lm_beta)

    process(model=ds,
            prediction_in=parsed.prediction_in,
            prediction_out=parsed.prediction_out,
            prediction_tmp=parsed.prediction_tmp,
            continuous=parsed.continuous,
            delete_input=parsed.delete_input,
            json=parsed.json,
            candidate_transcripts=parsed.candidate_transcripts,
            normalize=parsed.normalize)
def app_sst(model_path: str, lm_path: str, lm_alpha: float,
            lm_beta: float, beam: int):
    webrtc_ctx = webrtc_streamer(
        key="speech-to-text",
        mode=WebRtcMode.SENDONLY,
        audio_receiver_size=1024,
        rtc_configuration={"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]},
        media_stream_constraints={"video": False, "audio": True},
    )

    status_indicator = st.empty()

    if not webrtc_ctx.state.playing:
        return

    status_indicator.write("Loading...")
    text_output = st.empty()
    stream = None

    while True:
        if webrtc_ctx.audio_receiver:
            if stream is None:
                from deepspeech import Model

                model = Model(model_path)
                model.enableExternalScorer(lm_path)
                model.setScorerAlphaBeta(lm_alpha, lm_beta)
                model.setBeamWidth(beam)
                stream = model.createStream()
                status_indicator.write("Model loaded.")

            sound_chunk = pydub.AudioSegment.empty()
            try:
                audio_frames = webrtc_ctx.audio_receiver.get_frames(timeout=1)
            except queue.Empty:
                time.sleep(0.1)
                status_indicator.write("No frame arrived.")
                continue

            status_indicator.write("Running. Say something!")

            for audio_frame in audio_frames:
                sound = pydub.AudioSegment(
                    data=audio_frame.to_ndarray().tobytes(),
                    sample_width=audio_frame.format.bytes,
                    frame_rate=audio_frame.sample_rate,
                    channels=len(audio_frame.layout.channels),
                )
                sound_chunk += sound

            if len(sound_chunk) > 0:
                sound_chunk = sound_chunk.set_channels(1).set_frame_rate(
                    model.sampleRate())
                buffer = np.array(sound_chunk.get_array_of_samples())
                stream.feedAudioContent(buffer)
                text = stream.intermediateDecode()
                text_output.markdown(f"**Text:** {text}")
        else:
            status_indicator.write("AudioReceiver is not set. Abort.")
            break
from deepspeech import Model
import gradio as gr
import numpy as np

model_file_path = "deepspeech-0.8.2-models.pbmm"
lm_file_path = "deepspeech-0.8.2-models.scorer"
beam_width = 100
lm_alpha = 0.93
lm_beta = 1.18

model = Model(model_file_path)
model.enableExternalScorer(lm_file_path)
model.setScorerAlphaBeta(lm_alpha, lm_beta)
model.setBeamWidth(beam_width)


def reformat_freq(sr, y):
    if sr not in (
        48000,
        16000,
    ):  # DeepSpeech only supports 16k (we convert 48k -> 16k)
        raise ValueError("Unsupported rate", sr)
    if sr == 48000:
        y = (((y / max(np.max(y), 1)) * 32767)
             .reshape((-1, 3))
             .mean(axis=1)
             .astype("int16"))
        sr = 16000
    return sr, y


def transcribe(speech, stream):
    _, y = reformat_freq(*speech)
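    # (Hedged completion: the original transcribe() body is truncated here.
    # The lines below follow the common Gradio streaming pattern for
    # DeepSpeech and are an assumption, not the original code.)
    if stream is None:
        stream = model.createStream()   # lazily open a feed-only stream
    stream.feedAudioContent(y)          # push this chunk of 16 kHz PCM
    return stream.intermediateDecode(), stream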
class SocketIOInput(InputChannel):
    """A socket.io input channel."""

    @classmethod
    def name(cls):
        return "socketio"

    @classmethod
    def from_credentials(cls, credentials):
        credentials = credentials or {}
        return cls(
            credentials.get("user_message_evt", "user_uttered"),
            credentials.get("bot_message_evt", "bot_uttered"),
            credentials.get("namespace"),
            credentials.get("session_persistence", False),
            credentials.get("socketio_path", "/socket.io"),
        )

    def __init__(self,
                 user_message_evt: Text = "user_uttered",
                 bot_message_evt: Text = "bot_uttered",
                 namespace: Optional[Text] = None,
                 session_persistence: bool = False,
                 socketio_path: Optional[Text] = '/socket.io'):
        self.bot_message_evt = bot_message_evt
        self.session_persistence = session_persistence
        self.user_message_evt = user_message_evt
        self.namespace = namespace
        self.socketio_path = socketio_path
        self.speech_to_text_model = Model('stt/deepspeech-0.9.1-models.pbmm')
        self.speech_to_text_model.enableExternalScorer(
            'stt/deepspeech-0.9.1-models.scorer')

    def blueprint(self, on_new_message):
        sio = AsyncServer(async_mode="sanic", logger=True,
                          cors_allowed_origins='*')
        socketio_webhook = SocketBlueprint(sio, self.socketio_path,
                                           "socketio_webhook", __name__)

        @socketio_webhook.route("/", methods=['GET'])
        async def health(request):
            return response.json({"status": "ok"})

        @sio.on('connect')
        async def connect(sid, environ):
            print("User {} connected to socketIO endpoint.".format(sid))

        @sio.on('disconnect')
        async def disconnect(sid):
            print("User {} disconnected from socketIO endpoint.".format(sid))

        @sio.on('session_request')
        async def session_request(sid, data):
            print('Session request received')
            if data is None:
                data = {}
            if 'session_id' not in data or data['session_id'] is None:
                data['session_id'] = uuid.uuid4().hex
            await sio.emit("session_confirm", data['session_id'], room=sid)
            print("User {} connected to socketIO endpoint.".format(sid))

        @sio.on('user_uttered')
        async def handle_message(sid, data):
            print('User uttered')
            output_channel = SocketIOOutput(sio, sid, self.bot_message_evt,
                                            data['message'])

            if data['message'] == "/get_started":
                message = data['message']
            else:
                # Receive the audio
                received_file = 'output_' + sid + '.wav'
                request.urlretrieve(data['message'], received_file)

                # fs, audio = wav.read("output_{0}.wav".format(sid))
                input_audio_file = wave.open("output_{0}.wav".format(sid), 'rb')
                converted_audio_to_bytes = numpy.frombuffer(
                    input_audio_file.readframes(input_audio_file.getnframes()),
                    numpy.int16)
                input_audio_file.close()

                message = self.speech_to_text_model.stt(converted_audio_to_bytes)
                insert_user_message_to_database(message, sid)

            await sio.emit(self.user_message_evt, {"text": message}, room=sid)
            message_rasa = UserMessage(message, output_channel, sid,
                                       input_channel=self.name())
            await on_new_message(message_rasa)

        return socketio_webhook
devs = [a.get_device_info_by_index(i) for i in range(a.get_device_count())]
for i, dev in enumerate(devs):
    if "HD Pro Webcam" in dev["name"]:
        device_index = i
        print("connecting to:\n{}\n".format(dev))
        break

# Load the DeepSpeech model and scorer
model_path = os.path.join(os.getcwd(), "models")
pb = glob.glob(model_path + "/*.pbmm")[0]
scorer = glob.glob(model_path + "/*.scorer")[0]

ds = Model(pb)
ds.enableExternalScorer(scorer)
model = ds
desired_sample_rate = ds.sampleRate()

print("TEST_MODE: {}".format(TEST_MODE))
if not TEST_MODE:
    # Connect to the MQTT server
    client = mqtt.Client(client_id="", clean_session=True, userdata=None,
                         transport="tcp")
    client.on_connect = on_connect
    client.connect("127.0.0.1", 1883, 60)
    # Start another thread to react to incoming messages
def create_app(args):
    logging.basicConfig(level=logging.DEBUG)
    sys.stdout = LoggerWriter(logging.debug)
    sys.stderr = LoggerWriter(logging.warning)

    if not args.offline:
        from app.init import boot
        boot()

    from app.language import languages
    app = Flask(__name__)

    project_directory = args.project_directory
    if not os.path.exists(project_directory):
        os.makedirs(project_directory)

    # For faster access
    language_map = {}
    for l in languages:
        language_map[l.code] = l.name

    if args.debug:
        app.config['TEMPLATES_AUTO_RELOAD'] = True

    app.config['MAX_CONTENT_LENGTH'] = 64 * 1024 * 1024

    # Map user-defined frontend languages to Argos language objects.
    if args.frontend_language_source == "auto":
        frontend_argos_language_source = type('obj', (object,), {
            'code': 'auto',
            'name': 'Auto Detect'
        })
    else:
        frontend_argos_language_source = next(
            iter([l for l in languages
                  if l.code == args.frontend_language_source]), None)

    frontend_argos_language_target = next(
        iter([l for l in languages
              if l.code == args.frontend_language_target]), None)

    # Raise AttributeError to prevent app startup if user input is not valid.
    if frontend_argos_language_source is None:
        raise AttributeError(
            f"{args.frontend_language_source} as frontend source language is not supported.")
    if frontend_argos_language_target is None:
        raise AttributeError(
            f"{args.frontend_language_target} as frontend target language is not supported.")

    if args.req_limit > 0 or args.api_keys:
        from flask_limiter import Limiter
        limiter = Limiter(app,
                          key_func=get_remote_address,
                          default_limits=get_routes_limits(
                              args.req_limit,
                              Database() if args.api_keys else None))

    model_load_start = timer()
    ds = Model(os.path.join(home_dir, "models", "deepspeech-0.9.3-models.pbmm"))
    ds.enableExternalScorer(
        os.path.join(home_dir, "models", "deepspeech-0.9.3-models.scorer"))
    model_load_end = timer() - model_load_start
    logging.info('Loaded model in {:.3}s.'.format(model_load_end))

    desired_sample_rate = ds.sampleRate()
    logging.info('Model optimized for a sample rate of ' + str(desired_sample_rate))

    uuid4hex = re.compile(
        r'[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\Z', re.I)

    @app.errorhandler(400)
    def invalid_api(e):
        return jsonify({"error": str(e.description)}), 400

    @app.errorhandler(500)
    def server_error(e):
        return jsonify({"error": str(e.description)}), 500

    @app.errorhandler(429)
    def slow_down_error(e):
        return jsonify({"error": "Slowdown: " + str(e.description)}), 429

    @app.route("/")
    @limiter.exempt
    def index():
        return render_template('index.html',
                               gaId=args.ga_id,
                               frontendTimeout=args.frontend_timeout,
                               offline=args.offline,
                               api_keys=args.api_keys,
                               web_version=os.environ.get('LT_WEB') is not None)

    @app.route("/projects")
    @limiter.exempt
    def projects():
        return render_template('projects.html',
                               gaId=args.ga_id,
                               frontendTimeout=args.frontend_timeout,
                               offline=args.offline,
                               api_keys=args.api_keys,
                               projects=loadAllProjects(),
                               web_version=os.environ.get('LT_WEB') is not None)

    @app.route("/project/<id>")
    @limiter.exempt
    def project(id):
        if not uuid4hex.match(id):
            logging.error("Invalid project id")
            return redirect("/projects")
        return render_template('project.html',
                               gaId=args.ga_id,
                               frontendTimeout=args.frontend_timeout,
                               offline=args.offline,
                               api_keys=args.api_keys,
                               project=loadProjectDetails(id),
                               web_version=os.environ.get('LT_WEB') is not None)

    @app.route("/project/<id>/delete")
    @limiter.exempt
    def projectDelete(id):
        delete_project(id)
        return redirect("/projects")

    @app.route("/project/<id>/transcription")
    @limiter.exempt
    def projectTranscribe(id):
        if not uuid4hex.match(id):
flash("Invalid project id") return redirect("/projects") logging.info("Starting the transcription job for project ID " + id) cmd = [ sys.executable, os.path.join(home_dir, 'scripts', 'batch.py', "--target-dir", os.path.join(project_directory, id)) ] subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) return redirect("/project/" + id) @app.route("/project/<id>/download/<file>") def download(id, file): # todo validate the file part metadata = loadProjectDetails(id) if metadata is None: logging.info("Unable to find metdata for project ID: " + id) return redirect("/projects") return send_from_directory(directory=metadata['project_dir'], filename=file, as_attachment=True) @app.route("/create-project") @limiter.exempt def createProject(): return render_template('create-project.html', gaId=args.ga_id, frontendTimeout=args.frontend_timeout, offline=args.offline, api_keys=args.api_keys, web_version=os.environ.get('LT_WEB') is not None) def allowed_file(filename): return '.' in filename and filename.rsplit( '.', 1)[1].lower() in ALLOWED_EXTENSIONS @app.route('/new-project-upload', methods=['GET', 'POST']) def uploadProject(): if request.method == 'POST': # check if the post request has the file part if 'file' not in request.files: return redirect(request.url) file = request.files['file'] # if user does not select file, browser also # submit an empty part without filename if file.filename == '': return redirect(request.url) if file and allowed_file(file.filename): project_id = str(uuid.uuid4()) if not os.path.exists( os.path.join(project_directory, project_id)): os.makedirs(os.path.join(project_directory, project_id)) fileending = file.filename.rsplit('.', 1)[1].lower() file.save( os.path.join(project_directory, project_id, "rawMedia." + fileending)) # TODO store original file name metadata = createMetadata(project_id, request.form['name'], fileending) with open( os.path.join(project_directory, project_id, "metadata.json"), 'w') as f: json.dump(metadata, f) return redirect("./project/" + project_id) @timeit def createMetadata(project_id, name, ending): metadata = {"name": name, "fileEnding": ending} in_filename = os.path.join(project_directory, project_id, "rawMedia." 
        probe = ffmpeg.probe(in_filename)
        video_stream = next((stream for stream in probe['streams']
                             if stream['codec_type'] == 'video'), None)
        logging.debug(str(video_stream))
        metadata['width'] = int(video_stream['width'])
        metadata['height'] = int(video_stream['height'])
        metadata['durationSeconds'] = float(video_stream['duration'])
        (ffmpeg
         .input(in_filename, ss=3)
         .filter('scale', 512, -1)
         .output(os.path.join(project_directory, project_id, "thumbnail.png"),
                 vframes=1)
         .run())
        return metadata

    def delete_project(project_id):
        logging.info("Deleting a project with ID: " + project_id)
        # TODO make sure the ID is a valid ID and not just some bad path
        shutil.rmtree(os.path.join(project_directory, project_id))

    @app.route("/languages", methods=['GET', 'POST'])
    @limiter.exempt
    def langs():
        """
        Retrieve list of supported languages
        ---
        tags:
          - translate
        responses:
          200:
            description: List of languages
            schema:
              id: languages
              type: array
              items:
                type: object
                properties:
                  code:
                    type: string
                    description: Language code
                  name:
                    type: string
                    description: Human-readable language name (in English)
          429:
            description: Slow down
            schema:
              id: error-slow-down
              type: object
              properties:
                error:
                  type: string
                  description: Reason for slow down
        """
        return jsonify([{'code': l.code, 'name': l.name} for l in languages])

    # Add CORS headers
    @app.after_request
    def after_request(response):
        response.headers.add('Access-Control-Allow-Origin', '*')
        response.headers.add('Access-Control-Allow-Headers',
                             "Authorization, Content-Type")
        response.headers.add('Access-Control-Expose-Headers', "Authorization")
        response.headers.add('Access-Control-Allow-Methods', "GET, POST")
        response.headers.add('Access-Control-Allow-Credentials', "true")
        response.headers.add('Access-Control-Max-Age', 60 * 60 * 24 * 20)
        return response

    @app.route("/project", methods=['GET'])
    def list_projects():
        """
        List available projects
        ---
        tags:
          - list
        """
        return jsonify({"projects": loadAllProjects()})

    def loadAllProjects():
        output = []
        for project_id in os.listdir(project_directory):
            project_details = loadProjectDetails(project_id)
            if project_details is not None:
                output.append(project_details)
        return output

    def loadProjectDetails(project_id):
        metadata_path = os.path.join(project_directory, project_id,
                                     "metadata.json")
        if not os.path.exists(metadata_path):
            return None
        metadata = json.loads(Path(metadata_path).read_text())
        metadata["id"] = project_id
        metadata['project_dir'] = os.path.join(project_directory, project_id)
        # TODO rely on this data for everything
        metadata['subtitles'] = []
        for file in os.listdir(metadata['project_dir']):
            if file.endswith(".srt"):
                metadata['subtitles'].append(file)
        if os.path.exists(os.path.join(project_directory, "subtitles.zip")):
            metadata['subtitles'].insert(0, 'subtitles.zip')
        metadata['inputVideo'] = "rawMedia." + metadata['fileEnding']
        metadata['audio'] = "audio.wav"
        return metadata

    @app.route("/translate", methods=['POST'])
    def translate():
        """
        Translate text from a language to another
        ---
        tags:
          - translate
        parameters:
          - in: formData
            name: q
            schema:
              oneOf:
                - type: string
                  example: Hello world!
                - type: array
                  example: ['Hello world!']
            required: true
            description: Text(s) to translate
          - in: formData
            name: source
            schema:
              type: string
              example: en
            required: true
            description: Source language code
          - in: formData
            name: target
            schema:
              type: string
              example: es
            required: true
            description: Target language code
          - in: formData
            name: api_key
            schema:
              type: string
              example: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
            required: false
            description: API key
        responses:
          200:
            description: Translated text
            schema:
              id: translate
              type: object
              properties:
                translatedText:
                  oneOf:
                    - type: string
                    - type: array
                  description: Translated text(s)
          400:
            description: Invalid request
            schema:
              id: error-response
              type: object
              properties:
                error:
                  type: string
                  description: Error message
          500:
            description: Translation error
            schema:
              id: error-response
              type: object
              properties:
                error:
                  type: string
                  description: Error message
          429:
            description: Slow down
            schema:
              id: error-slow-down
              type: object
              properties:
                error:
                  type: string
                  description: Reason for slow down
        """
        if request.is_json:
            json = request.get_json()
            q = json.get('q')
            source_lang = json.get('source')
            target_lang = json.get('target')
        else:
            q = request.values.get("q")
            source_lang = request.values.get("source")
            target_lang = request.values.get("target")

        if not q:
            abort(400, description="Invalid request: missing q parameter")
        if not source_lang:
            abort(400, description="Invalid request: missing source parameter")
        if not target_lang:
            abort(400, description="Invalid request: missing target parameter")

        batch = isinstance(q, list)

        if batch and args.batch_limit != -1:
            batch_size = len(q)
            if args.batch_limit < batch_size:
                abort(400,
                      description="Invalid request: Request (%d) exceeds text limit (%d)"
                      % (batch_size, args.batch_limit))

        if args.char_limit != -1:
            if batch:
                chars = sum([len(text) for text in q])
            else:
                chars = len(q)

            if args.char_limit < chars:
                abort(400,
                      description="Invalid request: Request (%d) exceeds character limit (%d)"
                      % (chars, args.char_limit))

        if source_lang == 'auto':
            candidate_langs = list(
                filter(lambda l: l.lang in language_map, detect_langs(q)))

            if len(candidate_langs) > 0:
                candidate_langs.sort(key=lambda l: l.prob, reverse=True)

                if args.debug:
                    print(candidate_langs)

                source_lang = next(
                    iter([l.code for l in languages
                          if l.code == candidate_langs[0].lang]), None)
                if not source_lang:
                    source_lang = 'en'
            else:
                source_lang = 'en'

            if args.debug:
                print("Auto detected: %s" % source_lang)

        src_lang = next(iter([l for l in languages if l.code == source_lang]), None)
        tgt_lang = next(iter([l for l in languages if l.code == target_lang]), None)

        if src_lang is None:
            abort(400, description="%s is not supported" % source_lang)
        if tgt_lang is None:
            abort(400, description="%s is not supported" % target_lang)

        translator = src_lang.get_translation(tgt_lang)

        try:
            if batch:
                return jsonify({
                    "translatedText": [translator.translate(text) for text in q]
                })
            else:
                return jsonify({"translatedText": translator.translate(q)})
        except Exception as e:
            abort(500, description="Cannot translate text: %s" % str(e))

    @app.route("/detect", methods=['POST'])
    def detect():
        """
        Detect the language of a single text
        ---
        tags:
          - translate
        parameters:
          - in: formData
            name: q
            schema:
              type: string
              example: Hello world!
            required: true
            description: Text to detect
          - in: formData
            name: api_key
            schema:
              type: string
              example: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
            required: false
            description: API key
        responses:
          200:
            description: Detections
            schema:
              id: detections
              type: array
              items:
                type: object
                properties:
                  confidence:
                    type: number
                    format: float
                    minimum: 0
                    maximum: 1
                    description: Confidence value
                    example: 0.6
                  language:
                    type: string
                    description: Language code
                    example: en
          400:
            description: Invalid request
            schema:
              id: error-response
              type: object
              properties:
                error:
                  type: string
                  description: Error message
          500:
            description: Detection error
            schema:
              id: error-response
              type: object
              properties:
                error:
                  type: string
                  description: Error message
          429:
            description: Slow down
            schema:
              id: error-slow-down
              type: object
              properties:
                error:
                  type: string
                  description: Reason for slow down
        """
        if request.is_json:
            json = request.get_json()
            q = json.get('q')
        else:
            q = request.values.get("q")

        if not q:
            abort(400, description="Invalid request: missing q parameter")

        candidate_langs = list(
            filter(lambda l: l.lang in language_map, detect_langs(q)))
        candidate_langs.sort(key=lambda l: l.prob, reverse=True)
        return jsonify([{
            'confidence': l.prob,
            'language': l.lang
        } for l in candidate_langs])

    @app.route("/frontend/settings")
    @limiter.exempt
    def frontend_settings():
        """
        Retrieve frontend specific settings
        ---
        tags:
          - frontend
        responses:
          200:
            description: frontend settings
            schema:
              id: frontend-settings
              type: object
              properties:
                charLimit:
                  type: integer
                  description: Character input limit for this language (-1 indicates no limit)
                frontendTimeout:
                  type: integer
                  description: Frontend translation timeout
                language:
                  type: object
                  properties:
                    source:
                      type: object
                      properties:
                        code:
                          type: string
                          description: Language code
                        name:
                          type: string
                          description: Human-readable language name (in English)
                    target:
                      type: object
                      properties:
                        code:
                          type: string
                          description: Language code
                        name:
                          type: string
                          description: Human-readable language name (in English)
        """
        return jsonify({
            'charLimit': args.char_limit,
            'frontendTimeout': args.frontend_timeout,
            'language': {
                'source': {
                    'code': frontend_argos_language_source.code,
                    'name': frontend_argos_language_source.name
                },
                'target': {
                    'code': frontend_argos_language_target.code,
                    'name': frontend_argos_language_target.name
                }
            }
        })

    swag = swagger(app)
    swag['info']['version'] = "1.2"
    swag['info']['title'] = "LibreTranslate"

    @app.route("/spec")
    @limiter.exempt
    def spec():
        return jsonify(swag)

    SWAGGER_URL = '/docs'  # URL for exposing Swagger UI (without trailing '/')
    API_URL = '/spec'

    # Call factory function to create our blueprint
    swaggerui_blueprint = get_swaggerui_blueprint(SWAGGER_URL, API_URL)
    app.register_blueprint(swaggerui_blueprint)

    return app
print(" Recording complete.") audio_data = (np.frombuffer(b''.join(frames), dtype=np.int16) / 32767) bg_data = (np.frombuffer(b''.join(frames_bg), dtype=np.int16) / 32767) # denoised_data = removeNoise(audio_data, bg_data)#.astype('float32') return audio_data #denoised_data #######Deepspeech Voice-To-Text Parameters######## DS_FOLDER = 'deepspeech_data' if not os.path.exists(DS_FOLDER): os.mkdir(DS_FOLDER) DS_model_file_path = 'deepspeech_data/deepspeech-0.7.4-models.pbmm' beam_width = 500 DS_model = Model(DS_model_file_path) DS_model.setBeamWidth(beam_width) DS_model.enableExternalScorer('deepspeech_data/deepspeech-0.7.4-models.scorer') def get_text(data, model=DS_model): """ Transcribe text from audio. data: audio data as in array read from librosa with sampling rate 16000. model: Deepspeech ASR model. """ # y , s = librosa.load(fpath, sr=16000) y = (data * 32767).astype('int16') text = model.stt(y) return text
def main():
    global line_count
    print("AutoSub v0.1\n")

    parser = argparse.ArgumentParser(description="AutoSub v0.1")
    parser.add_argument('--model', required=True,
                        help='DeepSpeech model file')
    parser.add_argument('--scorer',
                        help='DeepSpeech scorer file')
    parser.add_argument('--file', required=True,
                        help='Input video file')
    args = parser.parse_args()

    ds_model = args.model
    if not ds_model.endswith(".pbmm"):
        print("Invalid model file. Exiting\n")
        exit(1)

    # Load DeepSpeech model
    ds = Model(ds_model)

    if args.scorer:
        ds_scorer = args.scorer
        if not ds_scorer.endswith(".scorer"):
            print("Invalid scorer file. Running inference using only model file\n")
        else:
            ds.enableExternalScorer(ds_scorer)

    input_file = args.file
    print("\nInput file:", input_file)

    base_directory = os.getcwd()
    output_directory = os.path.join(base_directory, "output")
    audio_directory = os.path.join(base_directory, "audio")
    video_file_name = input_file.split(os.sep)[-1].split(".")[0]
    audio_file_name = os.path.join(audio_directory, video_file_name + ".wav")
    srt_file_name = os.path.join(output_directory, video_file_name + ".srt")

    # Extract audio from input video file
    extract_audio(input_file, audio_file_name)

    print("Splitting on silent parts in audio file")
    silenceRemoval(audio_file_name)

    # Output SRT file
    file_handle = open(srt_file_name, "a+")

    print("\nRunning inference:")
    for file in tqdm(sort_alphanumeric(os.listdir(audio_directory))):
        audio_segment_path = os.path.join(audio_directory, file)
        # Don't run inference on the original audio file
        if audio_segment_path.split(os.sep)[-1] != audio_file_name.split(os.sep)[-1]:
            ds_process_audio(ds, audio_segment_path, file_handle)

    print("\nSRT file saved to", srt_file_name)
    file_handle.close()

    # Clean the audio/ directory
    shutil.rmtree(audio_directory)
    os.mkdir(audio_directory)
class DeepSpeech():
    def __init__(self, model_path, scorer_path, result_json_path,
                 result_txt_path, candidate_transcripts=3, beam_width=None):
        # Path to the Speech-To-Text model
        self.MODEL_PATH = model_path
        # Path to the scorer language model
        self.SCORER_PATH = scorer_path
        # The number of candidate transcripts to produce
        self.CANDIDATE_TRANSCRIPTS = candidate_transcripts
        self.result_json_path = result_json_path
        self.result_txt_path = result_txt_path
        self.beam_width = beam_width
        self._setup()

    def _setup(self):
        self.ds = Model(self.MODEL_PATH)  # Declare the model object
        # Desired sample rate for the STT model.
        self.sample_rate = '16000'
        if self.beam_width:
            self.ds.setBeamWidth(self.beam_width)
        if self.SCORER_PATH:
            self.ds.enableExternalScorer(self.SCORER_PATH)

    def convert_samplerate(self, audio_path, desired_sample_rate):
        sox_cmd = ('sox {} --type raw --bits 16 --channels 1 --rate {}'
                   ' --encoding signed-integer --endian little'
                   ' --compression 0.0 --no-dither - ').format(
                       quote(audio_path), desired_sample_rate)
        try:
            output = subprocess.check_output(shlex.split(sox_cmd),
                                             stderr=subprocess.PIPE)
        except subprocess.CalledProcessError as e:
            raise RuntimeError('SoX returned non-zero status: {}'.format(e.stderr))
        except OSError as e:
            raise OSError(e.errno,
                          'SoX not found, use {}hz files or install it: {}'
                          .format(desired_sample_rate, e.strerror))
        return desired_sample_rate, np.frombuffer(output, np.int16)

    def words_from_candidate_transcript(self, metadata):
        word = ""
        word_list = []
        word_start_time = 0
        # Loop through each character
        for i, token in enumerate(metadata.tokens):
            # Append the character to the word if it's not a space
            if token.text != " ":
                if len(word) == 0:
                    # Log the start time of the new word
                    word_start_time = token.start_time
                word = word + token.text
            # A word boundary is either a space or the last character in the array
            if token.text == " " or i == len(metadata.tokens) - 1:
                word_duration = token.start_time - word_start_time
                if word_duration < 0:
                    word_duration = 0
                each_word = dict()
                each_word["word"] = word
                each_word["start_time"] = round(word_start_time, 4)
                each_word["duration"] = round(word_duration, 4)
                word_list.append(each_word)
                # Reset
                word = ""
                word_start_time = 0
        return word_list

    def metadata_json_output(self, metadata):
        json_result = dict()
        json_result["transcripts"] = [{
            "confidence": transcript.confidence,
            "words": self.words_from_candidate_transcript(transcript),
        } for transcript in metadata.transcripts]
        return json.dumps(json_result, indent=4)

    def take_audio_info(self):
        probe = ffmpeg.probe(self.FILE_PATH)
        self.audio_info = next(
            (stream for stream in probe['streams']
             if stream['codec_type'] == 'audio'), None)
        print(self.audio_info)
        return self.audio_info

    def take_audio(self):
        out, err = (
            ffmpeg
            .input(self.FILE_PATH)
            .output('-', format='s16le', acodec='pcm_s16le', ac=1,
                    ar=self.sample_rate)
            .run(capture_stdout=True, capture_stderr=True)
        )
        self.audio = np.frombuffer(out, np.int16)
        return self.audio

    def speech2text(self):
        metadata = self.ds.sttWithMetadata(self.audio,
                                           self.CANDIDATE_TRANSCRIPTS)
        json_result = self.metadata_json_output(metadata)
        with open(self.result_json_path, 'w') as outfile:
            outfile.write(json_result)
        dict_result = json.loads(json_result)
        word_list = [item["word"]
                     for item in dict_result["transcripts"][0]["words"]]
        sentence = " ".join(word_list)
        self.export2textfile(sentence)
        return sentence

    def export2textfile(self, sentence):
        txt_file = open(self.result_txt_path, "w")
        txt_file.writelines(sentence)
        txt_file.close()

    def set_file(self, filepath):
        self.FILE_PATH = filepath
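# A hedged end-to-end usage sketch for the DeepSpeech wrapper class above;
# the file names are placeholders.
engine = DeepSpeech(model_path="deepspeech-0.9.3-models.pbmm",
                    scorer_path="deepspeech-0.9.3-models.scorer",
                    result_json_path="result.json",
                    result_txt_path="result.txt")
engine.set_file("interview.mp4")   # any ffmpeg-readable media file
engine.take_audio()                # decode to 16 kHz mono int16 PCM
print(engine.speech2text())        # write JSON + txt, return the sentence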