Example #1
class MozillaDeepSpeechEngine(Engine):
    def __init__(self, pbmm_path: str, scorer_path: str):
        self._model = Model(pbmm_path)
        self._model.enableExternalScorer(scorer_path)
        self._audio_sec = 0.
        self._proc_sec = 0.

    def transcribe(self, path: str) -> str:
        audio, sample_rate = soundfile.read(path, dtype='int16')
        assert sample_rate == self._model.sampleRate()
        self._audio_sec += audio.size / sample_rate

        start_sec = time.time()
        res = self._model.stt(audio)
        self._proc_sec += time.time() - start_sec

        return res

    def rtf(self) -> float:
        return self._proc_sec / self._audio_sec

    def delete(self) -> None:
        pass

    def __str__(self) -> str:
        return 'Mozilla DeepSpeech'
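A minimal driver for the engine above; the model, scorer and WAV paths are placeholders:

engine = MozillaDeepSpeechEngine('deepspeech-0.9.3-models.pbmm',
                                 'deepspeech-0.9.3-models.scorer')
for wav in ['sample1.wav', 'sample2.wav']:  # hypothetical 16 kHz test files
    print(engine.transcribe(wav))
print('Real-time factor: {:.2f}'.format(engine.rtf()))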
Example #2
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--lm', nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('--trie', nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--beam_width', type=int, default=500,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha', type=float, default=0.75,
                        help='Language model weight (lm_alpha)')
    parser.add_argument('--lm_beta', type=float, default=1.85,
                        help='Word insertion bonus (lm_beta)')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exit')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    parser.add_argument('--json', required=False, action='store_true',
                        help='Output json from metadata with timestamp of each word')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, args.beam_width)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    desired_sample_rate = ds.sampleRate()

    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.lm, args.trie, args.lm_alpha, args.lm_beta)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    fin = wave.open(args.audio, 'rb')
    fs = fin.getframerate()
    if fs != desired_sample_rate:
        print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs, desired_sample_rate), file=sys.stderr)
        fs, audio = convert_samplerate(args.audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1/fs)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio)))
    elif args.json:
        print(metadata_json_output(ds.sttWithMetadata(audio)))
    else:
        print(ds.stt(audio))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
Example #3
def client(audio_file, lang="uk"):
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    model = "./uk.tflite"

    ds = Model(model)
    # ds.enableExternalScorer("kenlm.scorer")
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    desired_sample_rate = ds.sampleRate()

    fin = wave.open(audio_file, 'rb')
    fs_orig = fin.getframerate()
    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1 / fs_orig)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    # sphinx-doc: python_ref_inference_start

    result = ds.stt(audio)
    print(result)
    # sphinx-doc: python_ref_inference_stop
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length),
          file=sys.stderr)
    return result
Example #4
def transcribe(audio_path):
    ds = Model(model_path="deepspeech-0.7.0-models.pbmm")

    desired_sample_rate = ds.sampleRate()
    print(desired_sample_rate)
    ds.enableExternalScorer("deepspeech-0.7.0-models.scorer")

    fin = wave.open(audio_path, 'rb')
    fs_orig = fin.getframerate()

    if fs_orig != desired_sample_rate:
        print("Converting from {}hz to {}hz" % (fs_orig, desired_sample_rate))
        fs_new, audio = convert_samplerate(audio_path, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    audio_length = fin.getnframes() * (1 / fs_orig)
    fin.close()

    inference_start = timer()
    transcript = ds.sttWithMetadata(audio, 1).transcripts[0]
    json_result = metadata_json_output(transcript)
    string_result = metadata_to_string(transcript)

    inference_end = timer() - inference_start
    print(json_result)
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length),
          file=sys.stderr)

    return json_result, string_result
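A quick usage sketch for the function above; the audio path is a placeholder:

json_result, text = transcribe('audio/example.wav')  # hypothetical 16 kHz WAV
print(text)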
Example #5
class DeepSpeechRecognizer:

    def __init__(self):

        self.file_path = Path(__file__).parent

        self.model = Model('/Users/shihangyu/Scripts/python/stt_server/model/deepspeech-0.6.1-models/output_graph.pbmm',
                           aBeamWidth=500)

        self.desired_sample_rate = self.model.sampleRate()

        self.logger = getLogger(self.__module__)

        self.tmp_path = self.file_path / 'tmp.wav'

    def __convert_samplerate(self, audio_path):
        sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - '.format(
            quote(audio_path), self.desired_sample_rate)
        try:
            output = subprocess.check_output(shlex.split(sox_cmd), stderr=subprocess.PIPE)
        except subprocess.CalledProcessError as e:
            raise RuntimeError('SoX returned non-zero status: {}'.format(e.stderr))
        except OSError as e:
            raise OSError(e.errno,
                          'SoX not found, use {}hz files or install it: {}'.format(self.desired_sample_rate,
                                                                                   e.strerror))

        return self.desired_sample_rate, np.frombuffer(output, np.int16)

    def inference(self, audio_path):

        try:
            fin = wave.open(audio_path, 'rb')
        except Exception:
            # Not a readable WAV: resample to 16 kHz via librosa and retry
            # from a temporary file.
            x, _ = librosa.load(str(audio_path), sr=16000)
            sf.write(str(self.tmp_path), x, 16000)
            fin = wave.open(str(self.tmp_path), 'rb')

        fs = fin.getframerate()

        if fs != self.desired_sample_rate:
            self.logger.warning(f'Original sample rate ({fs}) is different than {self.desired_sample_rate}hz. '
                                f'Resampling might produce erratic speech recognition.')
            fs, audio = self.__convert_samplerate(audio_path)
        else:
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

        fin.close()

        output = self.model.stt(audio)

        self.logger.debug(f"DeepSpeechRecognizer inference output: {output}")

        return output
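The recognizer above can then be driven like this (the input path is a placeholder):

recognizer = DeepSpeechRecognizer()
print(recognizer.inference('recording.mp3'))  # non-WAV input falls back to librosa resampling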
Example #6
def transcribe(args, filepath="", verbose=0):

    if verbose > 0:
        print('Loading model from file {}'.format(args.model), file=sys.stderr)
        model_load_start = timer()

    ds = Model(args.model, args.beam_width)
    if verbose > 0:
        model_load_end = timer() - model_load_start
        print('Loaded model in {:.3}s.'.format(
            model_load_end), file=sys.stderr)

    desired_sample_rate = ds.sampleRate()
    if args.lm and args.trie:
        if verbose > 0:
            print('Loading language model from files {} {}'.format(
                args.lm, args.trie), file=sys.stderr)
            lm_load_start = timer()
        ds.enableDecoderWithLM(args.lm, args.trie, args.lm_alpha, args.lm_beta)
        if verbose > 0:
            lm_load_end = timer() - lm_load_start
            print('Loaded language model in {:.3}s.'.format(
                lm_load_end), file=sys.stderr)

    fin = wave.open(filepath, 'rb')
    fs = fin.getframerate()
    if fs != desired_sample_rate:
        if verbose > 0:
            print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(
                fs, desired_sample_rate), file=sys.stderr)
        fs, audio = convert_samplerate(filepath, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1/fs)
    fin.close()

    if verbose > 0:
        print('Running inference.', file=sys.stderr)
        inference_start = timer()
    audio_metadata = ds.sttWithMetadata(audio)
    if verbose > 0:
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' %
              (inference_end, audio_length), file=sys.stderr)

    dict_result = dict()
    dict_result["sentence"] = "".join(
        item.character for item in audio_metadata.items)
    dict_result["words"] = words_from_metadata(audio_metadata)
    dict_result["characters"] = audio_metadata
    dict_result["confidence"] = audio_metadata.confidence

    return dict_result
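Assuming an args namespace carrying model, beam_width, lm, trie, lm_alpha and lm_beta, the returned dict can be consumed like this:

result = transcribe(args, filepath='audio/example.wav', verbose=1)
print(result['sentence'], result['confidence'])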
Example #7
def load_transcribe_model():
    model_load_start = timer()
    global ds
    ds = Model(os.path.join(home_dir, "models",
                            "deepspeech-0.9.3-models.pbmm"))
    ds.enableExternalScorer(
        os.path.join(home_dir, "models", "deepspeech-0.9.3-models.scorer"))
    model_load_end = timer() - model_load_start
    logging.info('Loaded model in {:.3}s.'.format(model_load_end))
    global desired_sample_rate
    desired_sample_rate = ds.sampleRate()
    logging.info('Model optimized for a sample rate of ' +
                 str(desired_sample_rate))
Example #8
def load_model(models, lm, trie):

    BEAM_WIDTH = 500

    LM_ALPHA = 0.75

    LM_BETA = 1.85

    ds = Model(models, BEAM_WIDTH)

    ds.enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA)

    sample_rate = ds.sampleRate()

    return [ds, sample_rate]
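The helper returns the model and its sample rate as a two-element list, so call sites unpack it (paths are placeholders):

ds, sample_rate = load_model('output_graph.pbmm', 'lm.binary', 'trie')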
Example #9
def speech_to_text(input_file,
                   file_length,
                   return_speed_per_chunk=False,
                   chunk_size=10):
    """
    Compute the words pronounced in the input_file
    :param input_file: sound file path
    :param file_length: time length of the input file (in seconds)
    :param return_speed_per_chunk: if True, the function return a list of words per chunk, if false it returns all the words in the extract
    :return: words as string
    """
    # setup the model
    if return_speed_per_chunk:
        result = []
    else:
        result = ""
    recognizer = Model("models/deepspeech-0.8.2-models.pbmm")
    recognizer.setBeamWidth(2000)
    recognizer.enableExternalScorer("models/deepspeech-0.8.2-models.scorer")
    desired_sample_rate = recognizer.sampleRate()
    # convert input file into smaller audio chunks (apparently works better)
    CHUNK_SIZE = chunk_size
    n_chunks = int(file_length // CHUNK_SIZE)
    for i in range(n_chunks):
        tfm = sox.Transformer()
        tfm.trim(i * CHUNK_SIZE, (i + 1) * CHUNK_SIZE)
        tfm.set_output_format(channels=1)
        tfm.build(input_file, "temp_folder/chunked_file{}.wav".format(i))
        #cmb = sox.Combiner()
        input_list = [
            "audio-files/silence.wav",
            "temp_folder/chunked_file{}.wav".format(i),
            "audio-files/silence.wav"
        ]
        input_list_correct_sample_rate = list(
            map(lambda file: convert_samplerate(file, desired_sample_rate)[1],
                input_list))
        audio = np.concatenate(input_list_correct_sample_rate)
        #cmb.build(input_list, "temp_folder/chunked_file_with_silence{}.wav".format(i), combine_type="concatenate")
        #fs, audio = convert_samplerate("temp_folder/chunked_file_with_silence{}.wav".format(i), desired_sample_rate)
        if return_speed_per_chunk:
            result.append(recognizer.stt(audio))
        else:
            result += recognizer.stt(audio)
        os.remove("temp_folder/chunked_file{}.wav".format(i))
        #os.remove("temp_folder/chunked_file_with_silence{}.wav".format(i))
    print(result)
    return result
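A sketch of how the chunked transcription above might be invoked; the path and duration are placeholders:

words = speech_to_text('audio-files/talk.wav', file_length=60)
chunks = speech_to_text('audio-files/talk.wav', 60, return_speed_per_chunk=True)  # one entry per 10 s chunk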
Example #10
def main():

    audio_files = glob.glob("uploads/*.wav")
    speech_model = "deepspeech-0.9.3-models.pbmm"
    speech_scorer = "deepspeech-0.9.3-models.scorer"
    speech_audio = audio_files[0]
    print('Loading model from file', file=sys.stderr)
    model_load_start = timer()
    ds = Model(speech_model)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    desired_sample_rate = ds.sampleRate()

    print('Loading scorer from files', file=sys.stderr)
    scorer_load_start = timer()
    ds.enableExternalScorer(speech_scorer)
    scorer_load_end = timer() - scorer_load_start
    print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

    fin = wave.open(speech_audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        print(
            'Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'
            .format(fs_orig, desired_sample_rate),
            file=sys.stderr)
        fs_new, audio = convert_samplerate(speech_audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1 / fs_orig)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()

    audio_transcription = metadata_json_output(ds.sttWithMetadata(audio, 3))

    inference_end = timer() - inference_start

    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length),
          file=sys.stderr)
    print('Candidate Transcripts:', 3)

    return audio_transcription
Example #11
def load_model():

    models = "models/output_graph.tflite"  #.tflite
    lm = "models/lm.binary"  # lm.binary
    trie = "models/trie"  # trie

    BEAM_WIDTH = 500
    LM_ALPHA = 0.75
    LM_BETA = 1.85

    ds = Model(models, BEAM_WIDTH)

    ds.enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA)

    sample_rate = ds.sampleRate()

    return [ds, sample_rate]
Example #12
class DeepLearnModel:
    def __init__(self):
        self.model = None

    def init_app(self, app):
        self.model = Model(str(app.config["DL_MODEL_PATH"]))
        scorer = app.config['DL_SCORER_PATH']
        if scorer:
            self._load_scorer(scorer)

    def _load_scorer(self, scorer):
        logging.info('Loading scorer from files {}'.format(scorer))
        self.model.enableExternalScorer(scorer)

    def infer(self, audio_sample):
        audio = sample_audio(audio_sample, self.model.sampleRate())
        return self.model.stt(audio)
Example #13
def load_model(models, lm, trie):
    BEAM_WIDTH = 500
    LM_ALPHA = 0.75
    LM_BETA = 1.85

    model_load_start = timer()
    ds = Model(models, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    logging.debug("Loaded model in %0.3fs." % (model_load_end))

    lm_load_start = timer()
    ds.enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA)
    lm_load_end = timer() - lm_load_start
    logging.debug('Loaded language model in %0.3fs.' % (lm_load_end))

    sample_rate = ds.sampleRate()
    logging.debug('Loaded model sample rate: %dHz.' % (sample_rate))

    return [ds, model_load_end, lm_load_end, sample_rate]
Example #14
def stt(model_path,
        audio,
        beam_width=None,
        scorer_path=None,
        lm_alpha=None,
        lm_beta=None,
        hot_words=None):
    ds = Model(model_path)

    if beam_width:
        ds.setBeamWidth(beam_width)

    desired_sample_rate = ds.sampleRate()

    if scorer_path:
        ds.enableExternalScorer(scorer_path)

        if lm_alpha and lm_beta:
            ds.setScorerAlphaBeta(lm_alpha, lm_beta)

    # TODO
    # if hot_words:
    #     print('Adding hot-words', file=sys.stderr)
    #     for w in hot_words:
    #         ds.addHotWord(w, 6.2)

    fin = wave.open(audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        print(
            f'ERROR: original sample rate ({fs_orig}) is different than {desired_sample_rate}hz.',
            file=sys.stderr)
        sys.exit(1)

    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    fin.close()

    print('Running inference.', file=sys.stderr)
    res = ds.sttWithMetadata(audio, 1)
    res = postprocess_metadata(res)
    return res
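A hedged call to the function above; the paths and decoder weights are placeholders:

metadata = stt('deepspeech-0.9.3-models.pbmm', 'audio/example.wav',
               beam_width=500,
               scorer_path='deepspeech-0.9.3-models.scorer',
               lm_alpha=0.93, lm_beta=1.18)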
Example #15
def process_input_file(conn, options, out_queue, background=True):
    """Given socket/pipe process audio input and push to out_queue"""
    log.info("Starting recognition on %s", conn)
    model = Model(options.model)
    if options.beam_width:
        model.setBeamWidth(options.beam_width)
    desired_sample_rate = model.sampleRate()
    if desired_sample_rate != defaults.SAMPLE_RATE:
        log.error("Model expects rate of %s", desired_sample_rate)
    # if options.scorer:
    #     model.enableExternalScorer(options.scorer)
    # else:
    log.info("Disabling the built-in scorer")
    model.disableExternalScorer()
    out_queue.put({'partial': False, 'final': False, 'message': ['Connected']})
    if background:
        thread = threading.Thread(target=run_recognition, args=(model, conn, out_queue))
        thread.daemon = background
        thread.start()
    else:
        run_recognition(model, conn, out_queue)
Example #16
def process_input_file(conn, options, out_queue, background=True):
    # TODO: allow socket connections from *clients* to choose
    # the model rather than setting it in the daemon...
    # to be clear, *output* clients, not audio sinks
    log.info("Starting recognition on %s", conn)
    model = Model(options.model)
    if options.beam_width:
        model.setBeamWidth(options.beam_width)
    desired_sample_rate = model.sampleRate()
    if desired_sample_rate != defaults.SAMPLE_RATE:
        log.error("Model expects rate of %s", desired_sample_rate)
    if options.scorer:
        model.enableExternalScorer(options.scorer)
    else:
        log.info("Disabling the scorer")
        model.disableExternalScorer()
    if background:
        t = threading.Thread(target=run_recognition, args=(model, conn, out_queue))
        t.daemon = background
        t.start()
    else:
        run_recognition(model, conn, out_queue)
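Neither example shows run_recognition; a minimal sketch of what it might look like with the DeepSpeech streaming API, assuming numpy as np is imported and conn is socket-like:

def run_recognition(model, conn, out_queue):
    # Hypothetical: pull raw 16-bit PCM off the connection, stream it into
    # the model, and emit partial decodes followed by a final one.
    stream = model.createStream()
    while True:
        data = conn.recv(4096)
        if not data:
            break
        stream.feedAudioContent(np.frombuffer(data, np.int16))
        out_queue.put({'partial': True, 'final': False,
                       'message': [stream.intermediateDecode()]})
    out_queue.put({'partial': False, 'final': True,
                   'message': [stream.finishStream()]})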
Example #17
def MozillaSTT(audio_path):

    # TODO: handle different rates (not implemented)
    fin = wave.open(audio_path, 'rb')
    output = ""
    ds = Model(model_file_path)
    ds.enableExternalScorer(scorer_file_path)

    lm_alpha = 0.75
    lm_beta = 1.85
    desired_sample_rate = ds.sampleRate()
    ds.setScorerAlphaBeta(lm_alpha, lm_beta)
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        print('Warning: original sample rate ({}) is different than {}hz. '
              'Resampling might produce erratic speech recognition.'.format(
                  fs_orig, desired_sample_rate),
              file=sys.stderr)
        fs_new, audio = convert_samplerate(audio_path, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    # audio_length = fin.getnframes() * (1/fs_orig)
    fin.close()
    print('Running inference.', file=sys.stderr)
    # print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
    # print(metadata_json_output(ds.sttWithMetadata(audio, 3)))
    # print(ds.stt(audio))
    output += ds.stt(audio)
    output += '\n'
    output += metadata_json_output(ds.sttWithMetadata(audio, 3))
    return output
Example #18
def load(model, scorer, verbose=True, beam_width="", lm_alpha="", lm_beta="", hot_words=""):
    """ Load models"""

    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    if verbose:
        print('\nLoading model from file {}'.format(model), file=sys.stderr)
        print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if beam_width:
        ds.setBeamWidth(beam_width)

    desired_sample_rate = ds.sampleRate()

    if scorer:
        if verbose:
            print('Loading scorer from file {}'.format(scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(scorer)
        scorer_load_end = timer() - scorer_load_start
        if verbose:
            print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

        if lm_alpha and lm_beta:
            ds.setScorerAlphaBeta(lm_alpha, lm_beta)

    if hot_words:
        if verbose:
            print('Adding hot-words', file=sys.stderr)
        for word_boost in hot_words.split(','):
            word, boost = word_boost.split(':')
            ds.addHotWord(word, float(boost))
    return ds, desired_sample_rate
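The hot_words argument is a comma-separated word:boost string; an illustrative call (paths and boosts are placeholders):

ds, rate = load('deepspeech-0.9.3-models.pbmm', 'deepspeech-0.9.3-models.scorer',
                hot_words='activate:10.0,lights:7.5')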
Example #19
    def __init__(self):

        print('Loading model from file {}'.format(args.model), file=sys.stderr)
        model_load_start = timer()
        # sphinx-doc: python_ref_model_start
        model_path = os.path.dirname(os.path.abspath(__file__))

        ds = Model(os.path.join(model_path, args.model))
        # sphinx-doc: python_ref_model_stop
        model_load_end = timer() - model_load_start
        print('Loaded model in {:.3}s.'.format(model_load_end),
              file=sys.stderr)

        if args.beam_width:
            ds.setBeamWidth(args.beam_width)

        self.desired_sample_rate = ds.sampleRate()

        if args.scorer:
            print('Loading scorer from files {}'.format(args.scorer),
                  file=sys.stderr)
            scorer_load_start = timer()
            ds.enableExternalScorer(os.path.join(model_path, args.scorer))
            scorer_load_end = timer() - scorer_load_start
            print('Loaded scorer in {:.3}s.'.format(scorer_load_end),
                  file=sys.stderr)

            if args.lm_alpha and args.lm_beta:
                ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

        if args.hot_words:
            print('Adding hot-words', file=sys.stderr)
            for word_boost in args.hot_words.split(','):
                word, boost = word_boost.split(':')
                ds.addHotWord(word, float(boost))
        self.ds = ds
Example #20
# -*- coding: utf-8 -*-
"""
Created on Sat Mar  6 15:41:29 2021

@author: Marshall.McDougall
"""
import wave

import numpy as np

from deepspeech import Model

# CLI equivalent:
# deepspeech --model deepspeech-0.9.3-models.pbmm --scorer deepspeech-0.9.3-models.scorer --audio audio/SimpleTest3.wav --json

ds = Model("deepspeech-0.9.3-models.pbmm")
desired_sample_rate = ds.sampleRate()
ds.enableExternalScorer("deepspeech-0.9.3-models.scorer")

# sttWithMetadata() expects a buffer of 16-bit PCM samples plus the number of
# candidate transcripts, not a file path and a string.
fin = wave.open("audio/SimpleTest3.wav", 'rb')
audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
fin.close()
print(ds.sttWithMetadata(audio, 3))
Example #21
    def create_data(X: dt.Frame = None) -> dt.Frame:
        if X is None:
            return []

        from deepspeech import Model

        try:
            logger = logging.getLogger(__name__)
            hdlr = logging.FileHandler(LOG_FILE)
            formatter = logging.Formatter(
                '%(asctime)s %(levelname)s %(message)s')
            hdlr.setFormatter(formatter)
            logger.addHandler(hdlr)
            logger.setLevel(logging.INFO)
        except Exception:
            # Fall back to no logging if the log handler cannot be set up.
            logger = False

        X = X.to_pandas()
        if WAV_COLNAME in X.columns:
            model = os.path.join(MODEL_PATH, "output_graph.pbmm")
            lm = os.path.join(MODEL_PATH, "lm.binary")
            trie = os.path.join(MODEL_PATH, "trie")

            if logger:
                logger.info('Loading model from file {}'.format(model))
            model_load_start = timer()
            ds = Model(model, beam_width)
            model_load_end = timer() - model_load_start
            if logger:
                logger.info('Loaded model in {:.3}s.'.format(model_load_end))

            desired_sample_rate = ds.sampleRate()

            if logger:
                logger.info('Loading language model from files {} {}'.format(
                    lm, trie))

            lm_load_start = timer()
            ds.enableDecoderWithLM(lm, trie, lm_alpha, lm_beta)
            lm_load_end = timer() - lm_load_start

            if logger:
                logger.info(
                    'Loaded language model in {:.3}s.'.format(lm_load_end))
                logger.info('Running inference.')

            results = []
            ds_len = len(X[WAV_COLNAME])
            for i, audio_fn in enumerate(X[WAV_COLNAME].values.tolist()):
                inference_start = timer()
                audio_length = 0
                fin = wave.open(audio_fn, 'rb')
                fs = fin.getframerate()
                if fs != desired_sample_rate:
                    if logger:
                        err_msg = 'Original sample rate ({}) is different than {}hz. '\
                                  'Resampling might produce erratic speech recognition.'
                        logger.warning(err_msg.format(fs, desired_sample_rate))

                    fs, audio = convert_samplerate(audio_fn,
                                                   desired_sample_rate)
                else:
                    audio = np.frombuffer(fin.readframes(fin.getnframes()),
                                          np.int16)

                if MAX_SEC > 0:
                    audio = audio[:int(fs * MAX_SEC)]

                audio_length = len(audio) * (1 / fs)
                fin.close()

                try:
                    text = ds.stt(audio)
                except Exception as e:
                    text = ''
                    logger.error(e)

                results.append(text)

                inference_end = timer() - inference_start
                if logger:
                    logger.info(
                        'Record {:d} of {:d}. Inference took {:0.3f}s for {:0.3f}s audio file.'
                        .format(i, ds_len, inference_end, audio_length))

            X[WAV_COLNAME + "_txt"] = results

        return dt.Frame(X)
Example #22
def create_app(args):
    logging.basicConfig(level=logging.DEBUG)
    sys.stdout = LoggerWriter(logging.debug)
    sys.stderr = LoggerWriter(logging.warning)
    if not args.offline:
        from app.init import boot
        boot()

    from app.language import languages
    app = Flask(__name__)

    project_directory = args.project_directory
    if not os.path.exists(project_directory):
        os.makedirs(project_directory)

    # For faster access
    language_map = {}
    for l in languages:
        language_map[l.code] = l.name

    if args.debug:
        app.config['TEMPLATES_AUTO_RELOAD'] = True
    app.config['MAX_CONTENT_LENGTH'] = 64 * 1024 * 1024
    # Map userdefined frontend languages to argos language object.
    if args.frontend_language_source == "auto":
        frontend_argos_language_source = type('obj', (object, ), {
            'code': 'auto',
            'name': 'Auto Detect'
        })
    else:
        frontend_argos_language_source = next(
            iter([
                l for l in languages if l.code == args.frontend_language_source
            ]), None)

    frontend_argos_language_target = next(
        iter([l for l in languages
              if l.code == args.frontend_language_target]), None)

    # Raise AttributeError to prevent app startup if user input is not valid.
    if frontend_argos_language_source is None:
        raise AttributeError(
            f"{args.frontend_language_source} as frontend source language is not supported."
        )
    if frontend_argos_language_target is None:
        raise AttributeError(
            f"{args.frontend_language_target} as frontend target language is not supported."
        )

    if args.req_limit > 0 or args.api_keys:
        from flask_limiter import Limiter
        limiter = Limiter(app,
                          key_func=get_remote_address,
                          default_limits=get_routes_limits(
                              args.req_limit,
                              Database() if args.api_keys else None))
    model_load_start = timer()
    ds = Model(os.path.join(home_dir, "models",
                            "deepspeech-0.9.3-models.pbmm"))
    ds.enableExternalScorer(
        os.path.join(home_dir, "models", "deepspeech-0.9.3-models.scorer"))
    model_load_end = timer() - model_load_start
    logging.info('Loaded model in {:.3}s.'.format(model_load_end))
    desired_sample_rate = ds.sampleRate()
    logging.info('Model optimized for a sample rate of ' +
                 str(desired_sample_rate))
    uuid4hex = re.compile(
        r'[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\Z', re.I)

    @app.errorhandler(400)
    def invalid_api(e):
        return jsonify({"error": str(e.description)}), 400

    @app.errorhandler(500)
    def server_error(e):
        return jsonify({"error": str(e.description)}), 500

    @app.errorhandler(429)
    def slow_down_error(e):
        return jsonify({"error": "Slowdown: " + str(e.description)}), 429

    @app.route("/")
    @limiter.exempt
    def index():
        return render_template('index.html',
                               gaId=args.ga_id,
                               frontendTimeout=args.frontend_timeout,
                               offline=args.offline,
                               api_keys=args.api_keys,
                               web_version=os.environ.get('LT_WEB')
                               is not None)

    @app.route("/projects")
    @limiter.exempt
    def projects():
        return render_template('projects.html',
                               gaId=args.ga_id,
                               frontendTimeout=args.frontend_timeout,
                               offline=args.offline,
                               api_keys=args.api_keys,
                               projects=loadAllProjects(),
                               web_version=os.environ.get('LT_WEB')
                               is not None)

    @app.route("/project/<id>")
    @limiter.exempt
    def project(id):
        if not uuid4hex.match(id):
            logging.error("Invalid project id")
            return redirect("/projects")
        return render_template('project.html',
                               gaId=args.ga_id,
                               frontendTimeout=args.frontend_timeout,
                               offline=args.offline,
                               api_keys=args.api_keys,
                               project=loadProjectDetails(id),
                               web_version=os.environ.get('LT_WEB')
                               is not None)

    @app.route("/project/<id>/delete")
    @limiter.exempt
    def projectDelete(id):
        delete_project(id)
        return redirect("/projects")

    @app.route("/project/<id>/transcription")
    @limiter.exempt
    def projectTranscribe(id):
        if not uuid4hex.match(id):
            flash("Invalid project id")
            return redirect("/projects")
        logging.info("Starting the transcription job for project ID " + id)
        cmd = [
            sys.executable,
            os.path.join(home_dir, 'scripts', 'batch.py'),
            "--target-dir",
            os.path.join(project_directory, id)
        ]
        subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        return redirect("/project/" + id)

    @app.route("/project/<id>/download/<file>")
    def download(id, file):
        # todo validate the file part

        metadata = loadProjectDetails(id)
        if metadata is None:
            logging.info("Unable to find metdata for project ID: " + id)
            return redirect("/projects")
        return send_from_directory(directory=metadata['project_dir'],
                                   filename=file,
                                   as_attachment=True)

    @app.route("/create-project")
    @limiter.exempt
    def createProject():
        return render_template('create-project.html',
                               gaId=args.ga_id,
                               frontendTimeout=args.frontend_timeout,
                               offline=args.offline,
                               api_keys=args.api_keys,
                               web_version=os.environ.get('LT_WEB')
                               is not None)

    def allowed_file(filename):
        return '.' in filename and filename.rsplit(
            '.', 1)[1].lower() in ALLOWED_EXTENSIONS

    @app.route('/new-project-upload', methods=['GET', 'POST'])
    def uploadProject():
        if request.method == 'POST':
            # check if the post request has the file part
            if 'file' not in request.files:
                return redirect(request.url)
            file = request.files['file']
            # if user does not select file, browser also
            # submit an empty part without filename
            if file.filename == '':
                return redirect(request.url)
            if file and allowed_file(file.filename):
                project_id = str(uuid.uuid4())
                if not os.path.exists(
                        os.path.join(project_directory, project_id)):
                    os.makedirs(os.path.join(project_directory, project_id))
                fileending = file.filename.rsplit('.', 1)[1].lower()
                file.save(
                    os.path.join(project_directory, project_id,
                                 "rawMedia." + fileending))
                # TODO store original file name
                metadata = createMetadata(project_id, request.form['name'],
                                          fileending)
                with open(
                        os.path.join(project_directory, project_id,
                                     "metadata.json"), 'w') as f:
                    json.dump(metadata, f)

                return redirect("./project/" + project_id)

    @timeit
    def createMetadata(project_id, name, ending):
        metadata = {"name": name, "fileEnding": ending}
        in_filename = os.path.join(project_directory, project_id,
                                   "rawMedia." + ending)
        probe = ffmpeg.probe(in_filename)
        video_stream = next((stream for stream in probe['streams']
                             if stream['codec_type'] == 'video'), None)
        logging.debug(str(video_stream))
        metadata['width'] = int(video_stream['width'])
        metadata['height'] = int(video_stream['height'])
        metadata['durationSeconds'] = float(video_stream['duration'])
        (ffmpeg.input(in_filename,
                      ss=3).filter('scale', 512, -1).output(os.path.join(
                          project_directory, project_id, "thumbnail.png"),
                                                            vframes=1).run())
        return metadata

    def delete_project(project_id):
        logging.info("Deleting a project with ID: " + project_id)
        # TODO make sure the ID is a valid ID and not just some bad path
        shutil.rmtree(os.path.join(project_directory, project_id))

    @app.route("/languages", methods=['GET', 'POST'])
    @limiter.exempt
    def langs():
        """
        Retrieve list of supported languages
        ---
        tags:
          - translate
        responses:
          200:
            description: List of languages
            schema:
              id: languages
              type: array
              items:
                type: object
                properties:
                  code:
                    type: string
                    description: Language code
                  name:
                    type: string
                    description: Human-readable language name (in English)
          429:
            description: Slow down
            schema:
              id: error-slow-down
              type: object
              properties:
                error:
                  type: string
                  description: Reason for slow down
        """
        return jsonify([{'code': l.code, 'name': l.name} for l in languages])

    # Add cors
    @app.after_request
    def after_request(response):
        response.headers.add('Access-Control-Allow-Origin', '*')
        response.headers.add('Access-Control-Allow-Headers',
                             "Authorization, Content-Type")
        response.headers.add('Access-Control-Expose-Headers', "Authorization")
        response.headers.add('Access-Control-Allow-Methods', "GET, POST")
        response.headers.add('Access-Control-Allow-Credentials', "true")
        response.headers.add('Access-Control-Max-Age', 60 * 60 * 24 * 20)
        return response

    @app.route("/project", methods=['GET'])
    def list_projects():
        """
        List available projects
        ---
        tags:
          - list
        """
        return jsonify({"projects": loadAllProjects()})

    def loadAllProjects():
        output = []
        for project_id in os.listdir(project_directory):
            project_details = loadProjectDetails(project_id)
            if project_details is not None:
                output.append(project_details)
        return output

    def loadProjectDetails(project_id):
        metadata_path = os.path.join(project_directory, project_id,
                                     "metadata.json")
        if not os.path.exists(metadata_path):
            return None
        metadata = json.loads(Path(metadata_path).read_text())
        metadata["id"] = project_id
        metadata['project_dir'] = os.path.join(project_directory, project_id)
        # TODO rely on this data for everything
        metadata['subtitles'] = []
        for file in os.listdir(metadata['project_dir']):
            if file.endswith(".srt"):
                metadata['subtitles'].append(file)
        if os.path.exists(os.path.join(project_directory, "subtitles.zip")):
            metadata['subtitles'].insert(0, 'subtitles.zip')
        metadata['inputVideo'] = "rawMedia." + metadata['fileEnding']

        metadata['audio'] = "audio.wav"
        return metadata

    @app.route("/translate", methods=['POST'])
    def translate():
        """
        Translate text from a language to another
        ---
        tags:
          - translate
        parameters:
          - in: formData
            name: q
            schema:
              oneOf:
                - type: string
                  example: Hello world!
                - type: array
                  example: ['Hello world!']
            required: true
            description: Text(s) to translate
          - in: formData
            name: source
            schema:
              type: string
              example: en
            required: true
            description: Source language code
          - in: formData
            name: target
            schema:
              type: string
              example: es
            required: true
            description: Target language code
          - in: formData
            name: api_key
            schema:
              type: string
              example: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
            required: false
            description: API key
        responses:
          200:
            description: Translated text
            schema:
              id: translate
              type: object
              properties:
                translatedText:
                  oneOf:
                    - type: string
                    - type: array
                  description: Translated text(s)
          400:
            description: Invalid request
            schema:
              id: error-response
              type: object
              properties:
                error:
                  type: string
                  description: Error message
          500:
            description: Translation error
            schema:
              id: error-response
              type: object
              properties:
                error:
                  type: string
                  description: Error message
          429:
            description: Slow down
            schema:
              id: error-slow-down
              type: object
              properties:
                error:
                  type: string
                  description: Reason for slow down
        """

        if request.is_json:
            json = request.get_json()
            q = json.get('q')
            source_lang = json.get('source')
            target_lang = json.get('target')
        else:
            q = request.values.get("q")
            source_lang = request.values.get("source")
            target_lang = request.values.get("target")

        if not q:
            abort(400, description="Invalid request: missing q parameter")
        if not source_lang:
            abort(400, description="Invalid request: missing source parameter")
        if not target_lang:
            abort(400, description="Invalid request: missing target parameter")

        batch = isinstance(q, list)

        if batch and args.batch_limit != -1:
            batch_size = len(q)
            if args.batch_limit < batch_size:
                abort(400,
                      description=
                      "Invalid request: Request (%d) exceeds text limit (%d)" %
                      (batch_size, args.batch_limit))

        if args.char_limit != -1:
            if batch:
                chars = sum([len(text) for text in q])
            else:
                chars = len(q)

            if args.char_limit < chars:
                abort(
                    400,
                    description=
                    "Invalid request: Request (%d) exceeds character limit (%d)"
                    % (chars, args.char_limit))

        if source_lang == 'auto':
            candidate_langs = list(
                filter(lambda l: l.lang in language_map, detect_langs(q)))

            if len(candidate_langs) > 0:
                candidate_langs.sort(key=lambda l: l.prob, reverse=True)

                if args.debug:
                    print(candidate_langs)

                source_lang = next(
                    iter([
                        l.code for l in languages
                        if l.code == candidate_langs[0].lang
                    ]), None)
                if not source_lang:
                    source_lang = 'en'
            else:
                source_lang = 'en'

            if args.debug:
                print("Auto detected: %s" % source_lang)

        src_lang = next(iter([l for l in languages if l.code == source_lang]),
                        None)
        tgt_lang = next(iter([l for l in languages if l.code == target_lang]),
                        None)

        if src_lang is None:
            abort(400, description="%s is not supported" % source_lang)
        if tgt_lang is None:
            abort(400, description="%s is not supported" % target_lang)

        translator = src_lang.get_translation(tgt_lang)

        try:
            if batch:
                return jsonify({
                    "translatedText":
                    [translator.translate(text) for text in q]
                })
            else:
                return jsonify({"translatedText": translator.translate(q)})
        except Exception as e:
            abort(500, description="Cannot translate text: %s" % str(e))

    @app.route("/detect", methods=['POST'])
    def detect():
        """
        Detect the language of a single text
        ---
        tags:
          - translate
        parameters:
          - in: formData
            name: q
            schema:
              type: string
              example: Hello world!
            required: true
            description: Text to detect
          - in: formData
            name: api_key
            schema:
              type: string
              example: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
            required: false
            description: API key
        responses:
          200:
            description: Detections
            schema:
              id: detections
              type: array
              items:
                type: object
                properties:
                  confidence:
                    type: number
                    format: float
                    minimum: 0
                    maximum: 1
                    description: Confidence value
                    example: 0.6
                  language:
                    type: string
                    description: Language code
                    example: en
          400:
            description: Invalid request
            schema:
              id: error-response
              type: object
              properties:
                error:
                  type: string
                  description: Error message
          500:
            description: Detection error
            schema:
              id: error-response
              type: object
              properties:
                error:
                  type: string
                  description: Error message
          429:
            description: Slow down
            schema:
              id: error-slow-down
              type: object
              properties:
                error:
                  type: string
                  description: Reason for slow down
        """
        if request.is_json:
            json = request.get_json()
            q = json.get('q')
        else:
            q = request.values.get("q")

        if not q:
            abort(400, description="Invalid request: missing q parameter")

        candidate_langs = list(
            filter(lambda l: l.lang in language_map, detect_langs(q)))
        candidate_langs.sort(key=lambda l: l.prob, reverse=True)
        return jsonify([{
            'confidence': l.prob,
            'language': l.lang
        } for l in candidate_langs])

    @app.route("/frontend/settings")
    @limiter.exempt
    def frontend_settings():
        """
        Retrieve frontend specific settings
        ---
        tags:
          - frontend
        responses:
          200:
            description: frontend settings
            schema:
              id: frontend-settings
              type: object
              properties:
                charLimit:
                  type: integer
                  description: Character input limit for this language (-1 indicates no limit)
                frontendTimeout:
                  type: integer
                  description: Frontend translation timeout
                language:
                  type: object
                  properties:
                    source:
                      type: object
                      properties:
                        code:
                          type: string
                          description: Language code
                        name:
                          type: string
                          description: Human-readable language name (in English)
                    target:
                      type: object
                      properties:
                        code:
                          type: string
                          description: Language code
                        name:
                          type: string
                          description: Human-readable language name (in English)
        """
        return jsonify({
            'charLimit': args.char_limit,
            'frontendTimeout': args.frontend_timeout,
            'language': {
                'source': {
                    'code': frontend_argos_language_source.code,
                    'name': frontend_argos_language_source.name
                },
                'target': {
                    'code': frontend_argos_language_target.code,
                    'name': frontend_argos_language_target.name
                }
            }
        })

    swag = swagger(app)
    swag['info']['version'] = "1.2"
    swag['info']['title'] = "LibreTranslate"

    @app.route("/spec")
    @limiter.exempt
    def spec():
        return jsonify(swag)

    SWAGGER_URL = '/docs'  # URL for exposing Swagger UI (without trailing '/')
    API_URL = '/spec'

    # Call factory function to create our blueprint
    swaggerui_blueprint = get_swaggerui_blueprint(SWAGGER_URL, API_URL)

    app.register_blueprint(swaggerui_blueprint)

    return app
Example #23
class DeepSpeechInput(AudioInput):
    """
    Input from DeepSpeech using the US English language model.
    """
    def __init__(self,
                 notifier,
                 rate=None,
                 wav_dir=None,
                 model=os.path.join(_MODEL_DIR, 'models.pbmm'),
                 scorer=os.path.join(_MODEL_DIR, 'models.scorer')):
        """
        @see AudioInput.__init__()

        :type  rate:
        :param rate:
            The override for the rate, if not the model's one.
        :type  wav_dir:
        :param wav_dir:
            Where to save the wave files, if anywhere.
        :type  model:
        :param model:
            The path to the DeepSpeech model file.
        :type  scorer:
        :param scorer:
            The path to the DeepSpeech scorer file.
        """
        # If these don't exist then DeepSpeech will segfault when inferring!
        if not os.path.exists(model):
            raise IOError("Not found: %s" % (model, ))

        # Load in and configure the model.
        LOG.info("Loading model from %s" % (model, ))
        self._model = Model(model)
        if os.path.exists(scorer):
            LOG.info("Loading scorer from %s" % (scorer, ))
            self._model.enableExternalScorer(scorer)

        # Handle any rate override
        if rate is None:
            rate = self._model.sampleRate()

        # We can now init the superclass
        super(DeepSpeechInput, self).__init__(notifier,
                                              format=pyaudio.paInt16,
                                              channels=1,
                                              rate=rate,
                                              wav_dir=wav_dir)

        # Where we put the stream context
        self._context = None

    def _feed_raw(self, data):
        """
        @see AudioInput._feed_raw()
        """
        if self._context is None:
            self._context = self._model.createStream()
        audio = numpy.frombuffer(data, numpy.int16)
        self._context.feedAudioContent(audio)

    def _decode(self):
        """
        @see AudioInput._decode()
        """
        if self._context is None:
            # No context means no tokens
            LOG.warning("Had no stream context to close")
            tokens = []
        else:
            # Finish up by finishing the decoding
            words = self._context.finishStream()
            LOG.info("Got: %s" % (words, ))
            self._context = None

            # And tokenize
            tokens = [
                Token(word.strip(), 1.0, True) for word in words.split(' ')
                if len(word.strip()) > 0
            ]
        return tokens
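For reference, the class above drives DeepSpeech's streaming API; the equivalent flow on a whole file looks roughly like this (the path is a placeholder; wave and numpy as np assumed imported):

model = Model('models.pbmm')
stream = model.createStream()
with wave.open('speech.wav', 'rb') as wav:  # 16 kHz mono, matching the model
    stream.feedAudioContent(np.frombuffer(wav.readframes(wav.getnframes()), np.int16))
print(stream.finishStream())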
Example #24
def app_sst_with_video(model_path: str, lm_path: str, lm_alpha: float,
                       lm_beta: float, beam: int):
    class AudioProcessor(AudioProcessorBase):
        frames_lock: threading.Lock
        frames: deque

        def __init__(self) -> None:
            self.frames_lock = threading.Lock()
            self.frames = deque([])

        async def recv_queued(self,
                              frames: List[av.AudioFrame]) -> List[av.AudioFrame]:
            with self.frames_lock:
                self.frames.extend(frames)

            # Return empty frames to be silent.
            new_frames = []
            for frame in frames:
                input_array = frame.to_ndarray()
                new_frame = av.AudioFrame.from_ndarray(
                    np.zeros(input_array.shape, dtype=input_array.dtype),
                    layout=frame.layout.name,
                )
                new_frame.sample_rate = frame.sample_rate
                new_frames.append(new_frame)

            return new_frames

    webrtc_ctx = webrtc_streamer(
        key="speech-to-text-w-video",
        mode=WebRtcMode.SENDRECV,
        audio_processor_factory=AudioProcessor,
        rtc_configuration={
            "iceServers": [{
                "urls": ["stun:stun.l.google.com:19302"]
            }]
        },
        media_stream_constraints={
            "video": True,
            "audio": True
        },
    )

    status_indicator = st.empty()

    if not webrtc_ctx.state.playing:
        return

    status_indicator.write("Loading...")
    text_output = st.empty()
    stream = None

    while True:
        if webrtc_ctx.audio_processor:
            if stream is None:
                from deepspeech import Model

                model = Model(model_path)
                model.enableExternalScorer(lm_path)
                model.setScorerAlphaBeta(lm_alpha, lm_beta)
                model.setBeamWidth(beam)

                stream = model.createStream()

                status_indicator.write("Model loaded.")

            sound_chunk = pydub.AudioSegment.empty()

            audio_frames = []
            with webrtc_ctx.audio_processor.frames_lock:
                while len(webrtc_ctx.audio_processor.frames) > 0:
                    frame = webrtc_ctx.audio_processor.frames.popleft()
                    audio_frames.append(frame)

            if len(audio_frames) == 0:
                time.sleep(0.1)
                status_indicator.write("No frame arrived.")
                continue

            status_indicator.write("Running. Say something!")

            for audio_frame in audio_frames:
                sound = pydub.AudioSegment(
                    data=audio_frame.to_ndarray().tobytes(),
                    sample_width=audio_frame.format.bytes,
                    frame_rate=audio_frame.sample_rate,
                    channels=len(audio_frame.layout.channels),
                )
                sound_chunk += sound

            if len(sound_chunk) > 0:
                sound_chunk = sound_chunk.set_channels(1).set_frame_rate(
                    model.sampleRate())
                buffer = np.array(sound_chunk.get_array_of_samples())
                stream.feedAudioContent(buffer)
                text = stream.intermediateDecode()
                text_output.markdown(f"**Text:** {text}")
        else:
            status_indicator.write("AudioReciver is not set. Abort.")
            break
Example #25
import os
import wave
import numpy as np
import sys
import shlex
import subprocess
from deepspeech import Model
from tqdm import tqdm

try:
    from shlex import quote
except ImportError:
    from pipes import quote

model = Model("deepspeech-0.9.3-models.pbmm")
model.enableExternalScorer("deepspeech-0.9.3-models.scorer")
desired_sample_rate = model.sampleRate()
PATH = os.path.join("LJSpeech-1.1", "wavs")
TOTAL_SAMPLES = 100


def convert_samplerate(audio_path, desired_sample_rate):
    sox_cmd = "sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - ".format(
        quote(audio_path), desired_sample_rate)
    try:
        output = subprocess.check_output(shlex.split(sox_cmd),
                                         stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        raise RuntimeError("SoX returned non-zero status: {}".format(e.stderr))
    except OSError as e:
        raise OSError(
            e.errno, "SoX not found, use {}hz files or install it: {}".format(
                desired_sample_rate, e.strerror))

    return desired_sample_rate, np.frombuffer(output, np.int16)
def app_sst(model_path: str, lm_path: str, lm_alpha: float, lm_beta: float,
            beam: int):
    webrtc_ctx = webrtc_streamer(
        key="speech-to-text",
        mode=WebRtcMode.SENDONLY,
        audio_receiver_size=1024,
        rtc_configuration={
            "iceServers": [{
                "urls": ["stun:stun.l.google.com:19302"]
            }]
        },
        media_stream_constraints={
            "video": False,
            "audio": True
        },
    )

    status_indicator = st.empty()

    if not webrtc_ctx.state.playing:
        return

    status_indicator.write("Loading...")
    text_output = st.empty()
    stream = None

    while True:
        if webrtc_ctx.audio_receiver:
            if stream is None:
                from deepspeech import Model

                model = Model(model_path)
                model.enableExternalScorer(lm_path)
                model.setScorerAlphaBeta(lm_alpha, lm_beta)
                model.setBeamWidth(beam)

                stream = model.createStream()

                status_indicator.write("Model loaded.")

            sound_chunk = pydub.AudioSegment.empty()
            try:
                audio_frames = webrtc_ctx.audio_receiver.get_frames(timeout=1)
            except queue.Empty:
                time.sleep(0.1)
                status_indicator.write("No frame arrived.")
                continue

            status_indicator.write("Running. Say something!")

            for audio_frame in audio_frames:
                sound = pydub.AudioSegment(
                    data=audio_frame.to_ndarray().tobytes(),
                    sample_width=audio_frame.format.bytes,
                    frame_rate=audio_frame.sample_rate,
                    channels=len(audio_frame.layout.channels),
                )
                sound_chunk += sound

            if len(sound_chunk) > 0:
                sound_chunk = sound_chunk.set_channels(1).set_frame_rate(
                    model.sampleRate())
                buffer = np.array(sound_chunk.get_array_of_samples())
                stream.feedAudioContent(buffer)
                text = stream.intermediateDecode()
                text_output.markdown(f"**Text:** {text}")
        else:
            status_indicator.write("AudioReciver is not set. Abort.")
            break
Example #27
class SpeechToTextEngine:
    """ Class to perform speech-to-text transcription and related functionality """
    def __init__(self, scorer='deepspeech_model.scorer') -> None:
        """ Initializing the DeepSpeech model """
        self.model = Model(model_path=Path(__file__).parents[2].joinpath(
            'deepspeech_model.pbmm').absolute().as_posix())
        self.model.enableExternalScorer(scorer_path=Path(
            __file__).parents[2].joinpath(scorer).absolute().as_posix())

    def run(self, audio) -> str:
        """ Receives the audio,  normalizes it and is sent to the model to be transcribed. Returns the result of the
        transcribe audio in string format."""

        normalized_audio = normalize_audio_input(audio)
        audio_streams = BytesIO(normalized_audio)
        with wave.Wave_read(audio_streams) as wav:
            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()),
                                          np.int16)
        results = self.model.stt(audio_buffer=audio_streams)
        return results

    def run_with_metadata(self, audio) -> Metadata:
        normalized_audio = normalize_audio_input(audio)
        audio_streams = BytesIO(normalized_audio)
        with wave.Wave_read(audio_streams) as wav:
            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()),
                                          np.int16)
        results = self.model.sttWithMetadata(audio_buffer=audio_streams)
        return results

    def add_hot_words(self, data) -> list:
        """ Receives data in form of hot-words and boosts, adds them to the language model and return the list of the
        added hot-words """

        all_hot_words = []
        try:
            logger.info('----------------------------------------------------')
            for hot_word in data:
                # Normalize the hot-word to lower case
                word = hot_word.lower()

                # Get the numeric value of the boost
                boost = float(data.get(hot_word))

                # Add the normalized word (the lower-cased form is what gets
                # logged and returned, so it is also what the model receives)
                self.model.addHotWord(word, boost)

                # Log the activity
                logger.info(
                    f"`{word}` hot-word with boost `{boost}` was added.")
                all_hot_words.append(word)
            return all_hot_words
        except RuntimeError:
            return []

    def erase_hot_word(self, hot_words) -> None:
        try:
            for hot_word in hot_words:
                self.model.eraseHotWord(hot_word)
                logger.info(f"`{hot_word}` hot-word is erased.")
            logger.info('----------------------------------------------------')
        except RuntimeError:
            return

    def clear_hot_words(self) -> str:
        try:
            self.model.clearHotWords()
            return f"All hot-words were erased."
        except RuntimeError:
            return f"No more hot-words are left."

    def sample_rate(self):
        return self.model.sampleRate()
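
A hypothetical driver for the class above, showing the intended call order (the WAV path and hot-word payload are placeholders; run() expects the raw bytes that the project-local normalize_audio_input helper accepts):

engine = SpeechToTextEngine(scorer='deepspeech_model.scorer')

# Plain transcription of a WAV file read as raw bytes.
with open('sample.wav', 'rb') as f:
    audio_bytes = f.read()
print(engine.run(audio_bytes))

# Bias the decoder toward domain words, then undo the bias.
added = engine.add_hot_words({'deepspeech': 10.0, 'mozilla': 5.0})
print(engine.run(audio_bytes))
engine.erase_hot_word(added)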
Example #28
def main_transcript(video_to_encode):
    msg = ""

    mp3file = video_to_encode.get_video_mp3(
    ).source_file if video_to_encode.get_video_mp3() else None

    lang = video_to_encode.main_lang

    # check if DS_PARAM [lang] exist
    if not DS_PARAM.get(lang):
        msg += "\n no deepspeech model found for lang:%s." % lang
        msg += "Please add it in DS_PARAM."
        return msg

    ds_model = Model(DS_PARAM[lang]['model'], DS_PARAM[lang]['beam_width'])

    if all([
            cond in DS_PARAM[lang]
            for cond in ['alphabet', 'lm', 'trie', 'lm_alpha', 'lm_beta']
    ]):
        ds_model.enableDecoderWithLM(DS_PARAM[lang]['lm'],
                                     DS_PARAM[lang]['trie'],
                                     DS_PARAM[lang]['lm_alpha'],
                                     DS_PARAM[lang]['lm_beta'])

    desired_sample_rate = ds_model.sampleRate()

    webvtt = WebVTT()
    inference_start = timer()
    last_item = None
    sentences = []
    sentence = []
    metadata = None

    for start_trim in range(0, video_to_encode.duration, AUDIO_SPLIT_TIME):

        end_trim = (
            video_to_encode.duration
            if start_trim + AUDIO_SPLIT_TIME > video_to_encode.duration
            else start_trim + AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH)

        duration = (
            AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH
            if start_trim + AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH
            < video_to_encode.duration
            else video_to_encode.duration - start_trim)

        msg += "\nTake audio from %s to %s - %s" % (start_trim, end_trim,
                                                    duration)

        audio = convert_samplerate(mp3file.path, desired_sample_rate,
                                   start_trim, duration)
        msg += '\nRunning inference.'

        metadata = ds_model.sttWithMetadata(audio)

        msg += '\nConfidence: %s' % metadata.confidence

        sentences[:] = []  # empty list
        sentence[:] = []  # empty list

        refItem = metadata.items[0]

        index = get_index(metadata, last_item, start_trim) if last_item else 0

        # number of characters recognized in this AUDIO_SPLIT_TIME window
        msg += "\nMETADATA ITEMS: %d" % len(metadata.items)

        sentences = get_sentences(metadata, refItem, index)

        last_item = (
            sentences[-1][-1].character,
            sentences[-1][-1].start_time) if len(sentences) > 0 else ()

        for sent in sentences:
            if len(sent) > 0:
                start_time = sent[0].start_time + start_trim
                end_time = sent[-1].start_time + start_trim
                str_sentence = ''.join(item.character for item in sent)
                # print(start_time, end_time, str_sentence)
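                # Build "H:MM:SS.mmm" WebVTT timestamps from float seconds.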
                caption = Caption(
                    '%s.%s' %
                    (timedelta(seconds=int(str(start_time).split('.')[0])),
                     str('%.3f' % start_time).split('.')[1]), '%s.%s' %
                    (timedelta(seconds=int(str(end_time).split('.')[0])),
                     str('%.3f' % end_time).split('.')[1]),
                    ['%s' % str_sentence])

                webvtt.captions.append(caption)
    # print(webvtt)
    msg += saveVTT(video_to_encode, webvtt)
    inference_end = timer() - inference_start
    msg += '\nInference took %0.3fs.' % inference_end
    # print(msg)
    return msg
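
The inline timestamp expressions above are dense; a close standalone equivalent with an illustrative name (like the original it truncates the integral seconds, though it truncates rather than rounds the milliseconds):

from datetime import timedelta


def to_vtt_timestamp(seconds: float) -> str:
    # "H:MM:SS" from the integral seconds, plus zero-padded milliseconds.
    whole = int(seconds)
    millis = int((seconds - whole) * 1000)
    return '%s.%03d' % (timedelta(seconds=whole), millis)

# to_vtt_timestamp(83.5) -> '0:01:23.500'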
Example #29
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer', required=False,
                        help='Path to the external scorer file')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--beam_width', type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha', type=float,
                        help='Language model weight (lm_alpha). If not specified, use default from the scorer package.')
    parser.add_argument('--lm_beta', type=float,
                        help='Word insertion bonus (lm_beta). If not specified, use default from the scorer package.')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exit')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    parser.add_argument('--json', required=False, action='store_true',
                        help='Output json from metadata with timestamp of each word')
    parser.add_argument('--candidate_transcripts', type=int, default=3,
                        help='Number of candidate transcripts to include in JSON output')
    args = parser.parse_args()

#     print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(args.model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
#     print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.beam_width:
        ds.setBeamWidth(args.beam_width)

    desired_sample_rate = ds.sampleRate()

    if args.scorer:
#         print('Loading scorer from files {}'.format(args.scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(args.scorer)
        scorer_load_end = timer() - scorer_load_start
#         print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    fin = wave.open(args.audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
#         print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs_orig, desired_sample_rate), file=sys.stderr)
        fs_new, audio = convert_samplerate(args.audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1/fs_orig)
    fin.close()

#     print('Running inference.', file=sys.stderr)
    inference_start = timer()
    # sphinx-doc: python_ref_inference_start
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
    elif args.json:
        print(metadata_json_output(ds.sttWithMetadata(audio, args.candidate_transcripts)))
    else:
        print("Translation: "+ds.stt(audio))
    # sphinx-doc: python_ref_inference_stop
    inference_end = timer() - inference_start
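
metadata_to_string and metadata_json_output are called above but defined elsewhere in the client; for reference, a minimal sketch of the former, assuming the DeepSpeech 0.9 metadata layout (a CandidateTranscript holding per-character tokens):

def metadata_to_string(transcript):
    # Each token carries one character of the decoded text.
    return ''.join(token.text for token in transcript.tokens)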
Example #30
        window['lbl_deep'].update("Ascolto...")
        window.refresh()
        rec = sd.rec(int(duration_of_recording * sample_rate),
                     dtype="int16",
                     samplerate=sample_rate,
                     channels=1)
        sd.wait()
        write('DS/out.wav', sample_rate, rec)

        window['lbl_deep'].update("Elaboro...")
        window.refresh()

        # starts ds recognizer
        fin = wave.open("DS/out.wav", 'rb')
        fs_orig = fin.getframerate()
        if fs_orig != ds.sampleRate():
            print(
                "Your audio was not recorded correctly. Please fix it and "
                "try again (the sample rate must be 16000 Hz)."
            )
            exit(1)
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        stt_text = ds.stt(audio).lower().replace(" ", "")

        # check the output
        if float(
                utils.similar(
                    stt_text,
                    random_questions[progressed]["answer"].lower())) >= 0.5:
            progressed += 1
            window['progbar'].update(progressed)
            window['lbl_result'].update("Esatto!", background_color="green")