Example #1
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--lm', nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('--trie', nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--beam_width', type=int, default=500,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha', type=float, default=0.75,
                        help='Language model weight (lm_alpha)')
    parser.add_argument('--lm_beta', type=float, default=1.85,
                        help='Word insertion bonus (lm_beta)')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    parser.add_argument('--json', required=False, action='store_true',
                        help='Output json from metadata with timestamp of each word')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, args.beam_width)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    desired_sample_rate = ds.sampleRate()

    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.lm, args.trie, args.lm_alpha, args.lm_beta)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    fin = wave.open(args.audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs_orig, desired_sample_rate), file=sys.stderr)
        fs_new, audio = convert_samplerate(args.audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1/fs_orig)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio)))
    elif args.json:
        print(metadata_json_output(ds.sttWithMetadata(audio)))
    else:
        print(ds.stt(audio))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
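This client relies on metadata_to_string, metadata_json_output and convert_samplerate helpers defined elsewhere in the script (a SoX-based convert_samplerate appears in Example #13 below). A minimal sketch of the two metadata helpers, assuming the DeepSpeech 0.6.x metadata layout (Metadata.items, each item exposing .character and .start_time):

import json

def metadata_to_string(metadata):
    # Join the per-character metadata items back into a plain transcript.
    return ''.join(item.character for item in metadata.items)

def metadata_json_output(metadata):
    # Emit the transcript plus per-character timing as pretty-printed JSON.
    return json.dumps({
        "transcript": metadata_to_string(metadata),
        "items": [{"character": item.character,
                   "start_time": item.start_time}
                  for item in metadata.items],
    }, indent=2)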
Example #2
def batching_after_silence(
    audio: np.ndarray,
    silence_threshold: int,
    model: Model,
    verbose: bool = False,
    filters: list = None,
) -> List[Any]:
    """
    Infer after natural gaps of silence

    Ref: http://jamesmontgomery.us/blog/Voice_Recognition_Model.html
    """
    results: list = []
    audio = audio.astype("float32")
    y: np.ndarray = librosa.effects.split(audio, top_db=silence_threshold, ref=np.mean)

    clips: list = []
    for i in tqdm(y):
        clip = audio[i[0] : i[1]]
        clip = clip.astype("int16")

        if filters:
            clip = apply_filters(clip, filters)

        clips.append((clip, filters or ["no filter"]))

    for clip, meta in tqdm(clips):
        transcripts = metadata_to_string(model.sttWithMetadata(clip, 1).transcripts[0])
        if transcripts and verbose:
            print(transcripts, " : ", meta)
        results.append(transcripts)

    return results
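A hypothetical driver for batching_after_silence; the WAV path and silence threshold are placeholders. The function expects sample values in int16 range, so the float output of librosa.load is rescaled before the call:

import librosa
import numpy as np
from deepspeech import Model

model = Model("deepspeech-0.9.3-models.pbmm")
audio, _ = librosa.load("speech.wav", sr=model.sampleRate(), mono=True)
audio = (audio * 32767).astype(np.int16)  # back to int16 range for the model

transcripts = batching_after_silence(audio, silence_threshold=30,
                                     model=model, verbose=True)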
Example #3
def transcribe(audio_path):
    ds = Model(model_path="deepspeech-0.7.0-models.pbmm")

    desired_sample_rate = ds.sampleRate()
    print(desired_sample_rate)
    ds.enableExternalScorer("deepspeech-0.7.0-models.scorer")

    fin = wave.open(audio_path, 'rb')
    fs_orig = fin.getframerate()

    if fs_orig != desired_sample_rate:
        print("Converting from {}hz to {}hz" % (fs_orig, desired_sample_rate))
        fs_new, audio = convert_samplerate(audio_path, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    audio_length = fin.getnframes() * (1 / fs_orig)
    fin.close()

    inference_start = timer()
    transcript = ds.sttWithMetadata(audio, 1).transcripts[0]
    json_result = metadata_json_output(transcript)
    string_result = metadata_to_string(transcript)

    inference_end = timer() - inference_start
    print(json_result)
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length),
          file=sys.stderr)

    return json_result, string_result
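A hypothetical call to the function above; the WAV path is a placeholder and the model and scorer files named in the function must be present in the working directory:

json_result, string_result = transcribe("audio/sample.wav")
print(string_result)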
Example #4
def get_deepspeech_result(data: bytes,
                          model: deepspeech.Model) -> Tuple[str, float]:
    audio_array = bytes_to_array(data)
    start_time = time.time()
    response = model.sttWithMetadata(audio_array)
    end_time = time.time()
    response_time = end_time - start_time
    return metadata_to_string(response), response_time
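The example assumes a bytes_to_array helper. A minimal sketch, assuming data is raw 16-bit little-endian PCM at the model's sample rate:

import numpy as np

def bytes_to_array(data: bytes) -> np.ndarray:
    # Reinterpret raw 16-bit PCM bytes as an int16 sample array.
    return np.frombuffer(data, dtype=np.int16)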
Example #5
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--alphabet', required=True,
                        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument('--lm', nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('--trie', nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    writeFile = open('speechtotext.csv', 'w')
    writer = csv.writer(writeFile)
    writer.writerow(['inputfile', 'inference'])
    for file in glob.glob("{}*.wav".format(args.audio)):

        fin = wave.open(file, 'rb')
        fs = fin.getframerate()
        if fs != SAMPLE_RATE:
            print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs, SAMPLE_RATE), file=sys.stderr)
            fs, audio = convert_samplerate(file)
        else:
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

        audio_length = fin.getnframes() * (1/SAMPLE_RATE)
        fin.close()

        print('Running inference for {}'.format(file), file=sys.stderr)
        inference_start = timer()
        if args.extended:
            print(metadata_to_string(ds.sttWithMetadata(audio, fs)))
        else:
            #print(ds.stt(audio, fs))
            writer.writerow(["{}".format(file),"{}".format(ds.stt(audio, fs))])
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)

    writeFile.close()
Example #6
def transcribe(args, filepath="", verbose=0):

    if verbose > 0:
        print('Loading model from file {}'.format(args.model), file=sys.stderr)
        model_load_start = timer()

    ds = Model(args.model, args.beam_width)
    if verbose > 0:
        model_load_end = timer() - model_load_start
        print('Loaded model in {:.3}s.'.format(
            model_load_end), file=sys.stderr)

    desired_sample_rate = ds.sampleRate()
    if args.lm and args.trie:
        if verbose > 0:
            print('Loading language model from files {} {}'.format(
                args.lm, args.trie), file=sys.stderr)
            lm_load_start = timer()
        ds.enableDecoderWithLM(args.lm, args.trie, args.lm_alpha, args.lm_beta)
        if verbose > 0:
            lm_load_end = timer() - lm_load_start
            print('Loaded language model in {:.3}s.'.format(
                lm_load_end), file=sys.stderr)

    fin = wave.open(filepath, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        if verbose > 0:
            print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(
                fs_orig, desired_sample_rate), file=sys.stderr)
        fs_new, audio = convert_samplerate(filepath, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1/fs_orig)
    fin.close()

    if verbose > 0:
        print('Running inference.', file=sys.stderr)
        inference_start = timer()
    audio_metadata = ds.sttWithMetadata(audio)
    if verbose > 0:
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' %
              (inference_end, audio_length), file=sys.stderr)

    dict_result = dict()
    dict_result["sentence"] = "".join(
        item.character for item in audio_metadata.items)
    dict_result["words"] = words_from_metadata(audio_metadata)
    dict_result["characters"] = audio_metadata
    dict_result["confidence"] = audio_metadata.confidence

    return dict_result
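The words_from_metadata helper used above is not shown. A possible sketch, assuming the 0.6.x metadata layout (per-character items with .character and .start_time), groups characters into words with start times and durations:

def words_from_metadata(metadata):
    words = []
    word = ""
    word_start = 0.0
    for i, item in enumerate(metadata.items):
        if item.character != " ":
            if not word:
                word_start = item.start_time
            word += item.character
        # A space or the final item closes the current word.
        if item.character == " " or i == len(metadata.items) - 1:
            if word:
                words.append({
                    "word": word,
                    "start_time": round(word_start, 4),
                    "duration": round(max(item.start_time - word_start, 0), 4),
                })
            word = ""
    return words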
Example #7
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--alphabet', required=True,
                        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument('--lm', nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('--trie', nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    fin = wave.open(args.audio, 'rb')
    fs = fin.getframerate()
    if fs != 16000:
        print('Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'.format(fs), file=sys.stderr)
        fs, audio = convert_samplerate(args.audio)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1/16000)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, fs)))
    else:
        print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
Example #8
def main():

    audio_files = glob.glob("uploads/*.wav")
    speech_model = "deepspeech-0.9.3-models.pbmm"
    speech_scorer = "deepspeech-0.9.3-models.scorer"
    speech_audio = audio_files[0]
    print('Loading model from file', file=sys.stderr)
    model_load_start = timer()
    ds = Model(speech_model)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    desired_sample_rate = ds.sampleRate()

    print('Loading scorer from files', file=sys.stderr)
    scorer_load_start = timer()
    ds.enableExternalScorer(speech_scorer)
    scorer_load_end = timer() - scorer_load_start
    print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

    fin = wave.open(speech_audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        print(
            'Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'
            .format(fs_orig, desired_sample_rate),
            file=sys.stderr)
        fs_new, audio = convert_samplerate(speech_audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1 / fs_orig)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()

    audio_transcription = metadata_json_output(ds.sttWithMetadata(audio, 3))

    inference_end = timer() - inference_start

    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length),
          file=sys.stderr)
    print('Candidate Transcripts:', 3)

    return audio_transcription
Example #9
def stt(model_path,
        audio,
        beam_width=None,
        scorer_path=None,
        lm_alpha=None,
        lm_beta=None,
        hot_words=None):
    ds = Model(model_path)

    if beam_width:
        ds.setBeamWidth(beam_width)

    desired_sample_rate = ds.sampleRate()

    if scorer_path:
        ds.enableExternalScorer(scorer_path)

        if lm_alpha and lm_beta:
            ds.setScorerAlphaBeta(lm_alpha, lm_beta)

    # TODO
    # if hot_words:
    #     print('Adding hot-words', file=sys.stderr)
    #     for w in hot_words:
    #         ds.addHotWord(w, 6.2)

    fin = wave.open(audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        print(
            f'ERROR: original sample rate ({fs_orig}) is different than {desired_sample_rate}hz.',
            file=sys.stderr)
        exit(1)

    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    fin.close()

    print('Running inference.', file=sys.stderr)
    res = ds.sttWithMetadata(audio, 1)
    res = postprocess_metadata(res)
    return res
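postprocess_metadata is not shown above. One possible version, assuming the 0.9.x metadata layout (Metadata.transcripts, each CandidateTranscript holding tokens with .text and .start_time), flattens the best transcript into a dict:

def postprocess_metadata(metadata):
    best = metadata.transcripts[0]
    return {
        "text": "".join(token.text for token in best.tokens),
        "confidence": best.confidence,
        "tokens": [{"text": t.text, "start_time": t.start_time}
                   for t in best.tokens],
    }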
Example #10
def MozillaSTT(audio_path):

    # TODO: handle different rates (not implemented)
    fin = wave.open(audio_path, 'rb')
    output = ""
    # print("SS")
    ds = Model(model_file_path)
    # print("SS")
    ds.enableExternalScorer(scorer_file_path)
    # print("SS")

    lm_alpha = 0.75  # language model weight (DeepSpeech default)
    lm_beta = 1.85   # word insertion bonus (DeepSpeech default)
    desired_sample_rate = ds.sampleRate()
    ds.setScorerAlphaBeta(lm_alpha, lm_beta)
    fs_orig = fin.getframerate()
    # print("Desired Sampling Rate: %d", desired_sample_rate)
    if fs_orig != desired_sample_rate:
        print('Warning: original sample rate ({}) is different than {}hz. '
              'Resampling might produce erratic speech recognition.'.format(
                  fs_orig, desired_sample_rate),
              file=sys.stderr)
        fs_new, audio = convert_samplerate(audio_path, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    # audio_length = fin.getnframes() * (1/fs_orig)
    fin.close()
    print('Running inference.', file=sys.stderr)
    # print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
    # print(metadata_json_output(ds.sttWithMetadata(audio, 3)))
    # print(ds.stt(audio))
    output += ds.stt(audio)
    output += '\n'
    output += metadata_json_output(ds.sttWithMetadata(audio, 3))
    return output
Example #11
# -*- coding: utf-8 -*-
"""
Created on Sat Mar  6 15:41:29 2021

@author: Marshall.McDougall
"""
#import argparse
import numpy as np
#import shlex
#import subprocess
#import sys
import wave
#import json

from deepspeech import Model  #, version
#from timeit import default_timer as timer

#try:
#    from shhlex import quote
#except ImportError:
#    from pipes import quote
# deepspeech --model deepspeech-0.9.3-models.pbmm --scorer deepspeech-0.9.3-models.scorer --audio audio/SimpleTest3.wav --json

ds = Model("deepspeech-0.9.3-models.pbmm")
desired_sample_rate = ds.sampleRate()
ds.enableExternalScorer("deepspeech-0.9.3-models.scorer")
ds.sttWithMetadata("audio/SimpleTest3.wav", "3")
Example #12
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer', required=False,
                        help='Path to the external scorer file')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--beam_width', type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha', type=float,
                        help='Language model weight (lm_alpha). If not specified, use default from the scorer package.')
    parser.add_argument('--lm_beta', type=float,
                        help='Word insertion bonus (lm_beta). If not specified, use default from the scorer package.')
    # parser.add_argument('--version', action=VersionAction,
    #                     help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    parser.add_argument('--json', required=False, action='store_true',
                        help='Output json from metadata with timestamp of each word')
    parser.add_argument('--candidate_transcripts', type=int, default=3,
                        help='Number of candidate transcripts to include in JSON output')
    parser.add_argument('--hot_words', type=str,
                        help='Hot-words and their boosts.')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(args.model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.beam_width:
        ds.setBeamWidth(args.beam_width)

    desired_sample_rate = ds.sampleRate()

    if args.scorer:
        print('Loading scorer from files {}'.format(args.scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(args.scorer)
        scorer_load_end = timer() - scorer_load_start
        print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    if args.hot_words:
        print('Adding hot-words', file=sys.stderr)
        for word_boost in args.hot_words.split(','):
            word,boost = word_boost.split(':')
            ds.addHotWord(word,float(boost))

    fin = wave.open(args.audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs_orig, desired_sample_rate), file=sys.stderr)
        fs_new, audio = convert_samplerate(args.audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1/fs_orig)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    # sphinx-doc: python_ref_inference_start
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
    elif args.json:
        print(metadata_json_output(ds.sttWithMetadata(audio, args.candidate_transcripts)))
    else:
        print(ds.stt(audio))
        # Note: sentencefit is not part of the stock DeepSpeech Python API;
        # it appears to come from a fork that aligns audio against a fixed sentence.
        test = ds.createStream().sentencefit(audio, "ka arohia katoatia te hāhi me ōna whakapono e te hapū o ōtākou")
        for t in test.tokens:
            print(f"letter: {t.letter}, confidence: {t.confidence}, timestep: {t.timestep}")

    # sphinx-doc: python_ref_inference_stop
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
Example #13
class DeepSpeech():
    def __init__(self, model_path, scorer_path, result_json_path,
                 result_txt_path, candidate_transcripts=3, beam_width=None):

        # Path to the Speech-To-Text model
        self.MODEL_PATH = model_path
        # Path to the scorer language model
        self.SCORER_PATH = scorer_path
        # The number of candidate transcripts to return
        self.CANDIDATE_TRANSCRIPTS = candidate_transcripts

        self.result_json_path = result_json_path
        self.result_txt_path = result_txt_path

        self.beam_width = beam_width

        self._setup()

    def _setup(self):
        self.ds = Model(self.MODEL_PATH)  # Declare the model obj
        # Set desired sample rate for STT model.
        self.sample_rate = '16000'

        if self.beam_width:
            self.ds.setBeamWidth(self.beam_width)

        if self.SCORER_PATH:
            self.ds.enableExternalScorer(self.SCORER_PATH)

    def convert_samplerate(self, audio_path, desired_sample_rate):
        sox_cmd = ('sox {} --type raw --bits 16 --channels 1 --rate {} '
                   '--encoding signed-integer --endian little '
                   '--compression 0.0 --no-dither - ').format(
                       quote(audio_path), desired_sample_rate)
        try:
            output = subprocess.check_output(
                shlex.split(sox_cmd), stderr=subprocess.PIPE)
        except subprocess.CalledProcessError as e:
            raise RuntimeError(
                'SoX returned non-zero status: {}'.format(e.stderr))
        except OSError as e:
            raise OSError(e.errno,
                          'SoX not found, use {}hz files or install it: {}'
                          .format(desired_sample_rate, e.strerror))

        return desired_sample_rate, np.frombuffer(output, np.int16)

    def words_from_candidate_transcript(self, metadata):
        word = ""
        word_list = []
        word_start_time = 0
        # Loop through each character
        for i, token in enumerate(metadata.tokens):
            # Append character to word if it's not a space
            if token.text != " ":
                if len(word) == 0:
                    # Log the start time of the new word
                    word_start_time = token.start_time

                word = word + token.text
            # Word boundary is either a space or the last character in the arr
            if token.text == " " or i == len(metadata.tokens) - 1:
                word_duration = token.start_time - word_start_time

                if word_duration < 0:
                    word_duration = 0

                each_word = dict()
                each_word["word"] = word
                each_word["start_time "] = round(word_start_time, 4)
                each_word["duration"] = round(word_duration, 4)

                word_list.append(each_word)
                # Reset
                word = ""
                word_start_time = 0

        return word_list

    def metadata_json_output(self, metadata):
        json_result = dict()
        json_result["transcripts"] = [{
            "confidence": transcript.confidence,
            "words": self.words_from_candidate_transcript(transcript),
        } for transcript in metadata.transcripts]
        return json.dumps(json_result, indent=4)

    def take_audio_info(self):
        probe = ffmpeg.probe(self.FILE_PATH)
        self.audio_info = next(
            (stream for stream in probe['streams']
             if stream['codec_type'] == 'audio'), None)
        print(self.audio_info)
        return self.audio_info

    def take_audio(self):
        out, err = (
            ffmpeg
            .input(self.FILE_PATH)
            .output('-', format='s16le',
                    acodec='pcm_s16le', ac=1, ar=self.sample_rate)
            .run(capture_stdout=True, capture_stderr=True)
        )
        self.audio = np.frombuffer(out, np.int16)
        return self.audio

    def speech2text(self):
        metadata = self.ds.sttWithMetadata(
            self.audio, self.CANDIDATE_TRANSCRIPTS)
        json_result = self.metadata_json_output(metadata)

        with open(self.result_json_path, 'w') as outfile:
            outfile.write(json_result)

        dict_result = json.loads(json_result)
        word_list = [item["word"]
                     for item in dict_result["transcripts"][0]["words"]]

        sentence = " ".join(word_list)
        self.export2textfile(sentence)
        return sentence

    def export2textfile(self, sentence):
        txt_file = open(self.result_txt_path, "w")
        txt_file.writelines(sentence)
        txt_file.close()

    def set_file(self, filepath):
        self.FILE_PATH = filepath
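Hypothetical usage of the wrapper class above; all file names are placeholders:

engine = DeepSpeech(model_path="deepspeech-0.9.3-models.pbmm",
                    scorer_path="deepspeech-0.9.3-models.scorer",
                    result_json_path="result.json",
                    result_txt_path="result.txt")
engine.set_file("interview.wav")
engine.take_audio()  # decode to 16 kHz mono int16 PCM via ffmpeg
print(engine.speech2text())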
Example #14
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer', required=False,
                        help='Path to the external scorer file')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--beam_width', type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha', type=float,
                        help='Language model weight (lm_alpha). If not specified, use default from the scorer package.')
    parser.add_argument('--lm_beta', type=float,
                        help='Word insertion bonus (lm_beta). If not specified, use default from the scorer package.')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    parser.add_argument('--json', required=False, action='store_true',
                        help='Output json from metadata with timestamp of each word')
    parser.add_argument('--candidate_transcripts', type=int, default=3,
                        help='Number of candidate transcripts to include in JSON output')
    args = parser.parse_args()

#     print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(args.model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
#     print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.beam_width:
        ds.setBeamWidth(args.beam_width)

    desired_sample_rate = ds.sampleRate()

    if args.scorer:
#         print('Loading scorer from files {}'.format(args.scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(args.scorer)
        scorer_load_end = timer() - scorer_load_start
#         print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    fin = wave.open(args.audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
#         print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs_orig, desired_sample_rate), file=sys.stderr)
        fs_new, audio = convert_samplerate(args.audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1/fs_orig)
    fin.close()

#     print('Running inference.', file=sys.stderr)
    inference_start = timer()
    # sphinx-doc: python_ref_inference_start
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
    elif args.json:
        print(metadata_json_output(ds.sttWithMetadata(audio, args.candidate_transcripts)))
    else:
        print("Translation: "+ds.stt(audio))
    # sphinx-doc: python_ref_inference_stop
    inference_end = timer() - inference_start
Example #15
def main():
    # parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    # parser.add_argument('--model', required=True,
    #                     help='Path to the model (protocol buffer binary file)')
    # parser.add_argument('--alphabet', required=True,
    #                     help='Path to the configuration file specifying the alphabet used by the network')
    # parser.add_argument('--lm', nargs='?',
    #                     help='Path to the language model binary file')
    # parser.add_argument('--trie', nargs='?',
    #                     help='Path to the language model trie file created with native_client/generate_trie')
    # parser.add_argument('--audio', required=True,
    #                     help='Path to the audio file to run (WAV format)')
    # parser.add_argument('--version', action=VersionAction,
    #                     help='Print version and exits')
    # parser.add_argument('--extended', required=False, action='store_true',
    #                     help='Output string from extended metadata')
    # args = parser.parse_args()
    args = {
        'alphabet': 'models/alphabet.txt',
        'audio': 'input/test.wav',
        'extended': False,
        'lm': 'models/lm.binary',
        'model': 'models/output_graph.pbmm',
        'trie': 'models/trie',
        'version': None
    }
    # print("-----------------------------",args['model'])

    # for key, value in args.items():
    #     print (key, value)
    print('Loading model from file {}'.format(args['model']), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args['model'], N_FEATURES, N_CONTEXT, args['alphabet'],
               BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args['lm'] and args['trie']:
        print('Loading language model from files {} {}'.format(
            args['lm'], args['trie']),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args['alphabet'], args['lm'], args['trie'],
                               LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end),
              file=sys.stderr)

    fin = wave.open(args['audio'], 'rb')
    fs = fin.getframerate()
    if fs != 16000:
        print(
            'Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'
            .format(fs),
            file=sys.stderr)
        fs, audio = convert_samplerate(args['audio'])
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1 / 16000)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    if args['extended']:
        print(metadata_to_string(ds.sttWithMetadata(audio, fs)))
    else:
        print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length),
          file=sys.stderr)
Example #16
def speakSpell(audioFile):
    TEXT = 'something went wrong'

    def convert_samplerate(audio_path):
        sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate 16000 --encoding signed-integer --endian little --compression 0.0 --no-dither - '.format(quote(audio_path))
        try:
            output = subprocess.check_output(shlex.split(sox_cmd), stderr=subprocess.PIPE)
        except subprocess.CalledProcessError as e:
            raise RuntimeError('SoX returned non-zero status: {}'.format(e.stderr))
        except OSError as e:
            raise OSError(e.errno, 'SoX not found, use 16kHz files or install it: {}'.format(e.strerror))

        return 16000, np.frombuffer(output, np.int16)


    def metadata_to_string(metadata):
        return ''.join(item.character for item in metadata.items)

    # Load DeepSpeech model
    #if __name__ == '__main__':
    BEAM_WIDTH = 500 #Beam width used in the CTC decoder when building candidate transcriptions. Default: 500
    LM_ALPHA = 0.75 #The alpha hyperparameter of the CTC decoder. Language Model weight. Default: 0.75
    LM_BETA = 1.85 #The beta hyperparameter of the CTC decoder. Word insertion bonus. Default: 1.85
    N_FEATURES = 26 #Number of MFCC features to use. Default: 26
    N_CONTEXT = 9 #Size of the context window used for producing timesteps in the input vector. Default: 9
    #MOD = str(getFile('http://map-courses.usc.edu/codecollective/CCC/IVO/models/output_graph.pbmm', 'output_graph.pbmm')) #WILL NEED TO HOST ELSEWHERE WITH HIGHER SPEED/SIZE...CORS ISSUE??
    MOD = 'models/output_graph.pbmm'
    #ALPHABET = str(getFile('http://map-courses.usc.edu/codecollective/CCC/IVO/models/alphabet.txt', 'alphabet.txt'))
    ALPHABET = 'models/alphabet.txt'
    LM = ''#'lm.binary'
    TRIE = ''#'trie'#'models/trie'
    EXTENDED = ''
    VAD = 3 #int 0-3 higher is more aggressive filters out more non-speech
    SAVEWAV = 'STTaudio' #folder name for files
    if SAVEWAV: os.makedirs(SAVEWAV, exist_ok=True)
    #main()

    '''
    if os.path.isdir(MOD):
        model_dir = MOD
        MOD = os.path.join(model_dir, 'output_graph.pb')
        ALPHABET = os.path.join(model_dir, ALPHABET if ALPHABET else 'alphabet.txt')
        LM = os.path.join(model_dir, LM)
        TRIE = os.path.join(model_dir, TRIE)
    '''
    print('Initializing model...')
    #self.wfile.write(str('initializing model'))
    #global model
    model_load_start = timer()
    model = Model(MOD, N_FEATURES, N_CONTEXT, ALPHABET, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)
    #self.wfile.write(str('loaded model'))
    if LM and TRIE:
        lm_load_start = timer()
        print('Loading language model from files {} {}'.format(LM, TRIE), file=sys.stderr)
        model.enableDecoderWithLM(ALPHABET, LM, TRIE, LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    #then do stuff here

    #GET URL HERE WILL BE A CALL FROM CLIENT, A SOCKET MSG W URL
    #FILE_URL = 'http://map-courses.usc.edu/codecollective/CCC/IVO/models/speak_2019-08-09_15-53-06_972543.wav'
    #FILE_NAME = str(os.path.join(SAVEWAV, datetime.now().strftime("speak_%Y-%m-%d_%H-%M-%S_%f.wav")))
    FILE = audioFile #'speak_2019-08-09_15-53-06_972543.wav' #change to get file:  #str(getFile(FILE_URL, FILE_NAME))

    fin = wave.open(FILE, 'rb')#wave.open(FILE_NAME, 'rb')
    fs = fin.getframerate()
    '''
    if fs != 16000:
        print('Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'.format(fs), file=sys.stderr)
        fs, audio = convert_samplerate(audio) #convert_samplerate(args.audio)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    '''
    #UnboundLocalError: local variable 'audio' referenced before assignment
    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    audio_length = fin.getnframes() * (1/16000)
    fin.close()

    print('Running inference.', file=sys.stderr)
    #self.wfile.write(str('running inference'))
    inference_start = timer()
    if EXTENDED: #if args.extended:
        print(metadata_to_string(model.sttWithMetadata(audio, fs)))
        TEXT = metadata_to_string(model.sttWithMetadata(audio, fs))
    else:
        print(model.stt(audio, fs))
        TEXT = str(model.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
    #self.wfile.write(str('finished inference'))

    '''
    global FILE_ID
    FILE_ID = uploadFile(FILE_NAME) #added this
    gqlMutateText(FILE_ID, TEXT)
    lyreBird(FILE_ID, TEXT)
    '''
    #THEN ADD SOCKET EMIT TO TELL CLIENT TO PULL THE NEW FILE AND POPULATE THE PAGE

    #return Response("<h1>Flask on Now Zero Config</h1><p>You visited: /%s</p>" % (path), mimetype="text/html")
    #return Response("<h1>Flask on Now Zero Config</h1><p>DeepSpeech heard: %s </p>" % TEXT, mimetype="text/html")
    print(TEXT)
    return TEXT
Example #17
class SpeechToTextEngine:
    """ Class to perform speech-to-text transcription and related functionality """

    FORMAT = pyaudio.paInt16
    SAMPLE_RATE = 16000
    CHANNELS = 1
    BLOCKS_PER_SECOND = 50

    def __init__(self, scorer='deepspeech_model.scorer') -> None:
        """ Initializing the DeepSpeech model """

        self.model = Model(model_path=Path(__file__).parents[2].joinpath(
            'deepspeech_model.pbmm').absolute().as_posix())
        self.model.enableExternalScorer(scorer_path=Path(
            __file__).parents[2].joinpath(scorer).absolute().as_posix())
        self.vad = webrtcvad.Vad(mode=3)
        self.sample_rate = self.SAMPLE_RATE
        self.buffer_queue = queue.Queue()

    def run(self, audio) -> str:
        """ Receives the audio,  normalizes it and is sent to the model to be transcribed. Returns the result of the
        transcribe audio in string format."""

        normalized_audio = normalize_audio_input(audio)
        audio_streams = BytesIO(normalized_audio)
        with wave.Wave_read(audio_streams) as wav:
            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()),
                                          np.int16)
        results = self.model.stt(audio_buffer=audio_streams)
        return results

    def run_with_metadata(self, audio) -> Metadata:
        normalized_audio = normalize_audio_input(audio)
        audio_streams = BytesIO(normalized_audio)
        with wave.Wave_read(audio_streams) as wav:
            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()),
                                          np.int16)
        results = self.model.sttWithMetadata(audio_buffer=audio_streams)
        return results

    def add_hot_words(self, data) -> list:
        """ Receives data in form of hot-words and boosts, adds them to the language model and return the list of the
        added hot-words """

        all_hot_words = []
        try:
            logger.info('----------------------------------------------------')
            for hot_word in data:
                # Change all the characters of the hot-word to lower case
                word = hot_word.lower()

                # Get numeric value of the boost
                boost = float(data.get(hot_word))

                # Adding the hot-word and its boost to the language model
                self.model.addHotWord(hot_word, boost)

                # Printing on the prompt the activity
                logger.info(
                    f"`{word}` hot-word with boost `{boost}` was added.")
                all_hot_words.append(word)
            return all_hot_words
        except RuntimeError:
            return []

    def erase_hot_word(self, hot_words) -> None:
        try:
            for hot_word in hot_words:
                self.model.eraseHotWord(hot_word)
                logger.info(f"`{hot_word}` hot-word is erased.")
            logger.info('----------------------------------------------------')
        except RuntimeError:
            return

    def clear_hot_words(self) -> str:
        try:
            self.model.clearHotWords()
            return f"All hot-words were erased."
        except RuntimeError:
            return f"No more hot-words are left."

    def deep_stream(self):
        return self.model.createStream()

    def frame_generator(self, audio, sample_rate=16000, frame_duration_ms=30):
        """
        Takes the desired frame duration in milliseconds, the PCM data, and
        the sample rate. Yields Frames of the requested duration.
        """

        # audio = np.frombuffer(audio, np.int16)
        n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
        offset = 0
        timestamp = 0.0
        duration = (float(n) / sample_rate) / 2.0
        while offset + n < len(audio):
            yield Frame(audio[offset:offset + n], timestamp, duration)
            timestamp += duration
            offset += n
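A sketch of how frame_generator pairs with the webrtcvad instance created in __init__. Here engine is a SpeechToTextEngine, audio is raw 16 kHz mono 16-bit PCM bytes, and the Frame objects are assumed to expose the raw bytes (e.g. a namedtuple with bytes, timestamp, duration fields):

voiced_frames = []
for frame in engine.frame_generator(audio, sample_rate=16000,
                                    frame_duration_ms=30):
    # Keep only the 30 ms frames that the VAD classifies as speech.
    if engine.vad.is_speech(frame.bytes, 16000):
        voiced_frames.append(frame)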
Example #18
class SpeechToTextEngine:
    """ Class to perform speech-to-text transcription and related functionality """
    def __init__(self, scorer='deepspeech_model.scorer') -> None:
        """ Initializing the DeepSpeech model """
        self.model = Model(model_path=Path(__file__).parents[2].joinpath(
            'deepspeech_model.pbmm').absolute().as_posix())
        self.model.enableExternalScorer(scorer_path=Path(
            __file__).parents[2].joinpath(scorer).absolute().as_posix())

    def run(self, audio) -> str:
        """ Receives the audio,  normalizes it and is sent to the model to be transcribed. Returns the result of the
        transcribe audio in string format."""

        normalized_audio = normalize_audio_input(audio)
        audio_streams = BytesIO(normalized_audio)
        with wave.Wave_read(audio_streams) as wav:
            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()),
                                          np.int16)
        results = self.model.stt(audio_buffer=audio_streams)
        return results

    def run_with_metadata(self, audio) -> Metadata:
        normalized_audio = normalize_audio_input(audio)
        audio_streams = BytesIO(normalized_audio)
        with wave.Wave_read(audio_streams) as wav:
            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()),
                                          np.int16)
        results = self.model.sttWithMetadata(audio_buffer=audio_streams)
        return results

    def add_hot_words(self, data) -> list:
        """ Receives data in form of hot-words and boosts, adds them to the language model and return the list of the
        added hot-words """

        all_hot_words = []
        try:
            logger.info('----------------------------------------------------')
            for hot_word in data:
                # Change all the characters of the hot-word to lower case
                word = hot_word.lower()

                # Get numeric value of the boost
                boost = float(data.get(hot_word))

                # Adding the hot-word and its boost to the language model
                self.model.addHotWord(hot_word, boost)

                # Printing on the prompt the activity
                logger.info(
                    f"`{word}` hot-word with boost `{boost}` was added.")
                all_hot_words.append(word)
            return all_hot_words
        except RuntimeError:
            return []

    def erase_hot_word(self, hot_words) -> None:
        try:
            for hot_word in hot_words:
                self.model.eraseHotWord(hot_word)
                logger.info(f"`{hot_word}` hot-word is erased.")
            logger.info('----------------------------------------------------')
        except RuntimeError:
            return

    def clear_hot_words(self) -> str:
        try:
            self.model.clearHotWords()
            return f"All hot-words were erased."
        except RuntimeError:
            return f"No more hot-words are left."

    def sample_rate(self):
        return self.model.sampleRate()
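Hypothetical hot-word usage with the engine above; wav_bytes stands for a complete WAV file read as bytes, and the boost values are arbitrary:

engine = SpeechToTextEngine()
engine.add_hot_words({"deepspeech": 7.5, "scorer": 5.0})
text = engine.run(wav_bytes)
engine.clear_hot_words()
print(text)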
Example #19
                fin = wave.open(file_path, 'rb')
                fs_orig = fin.getframerate()
                if fs_orig != desired_sample_rate:
                    print(
                        'Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'
                        .format(fs_orig, desired_sample_rate),
                        file=sys.stderr)
                    fs_new, audio = convert_samplerate(file_path,
                                                       desired_sample_rate)
                else:
                    audio = np.frombuffer(fin.readframes(fin.getnframes()),
                                          np.int16)
                # audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
                fin.close()

                inference = ds.sttWithMetadata(audio, 1).transcripts[0]
                transcript.append(''.join(token.text
                                          for token in inference.tokens))

            print("final transcript:\n\n{}\n\n".format(transcript))
            client.publish(TEST_OUTPUT_TOPIC, json.dumps(transcript))
            print("I got:\n{}".format(transcript))

# This is just a simplified mockup of the deepspeech demo code,
# I stole wavSplit file also from the demo code

if __name__ == "__main__" and False:
    # locate model files
    model_path = os.path.join(os.getcwd(), "models")
    print("model_path: {}".format(model_path))
    pb = glob.glob(model_path + "/*.pbmm")[0]
Example #20
def main_transcript(video_to_encode):
    msg = ""

    mp3file = video_to_encode.get_video_mp3(
    ).source_file if video_to_encode.get_video_mp3() else None

    lang = video_to_encode.main_lang

    # check if DS_PARAM [lang] exist
    if not DS_PARAM.get(lang):
        msg += "\nNo deepspeech model found for lang: %s." % lang
        msg += " Please add it in DS_PARAM."
        return msg

    ds_model = Model(DS_PARAM[lang]['model'], DS_PARAM[lang]['beam_width'])

    if all([
            cond in DS_PARAM[lang]
            for cond in ['alphabet', 'lm', 'trie', 'lm_alpha', 'lm_beta']
    ]):
        ds_model.enableDecoderWithLM(DS_PARAM[lang]['lm'],
                                     DS_PARAM[lang]['trie'],
                                     DS_PARAM[lang]['lm_alpha'],
                                     DS_PARAM[lang]['lm_beta'])

    desired_sample_rate = ds_model.sampleRate()

    webvtt = WebVTT()
    inference_start = timer()
    last_item = None
    sentences = []
    sentence = []
    metadata = None

    for start_trim in range(0, video_to_encode.duration, AUDIO_SPLIT_TIME):

        end_trim = video_to_encode.duration if start_trim + \
            AUDIO_SPLIT_TIME > video_to_encode.duration else (
                start_trim + AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH)

        duration = (AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH) if start_trim + \
            AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH < video_to_encode.duration \
            else (video_to_encode.duration - start_trim)

        msg += "\ntake audio from %s to %s - %s" % (start_trim, end_trim,
                                                    duration)

        audio = convert_samplerate(mp3file.path, desired_sample_rate,
                                   start_trim, duration)
        msg += '\nRunning inference.'

        metadata = ds_model.sttWithMetadata(audio)

        msg += '\nConfidence : %s' % metadata.confidence

        sentences[:] = []  # empty list
        sentence[:] = []  # empty list

        refItem = metadata.items[0]

        index = get_index(metadata, last_item, start_trim) if last_item else 0

        # nb of character in AUDIO_SPLIT_TIME
        msg += "METADATA ITEMS : %d " % len(metadata.items)

        sentences = get_sentences(metadata, refItem, index)

        last_item = (
            sentences[-1][-1].character,
            sentences[-1][-1].start_time) if len(sentences) > 0 else ()

        for sent in sentences:
            if len(sent) > 0:
                start_time = sent[0].start_time + start_trim
                end_time = sent[-1].start_time + start_trim
                str_sentence = ''.join(item.character for item in sent)
                # print(start_time, end_time, str_sentence)
                caption = Caption(
                    '%s.%s' %
                    (timedelta(seconds=int(str(start_time).split('.')[0])),
                     str('%.3f' % start_time).split('.')[1]), '%s.%s' %
                    (timedelta(seconds=int(str(end_time).split('.')[0])),
                     str('%.3f' % end_time).split('.')[1]),
                    ['%s' % str_sentence])

                webvtt.captions.append(caption)
    # print(webvtt)
    msg += saveVTT(video_to_encode, webvtt)
    inference_end = timer() - inference_start
    msg += '\nInference took %0.3fs.' % inference_end
    # print(msg)
    return msg
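main_transcript calls a convert_samplerate variant that also trims the audio. A possible sketch using SoX, where the argument names mirror the call site and the SoX options are an assumption:

import shlex
import subprocess
import numpy as np

def convert_samplerate(audio_path, desired_sample_rate, start_trim, duration):
    # Decode, downmix and resample with SoX, keep only the
    # [start_trim, start_trim + duration) window, and return int16 samples.
    sox_cmd = ('sox {} --type raw --bits 16 --channels 1 --rate {} '
               '--encoding signed-integer --endian little '
               '--compression 0.0 --no-dither - trim {} {}').format(
                   shlex.quote(audio_path), desired_sample_rate,
                   start_trim, duration)
    output = subprocess.check_output(shlex.split(sox_cmd),
                                     stderr=subprocess.PIPE)
    return np.frombuffer(output, np.int16)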
Example #21
class DeepSpeechWrapper:
    def __init__(self, dir):
        parser = argparse.ArgumentParser(
            description='Running DeepSpeech inference.')
        parser.add_argument(
            '--model',
            default=os.path.join(dir, 'output_graph.pbmm'),
            help='Path to the model (protocol buffer binary file)')
        parser.add_argument(
            '--alphabet',
            default=os.path.join(dir, 'alphabet.txt'),
            help=
            'Path to the configuration file specifying the alphabet used by the network'
        )
        parser.add_argument('--lm',
                            nargs='?',
                            default=os.path.join(dir, 'lm.binary'),
                            help='Path to the language model binary file')
        parser.add_argument(
            '--trie',
            nargs='?',
            default=os.path.join(dir, 'trie'),
            help=
            'Path to the language model trie file created with native_client/generate_trie'
        )
        parser.add_argument('--version',
                            action=VersionAction,
                            help='Print version and exits')
        parser.add_argument('--extended',
                            required=False,
                            action='store_true',
                            help='Output string from extended metadata')
        self.args = parser.parse_args('')  # shadow the system args

        self.ds = Model(self.args.model, N_FEATURES, N_CONTEXT,
                        self.args.alphabet, BEAM_WIDTH)

        self.audio = None
        self.audio_length = 0
        self.fs = 16000

        if self.args.lm and self.args.trie:
            # print('Loading language model from files {} {}'.format(self.args.lm, self.args.trie), file=sys.stderr)
            # lm_load_start = timer()
            self.ds.enableDecoderWithLM(self.args.alphabet, self.args.lm,
                                        self.args.trie, LM_ALPHA, LM_BETA)
            # lm_load_end = timer() - lm_load_start
            # print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    def set_input(self, filename):
        fin = wave.open(filename, 'rb')
        self.fs = fin.getframerate()
        if self.fs != 16000:
            print(
                'Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'
                .format(self.fs),
                file=sys.stderr)
            self.fs, audio = convert_samplerate(filename)
        else:
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

        self.audio = audio
        self.audio_length = fin.getnframes() / self.fs
        fin.close()

    def recognize_audio(self, start_time, end_time):
        start_frame = int(start_time * self.fs)
        end_frame = int(end_time * self.fs)
        seq = self.audio[start_frame:end_frame]
        if self.args.extended:
            return metadata_to_string(self.ds.sttWithMetadata(seq, self.fs))
        else:
            return self.ds.stt(seq, self.fs)
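Hypothetical usage of DeepSpeechWrapper; dir is assumed to hold the 0.5.x-era model files (output_graph.pbmm, alphabet.txt, lm.binary, trie):

wrapper = DeepSpeechWrapper("models")
wrapper.set_input("audio/sample.wav")
# Transcribe the whole file by passing its full time range.
print(wrapper.recognize_audio(0.0, wrapper.audio_length))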