Example 1
def speech_to_text(input_file,
                   file_length,
                   return_speed_per_chunk=False,
                   chunk_size=10):
    """
    Compute the words pronounced in the input_file
    :param input_file: sound file path
    :param file_length: time length of the input file (in seconds)
    :param return_speed_per_chunk: if True, the function return a list of words per chunk, if false it returns all the words in the extract
    :return: words as string
    """
    # setup the model
    if return_speed_per_chunk:
        result = []
    else:
        result = ""
    recognizer = Model("models/deepspeech-0.8.2-models.pbmm")
    recognizer.setBeamWidth(2000)
    recognizer.enableExternalScorer("models/deepspeech-0.8.2-models.scorer")
    desired_sample_rate = recognizer.sampleRate()
    # convert input file into smaller audio chunks (apparently works better)
    CHUNK_SIZE = chunk_size
    n_chunks = int(file_length // CHUNK_SIZE)
    for i in range(n_chunks):
        tfm = sox.Transformer()
        tfm.trim(i * CHUNK_SIZE, (i + 1) * CHUNK_SIZE)
        tfm.set_output_format(channels=1)
        tfm.build(input_file, "temp_folder/chunked_file{}.wav".format(i))
        #cmb = sox.Combiner()
        input_list = [
            "audio-files/silence.wav",
            "temp_folder/chunked_file{}.wav".format(i),
            "audio-files/silence.wav"
        ]
        input_list_correct_sample_rate = list(
            map(lambda file: convert_samplerate(file, desired_sample_rate)[1],
                input_list))
        audio = np.concatenate(input_list_correct_sample_rate)
        #cmb.build(input_list, "temp_folder/chunked_file_with_silence{}.wav".format(i), combine_type="concatenate")
        #fs, audio = convert_samplerate("temp_folder/chunked_file_with_silence{}.wav".format(i), desired_sample_rate)
        if return_speed_per_chunk:
            result.append(recognizer.stt(audio))
        else:
            # add a space so words from adjacent chunks are not glued together
            result += recognizer.stt(audio) + " "
        os.remove("temp_folder/chunked_file{}.wav".format(i))
        #os.remove("temp_folder/chunked_file_with_silence{}.wav".format(i))
    print(result)
    return result
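Note: convert_samplerate is called here (and in several later examples) but never shown. A minimal sketch of what it typically does, modeled on the inline __convert_samplerate in Example 22; the exact SoX flags and the (rate, buffer) return shape are assumptions taken from that example:

import shlex
import subprocess
from shlex import quote

import numpy as np


def convert_samplerate(audio_path, desired_sample_rate=16000):
    # Ask SoX to emit raw 16-bit mono little-endian PCM at the model's rate.
    sox_cmd = ('sox {} --type raw --bits 16 --channels 1 --rate {} '
               '--encoding signed-integer --endian little '
               '--compression 0.0 --no-dither - ').format(
                   quote(audio_path), desired_sample_rate)
    try:
        output = subprocess.check_output(shlex.split(sox_cmd),
                                         stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        raise RuntimeError('SoX returned non-zero status: {}'.format(e.stderr))
    except OSError as e:
        raise OSError(e.errno, 'SoX not found, use {}hz files or install it: '
                               '{}'.format(desired_sample_rate, e.strerror))
    return desired_sample_rate, np.frombuffer(output, np.int16)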
Example 2
def tflite_worker(model, lm, trie, queue_in, queue_out, gpu_mask):
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_mask)
    ds = Model(model, BEAM_WIDTH)
    ds.enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA)

    while True:
        try:
            msg = queue_in.get()

            filename = msg['filename']
            wavname = os.path.splitext(os.path.basename(filename))[0]
            fin = wave.open(filename, 'rb')
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
            fin.close()

            decoded = ds.stt(audio)

            queue_out.put({
                'wav': wavname,
                'prediction': decoded,
                'ground_truth': msg['transcript']
            })
        except FileNotFoundError as ex:
            print('FileNotFoundError: ', ex)

        print(queue_out.qsize(), end='\r')  # Update the current progress
        queue_in.task_done()
Example 3
def tflite_worker(model, scorer, queue_in, queue_out, gpu_mask):
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_mask)
    ds = Model(model)

    while True:
        try:
            msg = queue_in.get()

            filename = msg['filename']
            fin = wave.open(filename, 'rb')
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
            fin.close()

            decoded = ds.stt(audio)

            queue_out.put({
                'wav': filename,
                'prediction': decoded,
                'ground_truth': msg['transcript']
            })
        except FileNotFoundError as ex:
            print('FileNotFoundError: ', ex)

        print(queue_out.qsize(), end='\r')  # Update the current progress
        queue_in.task_done()
Example 4
def mainCall(model=ROOT_DIR + "models/output_graph.pbmm",
             alphabet=ROOT_DIR + "models/alphabet.txt",
             lm=ROOT_DIR + "models/lm.binary",
             trie=ROOT_DIR + "models/trie",
             audio=ROOT_DIR + "test.wav"):

    print('Loading model from file {}'.format(model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if lm and trie:
        print('Loading language model from files {} {}'.format(lm, trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    fin = wave.open(audio, 'rb')
    fs = fin.getframerate()
    if fs != 16000:
        print('Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'.format(fs), file=sys.stderr)
        fs, audio = convert_samplerate(audio)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1/16000)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    res = ds.stt(audio, fs)
    print(res)
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
    return res
Example 5
def extract_text(AUDIO):
    ds = Model(MODEL, N_FEATURES, N_CONTEXT, ALPHABET, BEAM_WIDTH)
    ds.enableDecoderWithLM(ALPHABET, LM, TRIE, LM_ALPHA, LM_BETA)

    fin = wave.open(AUDIO, 'rb')
    fs = fin.getframerate()
    if fs != 16000:
        print(
            'Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'
            .format(fs),
            file=sys.stderr)
        fs, audio = convert_samplerate(AUDIO)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1 / 16000)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    INFERENCE_RESULT = ds.stt(audio, fs)
    print(INFERENCE_RESULT)
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length),
          file=sys.stderr)
    # write_text_to_file(INFERENCE_RESULT)
    return INFERENCE_RESULT
Example 6
def main():
    # import soundfile as sf
    # for format, format_desc in sf.available_formats().items():
    #     print(f'Format: {format} {format_desc} ')
    #     for subtype, st_desc in sf.available_subtypes().items():
    #         print(f'{subtype} {st_desc}')
    #     print()

    print(create_args_str(args))
    print(f'Loading model from file {args.model}', file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print(f'Loaded model in {model_load_end:.3}s.', file=sys.stderr)

    # if args.lm and args.trie:
    #     print(f'Loading language model from files {args.lm} {args.trie}', file=sys.stderr)
    #     lm_load_start = timer()
    #     ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    #     lm_load_end = timer() - lm_load_start
    #     print(f'Loaded language model in {lm_load_end:.3}s.', file=sys.stderr)

    corpus = get_corpus('ls')
    corpus_entry = corpus[0]
    for i, segment in enumerate(corpus_entry[:5]):
        audio, rate = segment.audio, segment.rate
        transcription = ds.stt(audio, rate)
        print(f'transcription: \t{transcription}')
        print(f'actual: \t\t{segment.text}')
Example 7
class MozillaDeepSpeechEngine(Engine):
    def __init__(self, pbmm_path: str, scorer_path: str):
        self._model = Model(pbmm_path)
        self._model.enableExternalScorer(scorer_path)
        self._audio_sec = 0.
        self._proc_sec = 0.

    def transcribe(self, path: str) -> str:
        audio, sample_rate = soundfile.read(path, dtype='int16')
        assert sample_rate == self._model.sampleRate()
        self._audio_sec += audio.size / sample_rate

        start_sec = time.time()
        res = self._model.stt(audio)
        self._proc_sec += time.time() - start_sec

        return res

    def rtf(self) -> float:
        return self._proc_sec / self._audio_sec

    def delete(self) -> None:
        pass

    def __str__(self) -> str:
        return 'Mozilla DeepSpeech'
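Note: a hypothetical usage of this engine; the 0.9.x model/scorer filenames and the WAV path are placeholders, not part of the original snippet:

engine = MozillaDeepSpeechEngine('deepspeech-0.9.3-models.pbmm',
                                 'deepspeech-0.9.3-models.scorer')
print(engine.transcribe('sample.wav'))     # transcript of the file
print('RTF: {:.2f}'.format(engine.rtf()))  # processing time / audio duration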
Example 8
def s2t(file):
    ds = Model(MODEL_FILE, 500)
    ds.enableDecoderWithLM(LANG_MODEL, TRIE_FILE, 1.50, 2.25)

    fs, audio = wav.read(file)
    data = ds.stt(audio)
    return data
Example 9
def recognize_DS(audio1, data):
    beam_width = 500  # how many candidate word sequences the decoder considers
    model_name = data['wake']['model name']
    ds = Model(model_name)
    ds.setBeamWidth(beam_width)
    audio1 = np.frombuffer(audio1.frame_data, np.int16)  # convert the raw frame bytes into a numpy int16 array
    return ds.stt(audio1)  # return the transcribed text
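Note: audio1 appears to be a speech_recognition.AudioData object (it exposes frame_data). A hypothetical call under that assumption; the model filename and WAV path are placeholders:

import speech_recognition as sr

config = {'wake': {'model name': 'deepspeech-0.9.3-models.pbmm'}}
r = sr.Recognizer()
with sr.AudioFile('sample.wav') as source:   # 16 kHz mono 16-bit WAV
    captured = r.record(source)              # AudioData with raw .frame_data
print(recognize_DS(captured, config))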
Example 10
def client(audio_file, lang="uk"):
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    model = "./uk.tflite"

    ds = Model(model)
    # ds.enableExternalScorer("kenlm.scorer")
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    desired_sample_rate = ds.sampleRate()

    fin = wave.open(audio_file, 'rb')
    fs_orig = fin.getframerate()
    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1 / fs_orig)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    # sphinx-doc: python_ref_inference_start

    result = ds.stt(audio)
    print(result)
    # sphinx-doc: python_ref_inference_stop
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length),
          file=sys.stderr)
    return result
Example 11
    def generate_text(self):
        print(self.t1.is_alive(), "check whether thread t1 is alive")
        print(self.t2.is_alive(), "check whether thread t2 is alive")

        self.change_status = False

        self.t1 = threading.Thread(target=self.start_recording)
        self.t2 = threading.Thread(target=self.stop_recording)

        model_path = '/home/batman/python_projects/flask_blog_version1/myblog/models/deepspeech/deepspeech-0.5.1-models/'
        # Numeric values are configurable
        ds = Model(model_path + 'output_graph.pbmm', 26, 9,
                   model_path + 'alphabet.txt', 500)
        ds.enableDecoderWithLM(model_path + 'alphabet.txt',
                               model_path + 'lm.binary', model_path + 'trie',
                               0.75, 1.85)

        def load_audio(audio_path):
            fin = wave.open(audio_path, 'rb')
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
            fin.close()
            return audio

        def frame_rate(audio_path):
            fin = wave.open(audio_path, 'rb')
            sample_rate = fin.getframerate()
            fin.close()
            return sample_rate

        audio_file = self.filename
        field_value = ds.stt(load_audio(audio_file), frame_rate(audio_file))
        self.welcome_text.delete('1.0', END)
        self.welcome_text.insert(END, field_value)
Example 12
class MozillaDeepSpeechASREngine(ASREngine):
    """https://github.com/mozilla/DeepSpeech"""
    def __init__(self,
                 model_path,
                 alphabet_path,
                 language_model_path=None,
                 trie_path=None):
        """
        Constructor.

        :param model_path: Absolute path to (acoustic) model file.
        :param alphabet_path: Absolute path to file containing alphabet.
        :param language_model_path: Absolute path to language model file. This parameter is optional. Set to
        enable decoding with language model.
        :param trie_path: Absolute path to trie. This parameter is optional. Set to enable decoding with language model.
        """

        # https://github.com/mozilla/DeepSpeech/blob/master/native_client/python/client.py
        self._model = Model(model_path, 26, 9, alphabet_path, 500)
        # only enable the LM decoder when both optional paths were provided
        if language_model_path is not None and trie_path is not None:
            self._model.enableDecoderWithLM(alphabet_path, language_model_path,
                                            trie_path, 1.5, 2.1)

    def transcribe(self, path):
        pcm, sample_rate = soundfile.read(path)
        pcm = (np.iinfo(np.int16).max * pcm).astype(np.int16)

        return self._model.stt(pcm, aSampleRate=sample_rate)

    def __str__(self):
        return 'Mozilla DeepSpeech'
Example 13
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--lm', nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('--trie', nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--beam_width', type=int, default=500,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha', type=float, default=0.75,
                        help='Language model weight (lm_alpha)')
    parser.add_argument('--lm_beta', type=float, default=1.85,
                        help='Word insertion bonus (lm_beta)')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    parser.add_argument('--json', required=False, action='store_true',
                        help='Output json from metadata with timestamp of each word')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, args.beam_width)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    desired_sample_rate = ds.sampleRate()

    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.lm, args.trie, args.lm_alpha, args.lm_beta)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    fin = wave.open(args.audio, 'rb')
    fs = fin.getframerate()
    if fs != desired_sample_rate:
        print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs, desired_sample_rate), file=sys.stderr)
        fs, audio = convert_samplerate(args.audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1/fs)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio)))
    elif args.json:
        print(metadata_json_output(ds.sttWithMetadata(audio)))
    else:
        print(ds.stt(audio))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
Example 14
        def handle_message(sid, data):
            output_channel = SocketIOOutput(sio, sid, self.bot_message_evt)

            ##convert audio message to text and pass it to the Rasa Core
            ds = Model('models_stt/output_graph.pbmm', 26, 9,
                       'models_stt/alphabet.txt', 500)
            fs, audio = wav.read('LDC93S1.wav')
            audio_length = len(audio) * (1 / 16000)
            message = ds.stt(audio, fs)

            if self.session_persistence:
                if not data.get("session_id"):
                    logger.warning("A message without a valid sender_id "
                                   "was received. This message will be "
                                   "ignored. Make sure to set a proper "
                                   "session id using the "
                                   "`session_request` socketIO event.")
                    return
                sender_id = data['session_id']
            else:
                sender_id = sid

            # pass the transcription computed above to Rasa Core,
            # matching the stated intent of this handler
            message = UserMessage(message,
                                  output_channel,
                                  sender_id,
                                  input_channel=self.name())
            on_new_message(message)
Example 15
class MozillaDeepSpeech(ASRSystem):
    """
    Implements a Mozilla DeepSpeech model based on the model file at model_path. 

    This code assumes the model follows Mozilla DeepSpeech version 0.6.1 and may not
    work for later models. See https://deepspeech.readthedocs.io/en/v0.6.1/USING.html
    for installation instructions.
    """
    def __init__(self, model_path, use_language_model=False, identifier=None):
        super(MozillaDeepSpeech, self).__init__(model_path, identifier)
        model_path = os.path.join(self.model_path, 'output_graph.pbmm')
        alphabet_path = os.path.join(self.model_path, 'alphabet.txt')
        language_model_path = os.path.join(self.model_path, 'lm.binary')
        trie_path = os.path.join(self.model_path, 'trie')
        self._model = DPModel(model_path, 500)
        self.samplerate_hz = 16000

        if use_language_model:
            self._model.enableDecoderWithLM(language_model_path, trie_path,
                                            0.75, 1.85)

    def transcribe(self, sound_or_path, fs=None):
        sound = self._load_sound(sound_or_path)
        sound = (np.iinfo(np.int16).max * sound).astype(np.int16)
        res = self._model.stt(sound)
        return res
Example 16
def upload(request):
    # retrieves filename from post request
    data = json.loads(request.body)
    filename = data.get('filename')

    #google upload to bucket parameters
    # storage_client = storage.Client()
    # bucket = storage_client.bucket('waev')
    # blob = bucket.blob(filename+'.flac')
    audio = f"{settings.MEDIA_ROOT}/{filename}"

    #convert audio file to mono FLAC, 16000 samplerate to optimize transcription
    tfm = sox.Transformer()
    tfm.convert(samplerate=16000, n_channels=1)
    new_audio = f"{settings.MEDIA_ROOT}/test.wav"
    tfm.build(audio, new_audio)  # build() returns a status flag, not a path
    fin = wave.open(new_audio, 'rb')
    audio_buffer = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    # parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    # args = parser.parse_args()
    print(new_audio)
    ds = Model("deepspeech-0.7.4-models.pbmm")
    print(ds.stt(audio_buffer))

    #upload file to bucket
    # blob.upload_from_filename(new_audio)
    # print("file uploaded!")

    return HttpResponse()
Example 17
class DeepSpeechInput(AudioInput):
    '''
    Input from DeepSpeech using the US English language model.
    '''
    def __init__(self, notifier, use_lm=False, wav_dir=None):
        '''
        @see AudioInput.__init__()

        @type  use_lm: bool
        @param use_lm:
            Whether to use the DeepSpeech language model for better predictions.
        '''
        super(DeepSpeechInput, self).__init__(notifier,
                                              format=pyaudio.paInt16,
                                              channels=1,
                                              rate=16000,
                                              wav_dir=wav_dir)

        # The files which we'll need from the model directory
        alphabet = os.path.join(_MODEL_DIR, 'alphabet.txt')
        model = os.path.join(_MODEL_DIR, 'output_graph.pb')
        lm = os.path.join(_MODEL_DIR, 'lm.binary')
        trie = os.path.join(_MODEL_DIR, 'trie')

        # If these don't exist then DeepSpeech will segfault when inferring!
        if not os.path.exists(alphabet):
            raise IOError("Not found: %s" % alphabet)
        if not os.path.exists(model):
            raise IOError("Not found: %s" % model)

        # Load in the model.
        LOG.info("Loading %s" % model)
        self._model = Model(model, _NUM_FEATURES, _NUM_CONTEXT, alphabet,
                            _BEAM_WIDTH)

        # If we're using a language model then pull that in too. This requires a
        # decent chunk of memory.
        if use_lm:
            if not os.path.exists(lm):
                raise IOError("Not found: %s" % lm)
            if not os.path.exists(trie):
                raise IOError("Not found: %s" % trie)

            LOG.info("Loading %s" % lm)
            self._model.enableDecoderWithLM(alphabet, lm, trie, _LM_WEIGHT,
                                            _VALID_WORD_COUNT_WEIGHT)

    def _decode_raw(self, data):
        '''
        @see AudioInput._decode_raw()
        '''
        audio = numpy.frombuffer(data, numpy.int16)
        words = self._model.stt(audio, self._rate)
        LOG.info("Got: %s" % (words, ))
        tokens = [
            Token(word.strip(), 1.0, True) for word in words.split(' ')
            if len(word.strip()) > 0
        ]
        return tokens
Example 18
def get_text(wav_file):

    ds = Model(MODEL_FILE, N_FEATURES, N_CONTEXT, ALPHABET_FILE, BEAM_WIDTH)
    ds.enableDecoderWithLM(ALPHABET_FILE, LANGUAGE_MODEL, TRIE_FILE, LM_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    fs, audio = wavfile.read(wav_file)
    processed_data = ds.stt(audio, fs)

    print(processed_data)
    return processed_data
Example 19
class SpeechRecognizer:
    def __init__(self):
        self._model = Model('DeepSpeech/deepspeech-0.7.1-models.pbmm')
        # self._model.setBeamWidth(1)
        # self._model.enableExternalScorer('DeepSpeech/deepspeech-0.7.1-models.scorer')

    def listen(self, audio):
        return self._model.stt(audio)
Example 20
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--alphabet', required=True,
                        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument('--lm', nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('--trie', nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    writeFile = open('speechtotext.csv', 'w')
    writer = csv.writer(writeFile)
    writer.writerow(['inputfile', 'inference'])
    for file in glob.glob("{}*.wav".format(args.audio)):

        fin = wave.open(file, 'rb')
        fs = fin.getframerate()
        if fs != SAMPLE_RATE:
            print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs, SAMPLE_RATE), file=sys.stderr)
            fs, audio = convert_samplerate(file)  # resample the current file, not the args.audio prefix
        else:
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

        audio_length = fin.getnframes() * (1/SAMPLE_RATE)
        fin.close()

        print('Running inference for {}'.format(file), file=sys.stderr)
        inference_start = timer()
        if args.extended:
            print(metadata_to_string(ds.sttWithMetadata(audio, fs)))
        else:
            #print(ds.stt(audio, fs))
            writer.writerow(["{}".format(file),"{}".format(ds.stt(audio, fs))])
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)

    writeFile.close()
Example 21
def DeepSpeech(Window, SpeechToNLPQueue, wavefile):

    # Create Signal Object
    SpeechSignal = GUISignal()
    SpeechSignal.signal.connect(Window.UpdateSpeechBox)

    MsgSignal = GUISignal()
    MsgSignal.signal.connect(Window.UpdateMsgBox)

    # References to models:
    model = 'DeepSpeech_Models/output_graph.pbmm'
    alphabet = 'DeepSpeech_Models/alphabet.txt'
    lm = 'DeepSpeech_Models/lm.binary'
    trie = 'DeepSpeech_Models/trie'

    print('Loading model from file {}'.format(model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if lm and trie:
        print('Loading language model from files {} {}'.format(lm, trie),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end),
              file=sys.stderr)

    audio = wavefile

    fin = wave.open(audio, 'rb')
    fs = fin.getframerate()
    if fs != 16000:
        print(
            'Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'
            .format(fs),
            file=sys.stderr)
        fs, audio = convert_samplerate(audio)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1 / 16000)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    output = ds.stt(audio, fs)
    print(output)
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length),
          file=sys.stderr)

    QueueItem = SpeechNLPItem(output, True, 0, 0, 'Speech')
    SpeechToNLPQueue.put(QueueItem)
    SpeechSignal.signal.emit([QueueItem])
Example 22
class DeepSpeechRecognizer:

    def __init__(self):

        self.file_path = Path(__file__).parent

        self.model = Model('/Users/shihangyu/Scripts/python/stt_server/model/deepspeech-0.6.1-models/output_graph.pbmm',
                           aBeamWidth=500)

        self.desired_sample_rate = self.model.sampleRate()

        self.logger = getLogger(self.__module__)

        self.tmp_path = self.file_path / 'tmp.wav'

    def __convert_samplerate(self, audio_path):
        sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - '.format(
            quote(audio_path), self.desired_sample_rate)
        try:
            output = subprocess.check_output(shlex.split(sox_cmd), stderr=subprocess.PIPE)
        except subprocess.CalledProcessError as e:
            raise RuntimeError('SoX returned non-zero status: {}'.format(e.stderr))
        except OSError as e:
            raise OSError(e.errno,
                          'SoX not found, use {}hz files or install it: {}'.format(self.desired_sample_rate,
                                                                                   e.strerror))

        return self.desired_sample_rate, np.frombuffer(output, np.int16)

    def inference(self, audio_path):

        try:
            fin = wave.open(audio_path, 'rb')
        except Exception:
            # not a plain WAV file: decode with librosa and rewrite as a 16 kHz WAV
            x, _ = librosa.load(str(audio_path), sr=16000)

            sf.write(str(self.tmp_path), x, 16000)

            fin = wave.open(str(self.tmp_path), 'rb')

        fs = fin.getframerate()

        if fs != self.desired_sample_rate:
            # self.logger.warning(f'Warning: original sample rate ({fs}) is different than {self.desired_sample_rate}hz. '
            #                     f'Resampling might produce erratic speech recognition.')
            fs, audio = self.__convert_samplerate(audio_path)
        else:
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

        fin.close()

        output = self.model.stt(audio)

        self.logger.debug(f"DeepSpeechRecognizer inference output: {output}")

        return output
Example 23
class EnglishASR(object):
    def __init__(self, model, scorer):
        self.model = Model(model)
        self.model.enableExternalScorer(scorer)

    def recognize(self, wav_path):
        fs, audio = wavfile.read(wav_path)
        assert fs == 16000
        result = self.model.stt(audio)
        return result
Example 24
    def SpeechToText(self, audio_file):
        input_graph = "deepspeech-0.5.1-models/output_graph.pbmm"
        alphabet = "deepspeech-0.5.1-models/alphabet.txt"
        deepSpeech = Model(input_graph, 26, 9, alphabet, 500)
        fs, audio = wav.read(audio_file)
        text_data = deepSpeech.stt(audio, fs)
        print(text_data)
        with open('out_text_data.txt', 'w') as f:
            f.write(text_data)
        return text_data
Example 25
def predict_speech_to_text(stream_file):
    # Initialize the model
    speech_model = Model(MODEL_PATH)

    # Enable language scorer to improve the accuracy
    speech_model.enableExternalScorer(SCORER_PATH)
    # You can play with setting the model Beam Width, Scorer language model weight and word insertion weight

    # Use scipy to convert the wav file into a numpy array
    _, audio = wav.read(stream_file)
    return speech_model.stt(audio)
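Note: the comment above mentions the tunable decoder settings. With the DeepSpeech 0.7+ API they could be set inside predict_speech_to_text roughly as follows; the values are illustrative, not tuned:

speech_model.setBeamWidth(500)               # wider beam: slower, sometimes more accurate
speech_model.setScorerAlphaBeta(0.93, 1.18)  # LM weight (alpha), word insertion weight (beta)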
Example 26
def main(argv):
    if len(argv) < 1:
        print("No .wav File given.")
        return

    ds = Model(MODEL_FILE, 500)
    ds.enableDecoderWithLM(LANG_MODEL, TRIE_FILE, 1.50, 2.25)

    fs, audio = wav.read(argv[0])
    data = ds.stt(audio)
    print(data)
Example 27
class SpeechToTextEngine:
    def __init__(self, model_path, scorer_path):
        self.model = Model(model_path=model_path)
        self.model.enableExternalScorer(scorer_path=scorer_path)

    def run(self, audio):
        audio = normalize_audio(audio)
        audio = BytesIO(audio)
        with wave.open(audio, 'rb') as wav:
            audio = np.frombuffer(wav.readframes(wav.getnframes()), np.int16)
        result = self.model.stt(audio_buffer=audio)
        return result
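Note: normalize_audio is not shown. One plausible implementation, assuming the pydub package and that the goal is 16 kHz mono 16-bit WAV bytes, which is what the model expects:

from io import BytesIO

from pydub import AudioSegment


def normalize_audio(audio_bytes):
    # Re-encode arbitrary input audio as 16 kHz, mono, 16-bit WAV bytes.
    segment = (AudioSegment.from_file(BytesIO(audio_bytes))
               .set_frame_rate(16000)
               .set_channels(1)
               .set_sample_width(2))
    out = BytesIO()
    segment.export(out, format='wav')
    return out.getvalue()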
Example 28
class Tester(BaseTester):

    name = 'DeepSpeech'

    audio_format = RATE16K_MONO_WAV

    def __init__(self, *args, **kwargs):
        super(Tester, self).__init__(*args, **kwargs)

        files = [
            args_lm,
            args_trie,
            args_model,
            # args_alphabet,
        ]
        for f in files:
            assert os.path.isfile(f), 'File %s does not exist.' % f

        print('Loading model from file %s' % (args_model), file=sys.stderr)
        model_load_start = timer()
        # self.ds = Model(args_model, N_FEATURES, N_CONTEXT, args_alphabet, BEAM_WIDTH)
        self.ds = Model(args_model, BEAM_WIDTH)
        model_load_end = timer() - model_load_start
        print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

        # if args_lm and args_trie:
        print('Loading language model from files %s %s' % (args_lm, args_trie),
              file=sys.stderr)
        lm_load_start = timer()
        # self.ds.enableDecoderWithLM(args_alphabet, args_lm, args_trie, LM_ALPHA, LM_BETA)
        self.ds.enableDecoderWithLM(args_lm, args_trie, LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end),
              file=sys.stderr)

    def audio_to_text(self, fn):
        fin = wave.open(fn, 'rb')
        fs = fin.getframerate()
        assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        audio_length = fin.getnframes() * (1. / fs)
        fin.close()

        print('Running inference.', file=sys.stderr)
        inference_start = timer()
        # text = self.ds.stt(audio, fs)
        text = self.ds.stt(audio)
        print('text:', text)
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' %
              (inference_end, audio_length),
              file=sys.stderr)
        return text
Example 29
        async def handle_message(sid, data):

            output_channel = SocketIOOutput(sio, sid, self.bot_message_evt,
                                            data['message'])
            if data['message'] == "/get_started":
                message = data['message']
            else:
                ##receive audio as .ogg
                received_file = sid + '.wav'

                urllib.request.urlretrieve(data['message'], received_file)
                path = os.path.dirname(__file__)
                #print(path)
                #print(sid)
                # convert .ogg file into int16 wave file by ffmpeg
                #-ar 44100
                os.system("ffmpeg -y -i {0} -ar 16000 output_{1}.wav".format(
                    received_file, sid))
                #os.system("ffmpeg -y -i {0} -c:a pcm_s161e output_{1}.wav".format(received_file,sid))
                N_FEATURES = 25
                N_CONTEXT = 9
                BEAM_WIDTH = 500
                LM_ALPHA = 0.75
                LM_BETA = 1.85

                ds = Model('deepspeech-0.5.1-models/output_graph.pbmm',
                           N_FEATURES, N_CONTEXT,
                           'deepspeech-0.5.1-models/alphabet.txt', BEAM_WIDTH)
                fs, audio = wav.read("output_{0}.wav".format(sid))
                message = ds.stt(audio, fs)

                #await self.sio.emit(self.bot_message_evt, response, room=socket_id)
                await sio.emit("user_uttered", {"text": message}, room=sid)
                #ffmpeg -i input.flv -f s16le -acodec pcm_s16le output.raw

            if self.session_persistence:
                #if not data.get("session_id"):
                #    logger.warning("A message without a valid sender_id "
                #                   "was received. This message will be "
                #                   "ignored. Make sure to set a proper "
                #                   "session id using the "
                #                   "`session_request` socketIO event.")
                #    return
                #sender_id = data['session_id']
                #else:
                sender_id = sid
            else:
                # without persistence, fall back to the socket id as well
                sender_id = sid

            message_rasa = UserMessage(message,
                                       output_channel,
                                       sender_id,
                                       input_channel=self.name())
            await on_new_message(message_rasa)
Example 30
def deepspeech_predict(wav_output):
    N_FEATURES = 25
    N_CONTEXT = 9
    BEAM_WIDTH = 500

    print("* Loading model")
    ds = Model('deepspeech-0.5.1-models/output_graph.pbmm', N_FEATURES,
               N_CONTEXT, 'deepspeech-0.5.1-models/alphabet.txt', BEAM_WIDTH)

    print("* Reading audio file")
    fs, audio = wav.read(wav_output)
    print("* Predicting")
    return ds.stt(audio, fs)
Example 31
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--alphabet', required=True,
                        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument('--lm', nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('--trie', nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    fin = wave.open(args.audio, 'rb')
    fs = fin.getframerate()
    if fs != 16000:
        print('Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'.format(fs), file=sys.stderr)
        fs, audio = convert_samplerate(args.audio)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1/16000)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, fs)))
    else:
        print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
Example 32
def tflite_worker(model, alphabet, lm, trie, queue_in, queue_out, gpu_mask):
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_mask)
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    ds.enableDecoderWithLM(alphabet, lm, trie, LM_ALPHA, LM_BETA)

    while True:
        msg = queue_in.get()

        fin = wave.open(msg['filename'], 'rb')
        fs = fin.getframerate()
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        audio_length = fin.getnframes() * (1/16000)
        fin.close()
    
        decoded = ds.stt(audio, fs)
        
        queue_out.put({'prediction': decoded, 'ground_truth': msg['transcript']})
        queue_in.task_done()
Example 33
class Tester(BaseTester):

    name = 'DeepSpeech'

    audio_format = RATE16K_MONO_WAV

    def __init__(self, *args, **kwargs):
        super(Tester, self).__init__(*args, **kwargs)

        files = [args_lm, args_trie, args_model, args_alphabet]
        for f in files:
            assert os.path.isfile(f)

        print('Loading model from file %s' % (args_model), file=sys.stderr)
        model_load_start = timer()
        self.ds = Model(args_model, N_FEATURES, N_CONTEXT, args_alphabet, BEAM_WIDTH)
        model_load_end = timer() - model_load_start
        print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

        if args_lm and args_trie:
            print('Loading language model from files %s %s' % (args_lm, args_trie), file=sys.stderr)
            lm_load_start = timer()
            self.ds.enableDecoderWithLM(args_alphabet, args_lm, args_trie, LM_ALPHA, LM_BETA)
            lm_load_end = timer() - lm_load_start
            print('Loaded language model in %0.3fs.' % (lm_load_end), file=sys.stderr)

    def audio_to_text(self, fn):
        fin = wave.open(fn, 'rb')
        fs = fin.getframerate()
        assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        audio_length = fin.getnframes() * (1/16000)
        fin.close()

        print('Running inference.', file=sys.stderr)
        inference_start = timer()
        text = self.ds.stt(audio, fs)
        print('text:', text)
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
        return text