Code Example #1
def recognize(model="../models/output_graph.pb",
              audio="../audio/2830-3980-0043.wav",
              alphabet="../models/alphabet.txt",
              lm="../models/lm.binary",
              trie="../models/trie"):
    print('Loading model from file %s' % (model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    if lm and trie:
        print('Loading language model from files %s %s' % (lm, trie),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end),
              file=sys.stderr)

    fs, audio = wav.read(audio)
    # We can assume 16kHz
    audio_length = len(audio) * (1 / 16000)
    assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    result = ds.stt(audio, fs)
    print(result, file=sys.stderr)
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length),
          file=sys.stderr)
    return result
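Code Example #1 above, like most snippets on this page, omits its module-level preamble. A minimal sketch of what these excerpts assume, using the constants documented in Code Example #31 further down; the import path shown is the deepspeech 0.1.x Python package layout and may differ in other versions:

import sys
import scipy.io.wavfile as wav
from timeit import default_timer as timer
from deepspeech.model import Model  # assumed: deepspeech 0.1.x package layout

# Decoder and feature constants, as documented in Code Example #31
BEAM_WIDTH = 500                # beam width of the CTC beam search decoder
LM_WEIGHT = 1.75                # alpha: language model weight
WORD_COUNT_WEIGHT = 1.00        # beta: word insertion weight (penalty)
VALID_WORD_COUNT_WEIGHT = 1.00  # insertion weight for in-vocabulary words
N_FEATURES = 26                 # number of MFCC features; tied to the trained graph
N_CONTEXT = 9                   # context window size; tied to the trained graph

Individual examples additionally use numpy, wave, os, glob, or argparse as shown.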
Code Example #2
File: recogonition.py Project: diggerdu/Debussy
class transciber(object):
    def __init__(self,
                 modelPath,
                 alphabet,
                 lmPath,
                 trie,
                 numFeatures=26,
                 numContext=9,
                 beamWidth=500):
        print('Loading model from file %s' % modelPath, file=sys.stderr)
        model_load_start = timer()
        self.model = Model(modelPath, numFeatures, numContext, alphabet,
                           beamWidth)
        self.model.enableDecoderWithLM(alphabet, lmPath, trie, LM_WEIGHT,
                                       WORD_COUNT_WEIGHT,
                                       VALID_WORD_COUNT_WEIGHT)
        model_load_end = timer() - model_load_start
        print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    def transcribe(self, audioPath):
        fs, audio = wav.read(audioPath)
        audio_length = len(audio) * (1 / 16000)
        label = self.model.stt(audio, fs)
        print(label)
        return label
Code Example #3
File: dshttp.py Project: xdraylin/DeepSpeechHTTP
class DeepSpeech:
    def __init__(self, model, alphabet, lm=None, trie=None):
        print('Loading model from file %s' % (model), file=sys.stderr)
        model_load_start = timer()
        self.ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
        model_load_end = timer() - model_load_start
        print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

        if lm is not None and trie is not None:
            print('Loading language model from files %s %s' % (lm, trie),
                  file=sys.stderr)
            lm_load_start = timer()
            self.ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                                        WORD_COUNT_WEIGHT,
                                        VALID_WORD_COUNT_WEIGHT)
            lm_load_end = timer() - lm_load_start
            print('Loaded language model in %0.3fs.' % (lm_load_end),
                  file=sys.stderr)

    def stt(self, audio_file):
        fs, audio = wav.read(audio_file)
        audio_length = len(audio) * (1 / 16000)
        print('Running inference.', file=sys.stderr)
        inference_start = timer()
        stt_result = self.ds.stt(audio, fs)
        print('Return result: ', stt_result)
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' %
              (inference_end, audio_length),
              file=sys.stderr)
        return stt_result
Code Example #4
class DeepSpeechImp:
    ds = None

    def __init__(self):

        logging.info('Loading model from file %s' % (shared_params.DS_MODEL))
        model_load_start = timer()
        self.ds = Model(shared_params.DS_MODEL, N_FEATURES, N_CONTEXT,
                        shared_params.DS_ALPHABET, shared_params.BEAM_WIDTH)
        model_load_end = timer() - model_load_start
        logging.info('Loaded model in %0.3fs.' % (model_load_end))
        logging.info('Loading language model from files %s %s' %
                     (shared_params.DS_LANGUAGE_MODEL, shared_params.DS_TRIE))
        lm_load_start = timer()
        self.ds.enableDecoderWithLM(shared_params.DS_ALPHABET,
                                    shared_params.DS_LANGUAGE_MODEL,
                                    shared_params.DS_TRIE,
                                    shared_params.LM_WEIGHT,
                                    shared_params.WORD_COUNT_WEIGHT,
                                    shared_params.VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        logging.info('Loaded language model in %0.3fs.' % (lm_load_end))

    def process_audio(self, audio_path):
        try:
            fs, audio = wav.read(audio_path)
            return self.ds.stt(audio, fs)
        except Exception as ex:
            logging.error(str(ex))
            return ""

    def __del__(self):
        del self.ds
Code Example #5
    def _worker_thread(self):
        print('restoring from {}'.format(model_file))
        model = Model(model_file, N_INPUT, N_CONTEXT, ALPHABET_CONFIG_PATH,
                      BEAM_WIDTH)
        model.enableDecoderWithLM(ALPHABET_CONFIG_PATH, LM_BINARY_PATH,
                                  LM_TRIE_PATH, LM_WEIGHT, WORD_COUNT_WEIGHT,
                                  VALID_WORD_COUNT_WEIGHT)

        while True:
            cmd, *args = self._queue.get()
            if cmd == 'sample':
                sample = args[0]
                file = wave.open(sample.wav_path)
                audio = np.frombuffer(file.readframes(file.getnframes()),
                                      dtype=np.int16)
                fs = file.getframerate()
                start = time.time()
                result = model.stt(audio, fs)
                inference_time = time.time() - start
                wav_time = wav_length(sample.wav_path)
                print('wav length: {}\ninference time: {}\nRTF: {:.2f}'.format(
                    wav_time, inference_time, inference_time / wav_time))
                self.inference_done.emit(sample, result)
            elif cmd == 'stop':
                break

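Code Example #5 calls wav_length without defining it (np, wave, and time are also assumed imports). A plausible sketch of such a helper, hypothetical and using only the standard-library wave module:

import wave

def wav_length(path):
    # Duration in seconds: number of frames divided by the frame rate.
    with wave.open(path, 'rb') as f:
        return f.getnframes() / f.getframerate()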
Code Example #6
def main_deepspeech(args):
    args = parse_args_deep() if args is None else args
    print('Loading model from file %s' % (args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files %s %s' % (args.lm, args.trie),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end),
              file=sys.stderr)

    fs, audio = wav.read(args.audio)
    # We can assume 16kHz
    audio_length = len(audio) * (1 / 16000)
    assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length),
          file=sys.stderr)
Code Example #7
class SpeechRecognizer:
    def __init__(self):
        self._model = Model(MODEL_PATH, N_FEATURES, N_CONTEXT, ALPHABET_PATH,
                            BEAM_WIDTH)

        self._model.enableDecoderWithLM(ALPHABET_PATH, LANGUAGE_MODEL_PATH,
                                        TRIE_PATH, LM_WEIGHT,
                                        WORD_COUNT_WEIGHT,
                                        VALID_WORD_COUNT_WEIGHT)

    def speech_to_text(self, audio_buffer, sample_rate):
        app.logger.info('processing audio file')
        audio = self._process_audio_data(audio_buffer, sample_rate)
        app.logger.info('starting recognition')

        start = time()
        text = self._model.stt(audio, SAMPLE_RATE)
        end = time()
        app.logger.info('finished in {:.3f}s'.format(end - start))

        return text

    def _process_audio_data(self, audio_buffer, original_sample_rate):
        audio = np.frombuffer(audio_buffer, dtype=np.int16)
        if original_sample_rate != SAMPLE_RATE:
            audio = self._resample(audio, original_sample_rate)
        return audio

    def _resample(self, audio, original_sample_rate):
        audio_length = len(audio) / original_sample_rate
        samples = int(audio_length * SAMPLE_RATE)
        return signal.resample(audio, samples).astype(np.int16)
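Code Example #7 leans on names its excerpt omits: np (numpy), signal (scipy.signal), time (time.time), a Flask-style app logger, and a SAMPLE_RATE constant. Given that every other snippet on this page feeds DeepSpeech 16 kHz audio, the constant is presumably:

SAMPLE_RATE = 16000  # assumed: the rate the acoustic model was trained on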
Code Example #8
File: transcribe_flickr8k.py Project: gchrupala/vgs
def main():
    model = "models/output_graph.pb"
    alphabet = "models/alphabet.txt"
    lm = "models/lm.binary"
    trie = "models/trie"

    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)

    ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT, WORD_COUNT_WEIGHT,
                           VALID_WORD_COUNT_WEIGHT)

    with open("flickr_audio_transcription.txt", "w") as out:
        for audio_f in glob.glob(
                "/roaming/gchrupal/vgs/data/flickr8k/flickr_audio/wavs/*.wav"):
            print("Transcribing {}".format(audio_f))
            try:
                fs, audio = wav.read(audio_f)
                if fs != 16000:
                    if fs < 16000:
                        print(
                            'Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.'
                            % (fs),
                            file=sys.stderr)
                    fs, audio = convert_samplerate(audio_f)
                audio_length = len(audio) * (1 / 16000)
                basename, ext = os.path.splitext(os.path.basename(audio_f))
                out.write("{}\t{}\n".format(basename, ds.stt(audio, fs)))
                out.flush()
            except ValueError as e:
                print("Error: {}".format(e))
Code Example #9
def load_model():
    args = {
        'model': './models/output_graph.pb',
        'alphabet': './models/alphabet.txt',
        'lm': './models/lm.binary',
        'trie': './models/trie',
        'audio': './sample_input.wav'
    }

    print('Loading model from file {}'.format(args['model']), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args['model'], N_FEATURES, N_CONTEXT, args['alphabet'],
               BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args['lm'] and args['trie']:
        print('Loading language model from files {} {}'.format(
            args['lm'], args['trie']),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args['alphabet'],
                               args['lm'],
                               args['trie'],
                               aLMWeight=LM_WEIGHT,
                               aValidWordCountWeight=VALID_WORD_COUNT_WEIGHT,
                               aWordCountWeight=WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end),
              file=sys.stderr)
    return ds
Code Example #10
def build_model(init_settings):
    """

    :param init_settings: Configparser
    :return:
    """
    print('Loading DeepSpeech Models')
    try:
        ds = Model(str(init_settings['deepspeech']['model_path']),
                   int(init_settings['deepspeech']['N_FEATURES']),
                   int(init_settings['deepspeech']['N_CONTEXT']),
                   str(init_settings['deepspeech']['alphabet_path']),
                   int(init_settings['deepspeech']['BEAM_WIDTH']))
        ds.enableDecoderWithLM(
            str(init_settings['deepspeech']['alphabet_path']),
            str(init_settings['deepspeech']['lm_path']),
            str(init_settings['deepspeech']['trie_path']),
            float(init_settings['deepspeech']['LM_WEIGHT']),
            float(init_settings['deepspeech']['WORD_COUNT_WEIGHT']),
            float(init_settings['deepspeech']['VALID_WORD_COUNT_WEIGHT']))
        return ds
    except Exception as e:
        print('Loading Error!')
        print(e)
        return None
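build_model expects a configparser object with a [deepspeech] section supplying the keys it reads. A hypothetical settings file wired up in Python, with values borrowed from the constants used elsewhere on this page:

import configparser

init_settings = configparser.ConfigParser()
init_settings.read_string("""
[deepspeech]
model_path = models/output_graph.pb
alphabet_path = models/alphabet.txt
lm_path = models/lm.binary
trie_path = models/trie
N_FEATURES = 26
N_CONTEXT = 9
BEAM_WIDTH = 500
LM_WEIGHT = 1.75
WORD_COUNT_WEIGHT = 1.00
VALID_WORD_COUNT_WEIGHT = 1.00
""")

ds = build_model(init_settings)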
Code Example #11
File: client.py Project: Kutim/Run-Black-Box-Audio
def main():
    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    parser.add_argument('model',
                        type=str,
                        help='Path to the model (protocol buffer binary file)',
                        default="models/output_graph.pb")
    parser.add_argument('audio',
                        type=str,
                        help='Path to the audio file to run (WAV format)',
                        default="sample_input.wav")
    parser.add_argument(
        'alphabet',
        type=str,
        help=
        'Path to the configuration file specifying the alphabet used by the network',
        default="models/alphabet.txt")
    parser.add_argument('lm',
                        type=str,
                        nargs='?',
                        help='Path to the language model binary file',
                        default="models/lm.binary")
    parser.add_argument(
        'trie',
        type=str,
        nargs='?',
        help=
        'Path to the language model trie file created with native_client/generate_trie',
        default="models/trie")
    args = parser.parse_args()

    print('Loading model from file %s' % (args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files %s %s' % (args.lm, args.trie),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end),
              file=sys.stderr)

    fs, audio = wav.read(args.audio)
    # We can assume 16kHz
    audio_length = len(audio) * (1 / 16000)
    assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length),
          file=sys.stderr)
Code Example #12
def main(model, alphabet, lm, trie, dest):
    print('Loading model from file %s' % (model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    if lm and trie:
        print('Loading language model from files %s %s' % (lm, trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end), file=sys.stderr)

    print('Running inference.', file=sys.stderr)
    clips = os.listdir(dest)  # dest is the directory holding the extracted clips

    subs = []

    for i, clip in enumerate(clips):
        clip_path = dest + str(i) + '.wav'
        fs, audio = wav.read(clip_path)

        if fs != 16000:
            if fs < 16000:
                print('Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.' % (fs), file=sys.stderr)
            # Resample to 16kHz before running inference.
            fs, audio = convert_samplerate(clip_path)
        audio_length = len(audio) * (1 / 16000)

        inference_start = timer()
        subs.append(ds.stt(audio, fs))
        print(subs[-1])

        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)

    return subs
Code Example #13
File: client.py Project: FeherBalazs/DeepSpeech-1
def main():
    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    parser.add_argument('model',
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument(
        'alphabet',
        help=
        'Path to the configuration file specifying the alphabet used by the network'
    )
    parser.add_argument('lm',
                        nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument(
        'trie',
        nargs='?',
        help=
        'Path to the language model trie file created with native_client/generate_trie'
    )
    parser.add_argument('audio',
                        help='Path to the audio file to run (WAV format)')
    args = parser.parse_args()

    print('Loading model from file %s' % (args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files %s %s' % (args.lm, args.trie),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end),
              file=sys.stderr)

    fs, audio = wav.read(args.audio)
    if fs != 16000:
        if fs < 16000:
            print(
                'Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.'
                % (fs),
                file=sys.stderr)
        fs, audio = convert_samplerate(args.audio)
    audio_length = len(audio) * (1 / 16000)

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length),
          file=sys.stderr)
Code Example #14
    def setup_model(model_path, alphabet, lm, trie):
        if model_path and alphabet:
            print("creating model {} {}".format(model_path, alphabet))
            ds_model = Model(model_path, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)

            if lm and trie:
                ds_model.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
            return ds_model
        return None
Code Example #15
def load_model():
    model_path = 'output_graph.pb'
    alphabet_path = 'alphabet.txt'
    lm_path = 'lm.binary'
    trie_path = 'trie'

    ds = Model(model_path, N_FEATURES, N_CONTEXT, alphabet_path, BEAM_WIDTH)

    ds.enableDecoderWithLM(alphabet_path, lm_path, trie_path, LM_WEIGHT,
                           WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    return ds
Code Example #16
        def setup_model(model_path, alphabet, lm, trie):
            log("creating model {} {}...".format(model_path, alphabet))
            ds_model = Model(model_path, N_FEATURES, N_CONTEXT, alphabet,
                             BEAM_WIDTH)

            if lm and trie:
                ds_model.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                                             WORD_COUNT_WEIGHT,
                                             VALID_WORD_COUNT_WEIGHT)
            log("model is ready.")
            return ds_model
Code Example #17
File: client.py Project: RawStewage/DeepSpeech
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model',
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--alphabet',
                        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument('--lm', nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('--trie', nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument('--audio',
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--version', action='store_true',
                        help='Print version and exit')
    args = parser.parse_args()

    if args.version:
        print_versions()
        return 0

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    fin = wave.open(args.audio, 'rb')
    fs = fin.getframerate()
    if fs != 16000:
        print('Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'.format(fs), file=sys.stderr)
        fs, audio = convert_samplerate(args.audio)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1/16000)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
Code Example #18
def loadModel():
    global ds
    print('Loading model from file %s' % (MODEL_FILE), file=sys.stderr)
    model_load_start = timer()
    ds = Model(MODEL_FILE, N_FEATURES, N_CONTEXT, ALPHABET_FILE, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)
    print('Loading language model from files %s %s' %
          (LM_BINARY_FILE, TRIE_FILE),
          file=sys.stderr)
    lm_load_start = timer()
    ds.enableDecoderWithLM(ALPHABET_FILE, LM_BINARY_FILE, TRIE_FILE, LM_WEIGHT,
                           WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    lm_load_end = timer() - lm_load_start
    print('Loaded language model in %0.3fs.' % (lm_load_end), file=sys.stderr)
Code Example #19
def stt(audioPath):

    model = conf.get_config('model')
    alphabet = conf.get_config('alphabet')
    lm = conf.get_config('lm')
    trie = conf.get_config('trie')

    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    if lm and trie:
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)

    fs, audio = wav.read(audioPath)
    text = ds.stt(audio, fs)

    return text
Code Example #20
def main():
    parser = argparse.ArgumentParser(description='Benchmarking tooling for DeepSpeech native_client.')
    parser.add_argument('model', type=str,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('audio', type=str,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('alphabet', type=str,
                        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument('lm', type=str, nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('trie', type=str, nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    args = parser.parse_args()

    print('Loading model from file %s' % (args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files %s %s' % (args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end), file=sys.stderr)

    for path in sorted(glob.glob(args.audio)):
        target = os.path.splitext(path)[0] + '.txt'
        if os.path.exists(target):
            continue

        fs, audio = wav.read(path)
        # We can assume 16kHz
        audio_length = len(audio) * (1 / 16000)
        assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"
    
        print('Running inference of %s.' % path, file=sys.stderr)
        inference_start = timer()
        text = ds.stt(audio, fs)
        print(text)
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)

        with open(target, 'w') as out:
            out.write(text)
Code Example #21
File: engine.py Project: yushenxiang/stt-benchmark
class MozillaDeepSpeechASREngine(ASREngine):
    """https://github.com/mozilla/DeepSpeech"""
    def __init__(self,
                 model_path,
                 alphabet_path,
                 language_model_path=None,
                 trie_path=None):
        """
        Constructor.

        :param model_path: Absolute path to (acoustic) model file.
        :param alphabet_path: Absolute path to file containing alphabet.
        :param language_model_path: Absolute path to language model file. This parameter is optional. Set to
        enable decoding with language model.
        :param trie_path: Absolute path to trie. This parameter is optional. Set to enable decoding with language model.
        """

        # https://github.com/mozilla/DeepSpeech/blob/master/native_client/python/client.py
        self._model = Model(aModelPath=model_path,
                            aNCep=26,
                            aNContext=9,
                            aAlphabetConfigPath=alphabet_path,
                            aBeamWidth=500)

        if language_model_path is not None and trie_path is not None:
            self._model.enableDecoderWithLM(aAlphabetConfigPath=alphabet_path,
                                            aLMPath=language_model_path,
                                            aTriePath=trie_path,
                                            aLMWeight=1.75,
                                            aWordCountWeight=1.0,
                                            aValidWordCountWeight=1.0)
            self._with_language_model = True
        else:
            self._with_language_model = False

    def transcribe(self, path):
        pcm, sample_rate = soundfile.read(path)
        pcm = (np.iinfo(np.int16).max * pcm).astype(np.int16)

        return self._model.stt(pcm, aSampleRate=sample_rate)

    def __str__(self):
        if self._with_language_model:
            return 'Mozilla DeepSpeech (with language model)'
        else:
            return 'Mozilla DeepSpeech'
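Hypothetical usage of the engine above, with model paths borrowed from the other examples on this page; the ASREngine base class and the soundfile/numpy imports sit outside this excerpt:

engine = MozillaDeepSpeechASREngine(model_path='models/output_graph.pb',
                                    alphabet_path='models/alphabet.txt',
                                    language_model_path='models/lm.binary',
                                    trie_path='models/trie')
print(engine)                                # 'Mozilla DeepSpeech (with language model)'
print(engine.transcribe('sample_input.wav'))  # decoded transcript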
Code Example #22
def load_model(model_path, alphabet_path, lm_path, trie_path):
    print('Loading model from file %s' % (model_path), file=sys.stderr)
    model_load_start = timer()
    ds = Model(model_path, N_FEATURES, N_CONTEXT, alphabet_path, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    if lm_path and trie_path:
        print('Loading language model from files %s %s' % (lm_path, trie_path),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(alphabet_path, lm_path, trie_path, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end),
              file=sys.stderr)
    return ds
Code Example #23
def load_model():
    model_path = 'output_graph.pb'
    alphabet_path = 'alphabet.txt'
    lm_path = 'lm.binary'
    trie_path = 'trie'

    #print('Loading model from file %s' % (model_path), file=sys.stderr)
    #model_load_start = timer()
    ds = Model(model_path, N_FEATURES, N_CONTEXT, alphabet_path, BEAM_WIDTH)
    #model_load_end = timer() - model_load_start
    #print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    #print('Loading language model from files %s %s' % (lm_path, trie_path), file=sys.stderr)
    #lm_load_start = timer()
    ds.enableDecoderWithLM(alphabet_path, lm_path, trie_path, LM_WEIGHT,
                           WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    #lm_load_end = timer() - lm_load_start
    #print('Loaded language model in %0.3fs.' % (lm_load_end), file=sys.stderr)
    return ds
Code Example #24
File: client.py Project: RussellCloud/DeepSpeech
def main():
    parser = argparse.ArgumentParser(description='Benchmarking tooling for DeepSpeech native_client.')
    parser.add_argument('model', type=str,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('alphabet', type=str,
                        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument('lm', type=str, nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('trie', type=str, nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument('audio', type=str,
                        help='Path to the audio file to run (WAV format)')
    args = parser.parse_args()

    print('Loading model from file %s' % (args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files %s %s' % (args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end), file=sys.stderr)

    fs, audio = wav.read(args.audio)
    if fs != 16000:
        if fs < 16000:
            print('Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.' % (fs), file=sys.stderr)
        fs, audio = convert_samplerate(args.audio)
    audio_length = len(audio) * ( 1 / 16000)

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
Code Example #25
class DeepSpeech:
    """Wrap DeepSpeech and provide the methods we need"""

    def __init__(self, settings):

        self.beam_width = 1024
        self.lm_weight = 1.75
        self.word_count_weight = 1.00
        self.valid_word_count_weight = 1.00
        self.n_features = 26
        self.n_context = 9
        self.alphabet = settings.get('alphabet')
        self.lm = settings.get('lm')
        self.trie = settings.get('trie')
        self.graph = settings.get('graph')

    def load_model(self):
        start = timer()
        self.model = Model(self.graph, self.n_features, self.n_context, self.alphabet, self.beam_width)
        end = timer()
        print('Loaded model in %0.3fs.' % (end - start))
        if self.lm is not None and self.trie is not None:
            start = timer()
            self.model.enableDecoderWithLM(
                self.alphabet, self.lm,
                self.trie, self.lm_weight,
                self.word_count_weight,
                self.valid_word_count_weight
            )
            end = timer()
            print('Loaded language model in %0.3fs.' % (end - start))

    def oneshoot(self, wav_file):
        fs, audio = wav.read(wav_file)
        start = timer()
        result = self.model.stt(audio, fs)
        latency = timer() - start
        audio_length = len(audio) * ( 1 / 16000)
        return result, latency
Code Example #26
class SpeechToText():
    def __init__(self, model_path):
        # Defined constants. See https://github.com/mozilla/DeepSpeech/blob/master/native_client/python/client.py
        BEAM_WIDTH = 500
        LM_WEIGHT = 1.75
        WORD_COUNT_WEIGHT = 1.00
        VALID_WORD_COUNT_WEIGHT = 1.00
        N_FEATURES = 26
        N_CONTEXT = 9

        model = os.path.join(model_path, "output_graph.pb")
        alphabet = os.path.join(model_path, "alphabet.txt")
        lm = os.path.join(model_path, "lm.binary")
        trie = os.path.join(model_path, "trie")

        self.model = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
        self.model.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                                       WORD_COUNT_WEIGHT,
                                       VALID_WORD_COUNT_WEIGHT)

    def run(self, audio, fs):
        return self.model.stt(audio, fs)
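Hypothetical usage of SpeechToText, assuming a 16 kHz mono WAV and scipy.io.wavfile imported as wav:

import scipy.io.wavfile as wav

stt = SpeechToText('./models')
fs, audio = wav.read('sample_input.wav')
print(stt.run(audio, fs))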
Code Example #27
def load_model(args):
    BEAM_WIDTH = 500
    LM_WEIGHT = 1.75
    WORD_COUNT_WEIGHT = 1.00
    VALID_WORD_COUNT_WEIGHT = 1.00
    N_FEATURES = 26
    N_CONTEXT = 9

    print('Loading model from file %s' % (args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files %s %s' % (args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end), file=sys.stderr)

    return ds
Code Example #28
    def load_ds_model(self):
        """ Loading the deepspeech module.
            return: deepspeech object
        """
        logging.info('Loading model from file %s' % (self.model))
        model_load_start = timer()
        ds = Model(self.model, self.N_FEATURES, self.N_CONTEXT, self.alphabet,
                   self.BEAM_WIDTH)
        model_load_end = timer() - model_load_start
        logging.info('Loaded model in %0.3fs.' % (model_load_end))

        # Load the lm and trie only if the path is given
        if self.lm and self.trie:
            logging.info('Loading language model from files %s %s' %
                         (self.lm, self.trie))
            lm_load_start = timer()
            ds.enableDecoderWithLM(self.alphabet, self.lm, self.trie,
                                   self.LM_WEIGHT, self.WORD_COUNT_WEIGHT,
                                   self.VALID_WORD_COUNT_WEIGHT)
            lm_load_end = timer() - lm_load_start
            logging.info('Loaded language model in %0.3fs.' % (lm_load_end))

        return ds
Code Example #29
def recognize_deepspeech(audio):

    model = path.join(path.dirname(path.realpath(__file__)),
                      'models/output_graph.pb')
    alphabet = path.join(path.dirname(path.realpath(__file__)),
                         'models/alphabet.txt')
    lm = path.join(path.dirname(path.realpath(__file__)), 'models/lm.binary')
    trie = path.join(path.dirname(path.realpath(__file__)), 'models/trie')

    #print('Loading model from file %s' % (model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    #print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    if lm and trie:
        #print('Loading language model from files %s %s' % (lm, trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        #print('Loaded language model in %0.3fs.' % (lm_load_end), file=sys.stderr)

    audio_path = audio
    fs, audio = wav.read(audio_path)
    if fs != 16000:
        if fs < 16000:
            print(
                'Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.'
                % (fs),
                file=sys.stderr)
        # Resample from the file path; `audio` now holds samples, not the path.
        fs, audio = convert_samplerate(audio_path)
    audio_length = len(audio) * (1 / 16000)

    #print('Running inference.', file=sys.stderr)
    #inference_start = timer()
    #inference_end = timer() - inference_start
    return ds.stt(audio, fs)
Code Example #30
File: client.py Project: warent/DeepSpeech
def main():
    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    parser.add_argument('model',
                        type=str,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('audio',
                        type=str,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument(
        'alphabet',
        type=str,
        help=
        'Path to the configuration file specifying the alphabet used by the network'
    )
    parser.add_argument('lm',
                        type=str,
                        nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument(
        'trie',
        type=str,
        nargs='?',
        help=
        'Path to the language model trie file created with native_client/generate_trie'
    )
    args = parser.parse_args()

    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)

    if args.lm and args.trie:
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)

    fs, audio = wav.read(args.audio)
    print(ds.stt(audio, fs))
Code Example #31
def deepspeech_main():
    # These constants control the beam search decoder

    # Beam width used in the CTC decoder when building candidate transcriptions
    BEAM_WIDTH = 500

    # The alpha hyperparameter of the CTC decoder. Language Model weight
    LM_WEIGHT = 1.75

    # The beta hyperparameter of the CTC decoder. Word insertion weight (penalty)
    WORD_COUNT_WEIGHT = 1.00

    # Valid word insertion weight. This is used to lessen the word insertion penalty
    # when the inserted word is part of the vocabulary
    VALID_WORD_COUNT_WEIGHT = 1.00

    # These constants are tied to the shape of the graph used (changing them changes
    # the geometry of the first layer), so make sure you use the same constants that
    # were used during training

    # Number of MFCC features to use
    N_FEATURES = 26

    # Size of the context window used for producing timesteps in the input vector
    N_CONTEXT = 9

    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    parser.add_argument('model',
                        type=str,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('audio',
                        type=str,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument(
        'alphabet',
        type=str,
        help=
        'Path to the configuration file specifying the alphabet used by the network'
    )
    parser.add_argument('lm',
                        type=str,
                        nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument(
        'trie',
        type=str,
        nargs='?',
        help=
        'Path to the language model trie file created with native_client/generate_trie'
    )
    args = parser.parse_args()

    print('Loading model from file %s' % (args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files %s %s' % (args.lm, args.trie),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end),
              file=sys.stderr)

    fs, audio = wav.read(args.audio)
    # We can assume 16kHz
    audio_length = len(audio) * (1 / 16000)
    assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length),
          file=sys.stderr)
Code Example #32
File: main.py Project: stevemurr/deepspeech-server
if __name__ == '__main__':
    args, params, err = setup_args()
    if err:
        check_err(err)

    ds = Model(
        params["model"], 
        params["n_features"], 
        params["n_context"], 
        params["alphabet"], 
        params["beam_width"])
    ds.enableDecoderWithLM(
        params["alphabet"], 
        params["lm"], 
        params["trie"], 
        params["lm_weight"], 
        params["word_count_weight"], 
        params["valid_word_count_weight"])

    logger = setup_logger()
    routes = {
        "/api/reco": SpeechRecognitionResource(ds)
    } 
    
    api = setup_api(routes, middleware=logger)

    try:
        bjoern.run(api, host='0.0.0.0', port=args.port)
    except KeyboardInterrupt:
        sys.exit(0)