Ejemplo n.º 1
0
def main_deepspeech(args):
    """Transcribe a single WAV file and print the transcript to stdout.

    Loads the acoustic model, optionally enables the language-model decoder
    (only when both ``args.lm`` and ``args.trie`` are truthy), reads
    ``args.audio`` and runs speech-to-text on it.  Progress and timing
    messages are written to stderr.

    :param args: Object exposing ``model``, ``alphabet``, ``lm``, ``trie``
        and ``audio`` attributes.  When ``None``, the value returned by
        ``parse_args_deep()`` is used instead.
    :raises AssertionError: If the WAV file is not sampled at 16000 Hz.
    """
    if args is None:
        args = parse_args_deep()

    print('Loading model from file %s' % (args.model), file=sys.stderr)
    load_started = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    load_seconds = timer() - load_started
    print('Loaded model in %0.3fs.' % (load_seconds), file=sys.stderr)

    use_lm = args.lm and args.trie
    if use_lm:
        print('Loading language model from files %s %s' % (args.lm, args.trie),
              file=sys.stderr)
        lm_started = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_seconds = timer() - lm_started
        print('Loaded language model in %0.3fs.' % (lm_seconds),
              file=sys.stderr)

    fs, samples = wave.read(args.audio)
    # The duration is derived assuming 16 kHz; the assertion right after
    # rejects any file for which that assumption does not hold.
    audio_length = len(samples) * (1 / 16000)
    assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"

    print('Running inference.', file=sys.stderr)
    inference_started = timer()
    transcript = ds.stt(samples, fs)
    print(transcript)
    # The reported time includes printing the transcript.
    inference_seconds = timer() - inference_started
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_seconds, audio_length),
          file=sys.stderr)
Ejemplo n.º 2
0
def build_model(init_settings):
    """Create a DeepSpeech model with its language-model decoder enabled.

    All values are read from the ``'deepspeech'`` section of
    *init_settings* and converted to the types the model API is called with
    (``str`` for paths, ``int`` for model dimensions, ``float`` for decoder
    weights).

    :param init_settings: Configparser (or any nested mapping) that has a
        ``'deepspeech'`` section.
    :return: The configured model, or ``None`` if anything raised while
        loading; in that case the exception is printed.
    """
    print('Loading DeepSpeech Models')
    try:
        section = init_settings['deepspeech']

        ds = Model(str(section['model_path']),
                   int(section['N_FEATURES']),
                   int(section['N_CONTEXT']),
                   str(section['alphabet_path']),
                   int(section['BEAM_WIDTH']))

        # Decoder settings are only looked up once the model itself exists.
        ds.enableDecoderWithLM(str(section['alphabet_path']),
                               str(section['lm_path']),
                               str(section['trie_path']),
                               float(section['LM_WEIGHT']),
                               float(section['WORD_COUNT_WEIGHT']),
                               float(section['VALID_WORD_COUNT_WEIGHT']))
    except Exception as e:
        print('Loading Error!')
        print(e)
        return None
    else:
        return ds
Ejemplo n.º 3
0
 def __init__(self, modelPath, alphabet, lmPath, trie, numFeatures=26, numContext=9, beamWidth=500):
     """Load the acoustic model and report the load time on stderr.

     :param modelPath: Path of the model file passed to ``Model``.
     :param alphabet: Path of the alphabet file passed to ``Model``.
     :param lmPath: Accepted but not used by this constructor.
     :param trie: Accepted but not used by this constructor.
     :param numFeatures: Feature count passed to ``Model``.
     :param numContext: Context size passed to ``Model``.
     :param beamWidth: Beam width passed to ``Model``.
     """
     print('Loading model from file %s' % modelPath, file=sys.stderr)
     started = timer()
     # Language-model decoding is not enabled here, which is why lmPath and
     # trie go unused.
     self.model = Model(modelPath, numFeatures, numContext, alphabet, beamWidth)
     elapsed = timer() - started
     print('Loaded model in %0.3fs.' % (elapsed), file=sys.stderr)
Ejemplo n.º 4
0
    def _worker_thread(self):
        """Load the model once, then serve commands taken from ``self._queue``.

        Each queue item is a sequence whose first element is the command:

        * ``'sample'`` -- the second element is a sample object with a
          ``wav_path`` attribute.  The file is transcribed, timing figures
          are printed, and ``self.inference_done`` is emitted with the
          sample and the transcript.
        * ``'stop'`` -- leave the loop.

        Any other command is ignored.
        """
        print('restoring from {}'.format(model_file))
        model = Model(model_file, N_INPUT, N_CONTEXT, ALPHABET_CONFIG_PATH,
                      BEAM_WIDTH)
        model.enableDecoderWithLM(ALPHABET_CONFIG_PATH, LM_BINARY_PATH,
                                  LM_TRIE_PATH, LM_WEIGHT, WORD_COUNT_WEIGHT,
                                  VALID_WORD_COUNT_WEIGHT)

        while True:
            cmd, *args = self._queue.get()
            if cmd == 'stop':
                break
            if cmd != 'sample':
                continue

            sample = args[0]
            # The context manager closes the WAV file once the frames are
            # read, so a long-running worker does not leak one handle per
            # sample.
            with wave.open(sample.wav_path) as wav_file:
                audio = np.frombuffer(
                    wav_file.readframes(wav_file.getnframes()),
                    dtype=np.int16)
                fs = wav_file.getframerate()

            start = time.time()
            result = model.stt(audio, fs)
            inference_time = time.time() - start
            wav_time = wav_length(sample.wav_path)
            # NOTE(review): '{:2f}' is a width-2 float with default
            # precision; '{:.2f}' may have been intended -- confirm.
            print('wav length: {}\ninference time: {}\nRTF: {:2f}'.format(
                wav_time, inference_time, inference_time / wav_time))
            self.inference_done.emit(sample, result)

        # NOTE(review): `sess` is not defined in this method; presumably a
        # module-level session object -- confirm it exists.
        sess.close()
Ejemplo n.º 5
0
    def __init__(self,
                 model_path,
                 alphabet_path,
                 language_model_path=None,
                 trie_path=None):
        """
        Create the acoustic model and, if requested, enable the language model.

        The language-model decoder is enabled only when both
        ``language_model_path`` and ``trie_path`` are given; whether it was
        enabled is recorded in ``self._with_language_model``.

        :param model_path: Absolute path to (acoustic) model file.
        :param alphabet_path: Absolute path to file containing alphabet.
        :param language_model_path: Optional absolute path to language model
        file.
        :param trie_path: Optional absolute path to trie.
        """

        # Parameter values follow the reference client:
        # https://github.com/mozilla/DeepSpeech/blob/master/native_client/python/client.py
        self._model = Model(aModelPath=model_path,
                            aNCep=26,
                            aNContext=9,
                            aAlphabetConfigPath=alphabet_path,
                            aBeamWidth=500)

        has_lm_files = (language_model_path is not None
                        and trie_path is not None)
        if has_lm_files:
            self._model.enableDecoderWithLM(aAlphabetConfigPath=alphabet_path,
                                            aLMPath=language_model_path,
                                            aTriePath=trie_path,
                                            aLMWeight=1.75,
                                            aWordCountWeight=1.0,
                                            aValidWordCountWeight=1.0)
        self._with_language_model = has_lm_files
Ejemplo n.º 6
0
def recognize(model="../models/output_graph.pb",
              audio="../audio/2830-3980-0043.wav",
              alphabet="../models/alphabet.txt",
              lm="../models/lm.binary",
              trie="../models/trie"):
    """Transcribe one WAV file and return the transcript.

    The transcript and all progress/timing messages are written to stderr.

    :param model: Path of the acoustic model file.
    :param audio: Path of the WAV file to transcribe.
    :param alphabet: Path of the alphabet file.
    :param lm: Path of the language model; the decoder is enabled only when
        both ``lm`` and ``trie`` are truthy.
    :param trie: Path of the language model trie.
    :return: The value returned by the model's ``stt`` call.
    :raises AssertionError: If the WAV file is not sampled at 16000 Hz.
    """
    print('Loading model from file %s' % (model), file=sys.stderr)
    load_started = timer()
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    load_seconds = timer() - load_started
    print('Loaded model in %0.3fs.' % (load_seconds), file=sys.stderr)

    if lm and trie:
        print('Loading language model from files %s %s' % (lm, trie),
              file=sys.stderr)
        lm_started = timer()
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_seconds = timer() - lm_started
        print('Loaded language model in %0.3fs.' % (lm_seconds),
              file=sys.stderr)

    fs, samples = wav.read(audio)
    # Duration is derived assuming 16 kHz; the assertion enforces that rate.
    audio_length = len(samples) * (1 / 16000)
    assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"

    print('Running inference.', file=sys.stderr)
    inference_started = timer()
    result = ds.stt(samples, fs)
    print(result, file=sys.stderr)
    # The reported time includes printing the transcript.
    inference_seconds = timer() - inference_started
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_seconds, audio_length),
          file=sys.stderr)
    return result
Ejemplo n.º 7
0
def load_model():
    """Load the model and language model from fixed paths under ./models.

    Timing messages are written to stderr.

    :return: The model with the language-model decoder enabled.
    """
    paths = {
        'model': './models/output_graph.pb',
        'alphabet': './models/alphabet.txt',
        'lm': './models/lm.binary',
        'trie': './models/trie',
        # Not read by this function.
        'audio': './sample_input.wav'
    }
    model_path, alphabet_path = paths['model'], paths['alphabet']
    lm_path, trie_path = paths['lm'], paths['trie']

    print('Loading model from file {}'.format(model_path), file=sys.stderr)
    load_started = timer()
    ds = Model(model_path, N_FEATURES, N_CONTEXT, alphabet_path, BEAM_WIDTH)
    load_seconds = timer() - load_started
    print('Loaded model in {:.3}s.'.format(load_seconds), file=sys.stderr)

    if lm_path and trie_path:
        print('Loading language model from files {} {}'.format(
            lm_path, trie_path),
              file=sys.stderr)
        lm_started = timer()
        ds.enableDecoderWithLM(alphabet_path,
                               lm_path,
                               trie_path,
                               aLMWeight=LM_WEIGHT,
                               aValidWordCountWeight=VALID_WORD_COUNT_WEIGHT,
                               aWordCountWeight=WORD_COUNT_WEIGHT)
        lm_seconds = timer() - lm_started
        print('Loaded language model in {:.3}s.'.format(lm_seconds),
              file=sys.stderr)
    return ds
Ejemplo n.º 8
0
    def __init__(self, vocabulary, graph="models/output_graph.pb",
                 alphabet="models/alphabet.txt"):
        """Create the logger and load the DeepSpeech model.

        :param vocabulary: Accepted but not used by this constructor.
        :param graph: Path of the model graph file.
        :param alphabet: Path of the alphabet file.
        """
        logger = logging.getLogger(__name__)
        self._logger = logger
        logger.debug(
            "Initializing DeepSpeech with graph '%s' and alphabet '%s'",
            graph, alphabet)
        # 26 features, context of 9 and beam width 500 are fixed here.
        self._model = Model(graph, 26, 9, alphabet, 500)
Ejemplo n.º 9
0
def main():
    """Transcribe every Flickr8k WAV file into a tab-separated text file.

    Loads the model and language model from the ``models`` directory, then
    writes one ``<basename>\\t<transcript>`` line per audio file to
    ``flickr_audio_transcription.txt``.  Files whose sample rate is not
    16 kHz are resampled first.  A ``ValueError`` raised while handling one
    file is reported and that file is skipped.
    """
    model = "models/output_graph.pb"
    alphabet = "models/alphabet.txt"
    lm = "models/lm.binary"
    trie = "models/trie"

    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)

    ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT, WORD_COUNT_WEIGHT,
                           VALID_WORD_COUNT_WEIGHT)

    with open("flickr_audio_transcription.txt", "w") as out:
        for audio_f in glob.glob(
                "/roaming/gchrupal/vgs/data/flickr8k/flickr_audio/wavs/*.wav"):
            print("Transcribing {}".format(audio_f))
            try:
                fs, audio = wav.read(audio_f)
                if fs != 16000:
                    if fs < 16000:
                        print(
                            'Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.'
                            % (fs),
                            file=sys.stderr)
                    # Resample the file being processed; there is no `args`
                    # object in this function.
                    fs, audio = convert_samplerate(audio_f)
                basename, _ = os.path.splitext(os.path.basename(audio_f))
                out.write("{}\t{}\n".format(basename, ds.stt(audio, fs)))
                # Flush per file so partial results survive an interruption.
                out.flush()
            except ValueError as e:
                print("Error: {}".format(e))
Ejemplo n.º 10
0
def main():
    """Command-line entry point: transcribe one WAV file and print the text.

    Positional arguments, in order: ``model``, ``audio``, ``alphabet``,
    ``lm``, ``trie``.  Every one is optional and falls back to its declared
    default.  The transcript goes to stdout; progress and timing messages go
    to stderr.

    :raises AssertionError: If the WAV file is not sampled at 16000 Hz.
    """
    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    # argparse only applies `default` to a positional when nargs is '?' or
    # '*'; without it these three arguments were required and their defaults
    # were never used.
    parser.add_argument('model',
                        type=str,
                        nargs='?',
                        help='Path to the model (protocol buffer binary file)',
                        default="models/output_graph.pb")
    parser.add_argument('audio',
                        type=str,
                        nargs='?',
                        help='Path to the audio file to run (WAV format)',
                        default="sample_input.wav")
    parser.add_argument(
        'alphabet',
        type=str,
        nargs='?',
        help=
        'Path to the configuration file specifying the alphabet used by the network',
        default="models/alphabet.txt")
    parser.add_argument('lm',
                        type=str,
                        nargs='?',
                        help='Path to the language model binary file',
                        default="models/lm.binary")
    parser.add_argument(
        'trie',
        type=str,
        nargs='?',
        help=
        'Path to the language model trie file created with native_client/generate_trie',
        default="models/trie")
    args = parser.parse_args()

    print('Loading model from file %s' % (args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files %s %s' % (args.lm, args.trie),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end),
              file=sys.stderr)

    fs, audio = wav.read(args.audio)
    # Duration is derived assuming 16 kHz; the assertion enforces that rate.
    audio_length = len(audio) * (1 / 16000)
    assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length),
          file=sys.stderr)
    def __init__(self):
        """Load the model and enable its language-model decoder.

        All paths and weights come from constants defined outside this
        method.
        """
        ds = Model(MODEL_PATH, N_FEATURES, N_CONTEXT, ALPHABET_PATH,
                   BEAM_WIDTH)
        self._model = ds

        ds.enableDecoderWithLM(ALPHABET_PATH, LANGUAGE_MODEL_PATH, TRIE_PATH,
                               LM_WEIGHT, WORD_COUNT_WEIGHT,
                               VALID_WORD_COUNT_WEIGHT)
Ejemplo n.º 12
0
def main(model, alphabet, lm, trie, dest):
    """Transcribe the numbered WAV clips found under *dest*.

    One clip is processed per directory entry of *dest*; clip ``i`` is read
    from ``dest + str(i) + '.wav'``, so *dest* has to end with a path
    separator.  Each transcript is printed to stdout and timing messages go
    to stderr.

    :param model: Path of the acoustic model file.
    :param alphabet: Path of the alphabet file.
    :param lm: Path of the language model; the decoder is enabled only when
        both ``lm`` and ``trie`` are truthy.
    :param trie: Path of the language model trie.
    :param dest: Directory prefix containing the clips.
    :return: List of transcripts, in clip order.
    """
    print('Loading model from file %s' % (model), file=sys.stderr)
    load_started = timer()
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    load_seconds = timer() - load_started
    print('Loaded model in %0.3fs.' % (load_seconds), file=sys.stderr)

    if lm and trie:
        print('Loading language model from files %s %s' % (lm, trie), file=sys.stderr)
        lm_started = timer()
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_seconds = timer() - lm_started
        print('Loaded language model in %0.3fs.' % (lm_seconds), file=sys.stderr)

    print('Running inference.', file=sys.stderr)
    # Only the number of directory entries matters; file names are derived
    # from the running index.
    clip_count = len(os.listdir(dest))

    subs = []
    for index in range(clip_count):
        clip_path = dest + str(index) + '.wav'
        fs, audio = wav.read(clip_path)

        if fs < 16000:
            print('Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.' % (fs), file=sys.stderr)

        # Every clip goes through convert_samplerate, whatever its original
        # rate; the data read above is only used for the warning.
        fs, audio = convert_samplerate(clip_path)
        audio_length = len(audio) * (1 / 16000)

        inference_started = timer()
        subs.append(ds.stt(audio, fs))
        print(subs[-1])

        inference_seconds = timer() - inference_started
        print('Inference took %0.3fs for %0.3fs audio file.' % (inference_seconds, audio_length), file=sys.stderr)

    return subs
Ejemplo n.º 13
0
def main():
    """Command-line entry point: transcribe one WAV file and print the text.

    Positional arguments, in order: ``model``, ``alphabet``, optional ``lm``
    and ``trie``, then ``audio``.  Audio that is not sampled at 16 kHz is
    passed through ``convert_samplerate`` first.  The transcript goes to
    stdout; progress and timing messages go to stderr.
    """
    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    parser.add_argument(
        'model', help='Path to the model (protocol buffer binary file)')
    parser.add_argument(
        'alphabet',
        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument(
        'lm', nargs='?', help='Path to the language model binary file')
    parser.add_argument(
        'trie',
        nargs='?',
        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument(
        'audio', help='Path to the audio file to run (WAV format)')
    args = parser.parse_args()

    print('Loading model from file %s' % (args.model), file=sys.stderr)
    load_started = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    load_seconds = timer() - load_started
    print('Loaded model in %0.3fs.' % (load_seconds), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files %s %s' % (args.lm, args.trie),
              file=sys.stderr)
        lm_started = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_seconds = timer() - lm_started
        print('Loaded language model in %0.3fs.' % (lm_seconds),
              file=sys.stderr)

    fs, samples = wav.read(args.audio)
    if fs < 16000:
        print(
            'Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.'
            % (fs),
            file=sys.stderr)
    if fs != 16000:
        fs, samples = convert_samplerate(args.audio)
    audio_length = len(samples) * (1 / 16000)

    print('Running inference.', file=sys.stderr)
    inference_started = timer()
    transcript = ds.stt(samples, fs)
    print(transcript)
    # The reported time includes printing the transcript.
    inference_seconds = timer() - inference_started
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_seconds, audio_length),
          file=sys.stderr)
Ejemplo n.º 14
0
    def setup_model(model_path, alphabet, lm, trie):
        """Create a model, enabling the LM decoder when both files are given.

        :param model_path: Path of the acoustic model file.
        :param alphabet: Path of the alphabet file.
        :param lm: Path of the language model (optional).
        :param trie: Path of the language model trie (optional).
        :return: The model, or ``None`` when ``model_path`` or ``alphabet``
            is falsy.
        """
        if not (model_path and alphabet):
            return None

        print("creating model {} {}".format(model_path, alphabet))
        ds_model = Model(model_path, N_FEATURES, N_CONTEXT, alphabet,
                         BEAM_WIDTH)

        if lm and trie:
            ds_model.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                                         WORD_COUNT_WEIGHT,
                                         VALID_WORD_COUNT_WEIGHT)
        return ds_model
Ejemplo n.º 15
0
 def __init__(self, *args, **kwargs):
     """Initialise the STT plugin and load the DeepSpeech model.

     All arguments are forwarded to ``plugin.STTPlugin.__init__``.  The
     graph and alphabet paths are read from the ``'deepspeech'`` entry of
     ``self.profile``.
     """
     plugin.STTPlugin.__init__(self, *args, **kwargs)
     self._logger = logging.getLogger(__name__)

     settings = self.profile['deepspeech']
     self._plugin_config = settings
     graph, alphabet = settings['graph'], settings['alphabet']

     self._logger.debug(
         "Initializing DeepSpeech with graph '%s' and alphabet '%s'",
         graph, alphabet)
     # 26 features, context of 9 and beam width 500 are fixed here.
     self._model = Model(graph, 26, 9, alphabet, 500)
Ejemplo n.º 16
0
        def setup_model(model_path, alphabet, lm, trie):
            """Create a model, enabling the LM decoder when both files are given.

            Progress is reported through ``log``.

            :param model_path: Path of the acoustic model file.
            :param alphabet: Path of the alphabet file.
            :param lm: Path of the language model (optional).
            :param trie: Path of the language model trie (optional).
            :return: The created model.
            """
            log("creating model {} {}...".format(model_path, alphabet))
            ds_model = Model(model_path, N_FEATURES, N_CONTEXT, alphabet,
                             BEAM_WIDTH)

            with_lm = lm and trie
            if with_lm:
                ds_model.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                                             WORD_COUNT_WEIGHT,
                                             VALID_WORD_COUNT_WEIGHT)

            log("model is ready.")
            return ds_model
Ejemplo n.º 17
0
def load_model():
    """Load the model and language model from files in the working directory.

    :return: The model with the language-model decoder enabled.
    """
    alphabet_path = 'alphabet.txt'

    ds = Model('output_graph.pb', N_FEATURES, N_CONTEXT, alphabet_path,
               BEAM_WIDTH)
    ds.enableDecoderWithLM(alphabet_path, 'lm.binary', 'trie', LM_WEIGHT,
                           WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    return ds
Ejemplo n.º 18
0
 def load_model(self):
     """Load the model described by this instance and print load times.

     Reads ``graph``, ``n_features``, ``n_context``, ``alphabet`` and
     ``beam_width`` from the instance and stores the result in
     ``self.model``.  The language-model decoder is enabled only when both
     ``self.lm`` and ``self.trie`` are set.
     """
     began = timer()
     self.model = Model(self.graph, self.n_features, self.n_context, self.alphabet, self.beam_width)
     elapsed = timer() - began
     print('Loaded model in %0.3fs.' % (elapsed))

     if self.lm is None or self.trie is None:
         return

     began = timer()
     self.model.enableDecoderWithLM(
         self.alphabet, self.lm,
         self.trie, self.lm_weight,
         self.word_count_weight,
         self.valid_word_count_weight
     )
     elapsed = timer() - began
     print('Loaded language model in %0.3fs.' % (elapsed))
Ejemplo n.º 19
0
def loadModel():
    """Load the model and language model into the module-level ``ds``.

    Paths and weights come from constants defined outside this function;
    progress and timing messages are written to stderr.
    """
    global ds

    print('Loading model from file %s' % (MODEL_FILE), file=sys.stderr)
    load_started = timer()
    ds = Model(MODEL_FILE, N_FEATURES, N_CONTEXT, ALPHABET_FILE, BEAM_WIDTH)
    load_seconds = timer() - load_started
    print('Loaded model in %0.3fs.' % (load_seconds), file=sys.stderr)

    print('Loading language model from files %s %s' %
          (LM_BINARY_FILE, TRIE_FILE),
          file=sys.stderr)
    lm_started = timer()
    ds.enableDecoderWithLM(ALPHABET_FILE, LM_BINARY_FILE, TRIE_FILE, LM_WEIGHT,
                           WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    lm_seconds = timer() - lm_started
    print('Loaded language model in %0.3fs.' % (lm_seconds), file=sys.stderr)
Ejemplo n.º 20
0
class SpeechServerMain(AppConfig):
    """App configuration that holds the DeepSpeech model as a class attribute.

    Every statement in this class body runs when the class is defined, so
    the model (and the language model, when configured) is loaded at that
    point and shared through ``SpeechServerMain.ds``.
    """
    name = 'speech_server_main'
    # File paths are read from the DeepSpeech configuration object.
    conf = config.ConfigDeepSpeech()
    model = conf.get_config('model')
    alphabet = conf.get_config('alphabet')
    lm = conf.get_config('lm')
    trie = conf.get_config('trie')

    # Built while the class body executes.
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    # The language-model decoder is enabled only when both paths are truthy.
    if lm and trie:
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)

    def ready(self):
        """Print a start-up message."""
        print("Deepspeech Server Initialization")
    def load_model(self):
        """Load the model and language model into ``self.ds``.

        Paths and weights come from constants defined outside this method;
        progress and timing messages are printed to stdout.
        """
        print('Loading model from file %s' % (MODEL_PATH))
        load_started = timer()
        self.ds = Model(MODEL_PATH, N_FEATURES, N_CONTEXT, ALPHABET_PATH,
                        BEAM_WIDTH)
        load_seconds = timer() - load_started
        print('Loaded model in %0.3fs.' % (load_seconds))

        print('Loading language model from files %s %s' % (LM_PATH, TRIE_PATH))
        lm_started = timer()
        self.ds.enableDecoderWithLM(ALPHABET_PATH, LM_PATH, TRIE_PATH,
                                    LM_WEIGHT, WORD_COUNT_WEIGHT,
                                    VALID_WORD_COUNT_WEIGHT)
        lm_seconds = timer() - lm_started
        print('Loaded language model in %0.3fs.' % (lm_seconds))
Ejemplo n.º 22
0
def stt(audioPath):
    """Transcribe the WAV file at *audioPath* and return the text.

    A new model is built from the configured paths on every call; the
    language-model decoder is enabled only when both the ``lm`` and ``trie``
    settings are truthy.

    :param audioPath: Path of the WAV file to transcribe.
    :return: The value returned by the model's ``stt`` call.
    """
    model_path = conf.get_config('model')
    alphabet_path = conf.get_config('alphabet')
    lm_path = conf.get_config('lm')
    trie_path = conf.get_config('trie')

    ds = Model(model_path, N_FEATURES, N_CONTEXT, alphabet_path, BEAM_WIDTH)
    if lm_path and trie_path:
        ds.enableDecoderWithLM(alphabet_path, lm_path, trie_path, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)

    fs, samples = wav.read(audioPath)
    return ds.stt(samples, fs)
Ejemplo n.º 23
0
def main():
    """Command-line entry point: transcribe every WAV file matching a glob.

    Positional arguments, in order: ``model``, ``audio`` (a glob pattern),
    ``alphabet``, then optional ``lm`` and ``trie``.  For each matching file,
    in sorted order, the transcript is printed and written next to the audio
    as ``<name>.txt``.  Files that already have such a ``.txt`` are skipped.

    :raises AssertionError: If a WAV file is not sampled at 16000 Hz.
    """
    parser = argparse.ArgumentParser(description='Benchmarking tooling for DeepSpeech native_client.')
    parser.add_argument(
        'model', type=str,
        help='Path to the model (protocol buffer binary file)')
    parser.add_argument(
        'audio', type=str,
        help='Path to the audio file to run (WAV format)')
    parser.add_argument(
        'alphabet', type=str,
        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument(
        'lm', type=str, nargs='?',
        help='Path to the language model binary file')
    parser.add_argument(
        'trie', type=str, nargs='?',
        help='Path to the language model trie file created with native_client/generate_trie')
    args = parser.parse_args()

    print('Loading model from file %s' % (args.model), file=sys.stderr)
    load_started = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    load_seconds = timer() - load_started
    print('Loaded model in %0.3fs.' % (load_seconds), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files %s %s' % (args.lm, args.trie), file=sys.stderr)
        lm_started = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_seconds = timer() - lm_started
        print('Loaded language model in %0.3fs.' % (lm_seconds), file=sys.stderr)

    for path in sorted(glob.glob(args.audio)):
        target = os.path.splitext(path)[0] + '.txt'
        # An existing transcript means this file was handled in an earlier
        # run.
        if os.path.exists(target):
            continue

        fs, samples = wav.read(path)
        # Duration is derived assuming 16 kHz; the assertion enforces it.
        audio_length = len(samples) * (1 / 16000)
        assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"

        print('Running inference of %s.' % path, file=sys.stderr)
        inference_started = timer()
        text = ds.stt(samples, fs)
        print(text)
        inference_seconds = timer() - inference_started
        print('Inference took %0.3fs for %0.3fs audio file.' % (inference_seconds, audio_length), file=sys.stderr)

        with open(target, 'w') as out:
            out.write(text)
Ejemplo n.º 24
0
def load_model(model_path, alphabet_path, lm_path, trie_path):
    """Load a DeepSpeech model and, optionally, its language model.

    Progress and timing messages are written to stderr.

    :param model_path: Path of the acoustic model file.
    :param alphabet_path: Path of the alphabet file.
    :param lm_path: Path of the language model; the decoder is enabled only
        when both ``lm_path`` and ``trie_path`` are truthy.
    :param trie_path: Path of the language model trie.
    :return: The loaded model.
    """
    print('Loading model from file %s' % (model_path), file=sys.stderr)
    model_load_start = timer()
    ds = Model(model_path, N_FEATURES, N_CONTEXT, alphabet_path, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    if lm_path and trie_path:
        print('Loading language model from files %s %s' % (lm_path, trie_path),
              file=sys.stderr)
        lm_load_start = timer()
        # Use this function's own parameters; it has no `args` object.
        ds.enableDecoderWithLM(alphabet_path, lm_path, trie_path, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end),
              file=sys.stderr)
    return ds
Ejemplo n.º 25
0
    def __init__(self, model, alphabet, lm=None, trie=None):
        """Load the model into ``self.ds`` and optionally enable the LM decoder.

        Progress and timing messages are written to stderr.

        :param model: Path of the acoustic model file.
        :param alphabet: Path of the alphabet file.
        :param lm: Path of the language model; the decoder is enabled only
            when neither ``lm`` nor ``trie`` is ``None``.
        :param trie: Path of the language model trie.
        """
        print('Loading model from file %s' % (model), file=sys.stderr)
        load_started = timer()
        self.ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
        load_seconds = timer() - load_started
        print('Loaded model in %0.3fs.' % (load_seconds), file=sys.stderr)

        if lm is None or trie is None:
            return

        print('Loading language model from files %s %s' % (lm, trie),
              file=sys.stderr)
        lm_started = timer()
        self.ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                                    WORD_COUNT_WEIGHT,
                                    VALID_WORD_COUNT_WEIGHT)
        lm_seconds = timer() - lm_started
        print('Loaded language model in %0.3fs.' % (lm_seconds),
              file=sys.stderr)
Ejemplo n.º 26
0
def main():
    """Transcribe the first two minutes of a fixed audio file.

    The source file is set to a 16 kHz frame rate, cut to its first
    120000 ms, exported as ``data/testing.wav`` and transcribed without a
    language model.  The transcript goes to stdout; progress and timing
    messages go to stderr.

    :raises AssertionError: If the exported WAV is not sampled at 16000 Hz.
    """
    print('Loading model from file %s' % MODEL, file=sys.stderr)
    load_started = timer()
    ds = Model(MODEL, N_FEATURES, N_CONTEXT, ALPHABET, BEAM_WIDTH)
    load_seconds = timer() - load_started
    print('Loaded model in %0.3fs.' % load_seconds, file=sys.stderr)

    # To decode with a language model, enable it here:
    #
    # print('Loading language model from files %s %s' % (LANGUAGE_MODEL, TRIE), file=sys.stderr)
    # lm_load_start = timer()
    # ds.enableDecoderWithLM(ALPHABET, LANGUAGE_MODEL, TRIE, LM_WEIGHT,
    #                        WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    # lm_load_end = timer() - lm_load_start
    # print('Loaded language model in %0.3fs.' % lm_load_end, file=sys.stderr)

    source_path = 'data/sesq316qna.mp3'
    wav_path = 'data/testing.wav'

    # Set the frame rate to 16 kHz and keep the first 2 minutes
    # (2 * 60 * 1000 ms).
    recording = AudioSegment.from_file(source_path).set_frame_rate(16000)
    excerpt = recording[:120000]
    excerpt.export(wav_path, format="wav")

    # Read the exported file back with the WAV reader.
    fs, samples = wav.read(wav_path)
    # Duration is derived assuming 16 kHz; the assertion enforces that rate.
    audio_length = len(samples) * (1 / 16000)
    assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"

    print('Running inference.', file=sys.stderr)
    inference_started = timer()
    prediction_text = ds.stt(samples, fs)
    print(prediction_text)
    # The reported time includes printing the transcript.
    inference_seconds = timer() - inference_started
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_seconds, audio_length),
          file=sys.stderr)
Ejemplo n.º 27
0
    def __init__(self, model_path):
        """Load the model and language model stored in directory *model_path*.

        The directory is expected to hold ``output_graph.pb``,
        ``alphabet.txt``, ``lm.binary`` and ``trie``.

        :param model_path: Directory containing the model files.
        """
        # Values follow the reference client, see
        # https://github.com/mozilla/DeepSpeech/blob/master/native_client/python/client.py
        n_features, n_context, beam_width = 26, 9, 500
        lm_weight = 1.75
        word_count_weight = 1.00
        valid_word_count_weight = 1.00

        graph_file = os.path.join(model_path, "output_graph.pb")
        alphabet_file = os.path.join(model_path, "alphabet.txt")
        lm_file = os.path.join(model_path, "lm.binary")
        trie_file = os.path.join(model_path, "trie")

        self.model = Model(graph_file, n_features, n_context, alphabet_file,
                           beam_width)
        self.model.enableDecoderWithLM(alphabet_file, lm_file, trie_file,
                                       lm_weight, word_count_weight,
                                       valid_word_count_weight)
Ejemplo n.º 28
0
def Deep():
    """Ask for confirmation, transcribe the selected file, show the result.

    After the user confirms, the model is loaded, the file named by
    ``audiofile`` is read (and passed through sox when it is not 16 kHz), a
    result window is built, and the transcript is stored in ``word``.
    ``ValueError`` and ``IOError`` are reported through error dialogs.
    """
    try:
        if not tkMessageBox.askyesno("Confirmation",
                                     "Would you like to proceed?"):
            return

        BEAM_WIDTH = 500
        LM_WEIGHT = 1.75
        WORD_COUNT_WEIGHT = 1.00
        VALID_WORD_COUNT_WEIGHT = 1.00
        N_FEATURES = 26
        N_CONTEXT = 9

        ds = Model('models/models.pb', N_FEATURES, N_CONTEXT,
                   'models/alphabet.txt', BEAM_WIDTH)

        fs, audio = wav.read(audiofile.get())

        if fs != 16000:
            # NOTE(review): the output path './' and the read of './' below
            # look like a directory rather than a WAV file -- confirm that
            # this resampling branch works as intended.
            cbn = sox.Combiner()
            cbn.convert(samplerate=16000, n_channels=1)
            cbn.build([str(audiofile.get())], './', 'concatenate')
            fs, audio = wav.read('./')

        audio_length = len(audio) * (1 / 16000)

        # Result window: bordered frame holding a caption and the transcript.
        resultpage = Toplevel(parent)
        resultpage.title("Result")
        result_border = ttk.Frame(resultpage, padding=(12, 12, 12, 12))
        result_border.pack()
        result_page = Frame(result_border, bg="white")
        result_page.pack()

        caption = Tkinter.Label(result_page,
                                text="What I've heard from you:",
                                font=14,
                                bg="white")
        caption.grid(row=1, column=1, sticky=E)
        transcript_label = Tkinter.Label(result_page, textvariable=word,
                                         font=12, bg="white")
        transcript_label.grid(row=2, column=2, sticky=E)

        # The label is bound to `word`, so setting it displays the text.
        word.set(ds.stt(audio, fs))

    except ValueError:
        tkMessageBox.showerror("Error!", "Only 16000Hz WAV files supported!")
    except IOError:
        tkMessageBox.showerror("Error!", "No file uploaded!")
Ejemplo n.º 29
0
def load_model():
    """Load the model and language model from files in the working directory.

    Nothing is printed while loading.

    :return: The model with the language-model decoder enabled.
    """
    graph_file = 'output_graph.pb'
    alphabet_file = 'alphabet.txt'
    lm_file = 'lm.binary'
    trie_file = 'trie'

    ds = Model(graph_file, N_FEATURES, N_CONTEXT, alphabet_file, BEAM_WIDTH)
    ds.enableDecoderWithLM(alphabet_file, lm_file, trie_file, LM_WEIGHT,
                           WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    return ds
    def __init__(self):
        """Load the model and language model named in ``shared_params``.

        The model is stored in ``self.ds``; progress and timing messages are
        logged at INFO level.
        """
        params = shared_params

        logging.info('Loading model from file %s' % (params.DS_MODEL))
        load_started = timer()
        self.ds = Model(params.DS_MODEL, N_FEATURES, N_CONTEXT,
                        params.DS_ALPHABET, params.BEAM_WIDTH)
        load_seconds = timer() - load_started
        logging.info('Loaded model in %0.3fs.' % (load_seconds))

        logging.info('Loading language model from files %s %s' %
                     (params.DS_LANGUAGE_MODEL, params.DS_TRIE))
        lm_started = timer()
        self.ds.enableDecoderWithLM(params.DS_ALPHABET,
                                    params.DS_LANGUAGE_MODEL,
                                    params.DS_TRIE,
                                    params.LM_WEIGHT,
                                    params.WORD_COUNT_WEIGHT,
                                    params.VALID_WORD_COUNT_WEIGHT)
        lm_seconds = timer() - lm_started
        logging.info('Loaded language model in %0.3fs.' % (lm_seconds))