# Imports required by the snippets below; the Alphabet/UTF8Alphabet import
# path follows the DeepSpeech 0.7-era layout and may differ across versions.
import shutil
import sys

import ds_ctcdecoder
from deepspeech_training.util.text import Alphabet, UTF8Alphabet
from ds_ctcdecoder import Scorer, Alphabet as NativeAlphabet


def create_bundle(
    alphabet_path,
    lm_path,
    vocab_path,
    package_path,
    force_utf8,
    default_alpha,
    default_beta,
):
    words = set()
    vocab_looks_char_based = True
    with open(vocab_path) as fin:
        for line in fin:
            for word in line.split():
                words.add(word.encode("utf-8"))
                if len(word) > 1:
                    vocab_looks_char_based = False
    print("{} unique words read from vocabulary file.".format(len(words)))
    cbm = "Looks" if vocab_looks_char_based else "Doesn't look"
    print("{} like a character based model.".format(cbm))

    if force_utf8 != None:  # pylint: disable=singleton-comparison
        use_utf8 = force_utf8.value
    else:
        use_utf8 = vocab_looks_char_based
        print("Using detected UTF-8 mode: {}".format(use_utf8))

    if use_utf8:
        serialized_alphabet = UTF8Alphabet().serialize()
    else:
        if not alphabet_path:
            raise RuntimeError("No --alphabet path specified, can't continue.")
        serialized_alphabet = Alphabet(alphabet_path).serialize()

    alphabet = NativeAlphabet()
    err = alphabet.deserialize(serialized_alphabet, len(serialized_alphabet))
    if err != 0:
        raise RuntimeError("Error loading alphabet: {}".format(err))

    scorer = Scorer()
    scorer.set_alphabet(alphabet)
    scorer.set_utf8_mode(use_utf8)
    scorer.reset_params(default_alpha, default_beta)
    err = scorer.load_lm(lm_path)
    # The package does not contain the trie yet, so DS_ERR_SCORER_NO_TRIE is
    # the expected return value here; anything else is a real error.
    if err != ds_ctcdecoder.DS_ERR_SCORER_NO_TRIE:
        print("Error loading language model file: 0x{:X}.".format(err))
        print(
            "See the error codes section in https://deepspeech.readthedocs.io for a description."
        )
        sys.exit(1)
    scorer.fill_dictionary(list(words))
    shutil.copy(lm_path, package_path)

    # append, not overwrite
    if scorer.save_dictionary(package_path, True):
        print("Package created in {}".format(package_path))
    else:
        print("Error when creating {}".format(package_path))
        sys.exit(1)
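# A minimal sketch of wiring create_bundle to a command line. The flag names
# and the ForcedMode wrapper are hypothetical, not the upstream script: all
# create_bundle requires is that force_utf8 be None or expose a .value
# attribute, as read above.
import argparse


class ForcedMode:
    # Hypothetical stand-in for the Tristate-style object create_bundle expects.
    def __init__(self, value):
        self.value = value


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Build a KenLM scorer package.")
    parser.add_argument("--alphabet", help="alphabet.txt path (unused in UTF-8 mode)")
    parser.add_argument("--lm", required=True, help="KenLM binary LM path")
    parser.add_argument("--vocab", required=True, help="vocabulary file path")
    parser.add_argument("--package", required=True, help="output package path")
    parser.add_argument("--force_utf8", choices=["true", "false"], default=None)
    parser.add_argument("--default_alpha", type=float, default=0.93)
    parser.add_argument("--default_beta", type=float, default=1.18)
    args = parser.parse_args()

    force = None if args.force_utf8 is None else ForcedMode(args.force_utf8 == "true")
    create_bundle(
        args.alphabet, args.lm, args.vocab, args.package,
        force, args.default_alpha, args.default_beta,
    )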
# Variant: exits via sys.exit() on errors and does not check the return
# values of load_lm() or save_dictionary().
def create_bundle(
    alphabet_path,
    lm_path,
    vocab_path,
    package_path,
    force_utf8,
    default_alpha,
    default_beta,
):
    words = set()
    vocab_looks_char_based = True
    with open(vocab_path) as fin:
        for line in fin:
            for word in line.split():
                words.add(word.encode("utf-8"))
                if len(word) > 1:
                    vocab_looks_char_based = False
    print("{} unique words read from vocabulary file.".format(len(words)))
    print(
        "{} like a character based model.".format(
            "Looks" if vocab_looks_char_based else "Doesn't look"
        )
    )

    if force_utf8 != None:  # pylint: disable=singleton-comparison
        use_utf8 = force_utf8.value
        print("Forcing UTF-8 mode = {}".format(use_utf8))
    else:
        use_utf8 = vocab_looks_char_based

    if use_utf8:
        serialized_alphabet = UTF8Alphabet().serialize()
    else:
        if not alphabet_path:
            print("No --alphabet path specified, can't continue.")
            sys.exit(1)
        serialized_alphabet = Alphabet(alphabet_path).serialize()

    alphabet = NativeAlphabet()
    err = alphabet.deserialize(serialized_alphabet, len(serialized_alphabet))
    if err != 0:
        print("Error loading alphabet: {}".format(err))
        sys.exit(1)

    scorer = Scorer()
    scorer.set_alphabet(alphabet)
    scorer.set_utf8_mode(use_utf8)
    scorer.reset_params(default_alpha, default_beta)
    scorer.load_lm(lm_path)
    scorer.fill_dictionary(list(words))
    shutil.copy(lm_path, package_path)
    scorer.save_dictionary(package_path, True)  # append, not overwrite
    print("Package created in {}".format(package_path))
def create_bundle(
    alphabet_path,
    lm_path,
    vocab_path,
    package_path,
    force_utf8,
    default_alpha,
    default_beta,
):
    words = set()
    with open(vocab_path) as fin:
        for line in fin:
            for word in line.split():
                words.add(word.encode("utf-8"))

    if not alphabet_path:
        raise RuntimeError("No --alphabet path specified, can't continue.")
    serialized_alphabet = Alphabet(alphabet_path).serialize()

    alphabet = NativeAlphabet()
    err = alphabet.deserialize(serialized_alphabet, len(serialized_alphabet))
    if err != 0:
        raise RuntimeError("Error loading alphabet: {}".format(err))

    scorer = Scorer()
    scorer.set_alphabet(alphabet)
    scorer.reset_params(default_alpha, default_beta)
    scorer.load_lm(lm_path)
    # TODO: Why is this not working?
    # err = scorer.load_lm(lm_path)
    # if err != ds_ctcdecoder.DS_ERR_SCORER_NO_TRIE:
    #     print('Error loading language model file: 0x{:X}.'.format(err))
    #     print('See the error codes section in https://deepspeech.readthedocs.io for a description.')
    #     sys.exit(1)
    scorer.fill_dictionary(list(words))
    shutil.copy(lm_path, package_path)  # This is the problem!
    scorer.save_dictionary(package_path, True)  # append, not overwrite
    print("Package created in {}".format(package_path))
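# Since the copy-then-append step above is flagged as the problem, a quick
# sanity check is to try loading the finished package. Sketch only: it assumes
# the same ds_ctcdecoder wheel, and that a complete package (lm.binary with
# the dictionary/trie appended) loads with error code 0, while a bare
# lm.binary would report a missing trie.
from ds_ctcdecoder import Scorer


def smoke_test_package(package_path):
    scorer = Scorer()
    err = scorer.load_lm(package_path)
    print("load_lm returned 0x{:X}".format(err))
    return err == 0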
# Standard-library imports used by main(); LANG, ARGS, KENLM_BIN, STOP_TOKEN
# and the helpers (section, announce, maybe_download, maybe_ungzip,
# aggregate_counters, count_words, get_partial_path, join_files,
# get_serialized_utf8_alphabet) are assumed to be defined elsewhere in the
# same script.
import os
import shutil
import subprocess
import sys
from multiprocessing import Process, Queue


def main():
    alphabet_txt = os.path.join(LANG.model_dir, 'alphabet.txt')
    raw_txt_gz = os.path.join(LANG.model_dir, 'raw.txt.gz')
    unprepared_txt = os.path.join(LANG.model_dir, 'unprepared.txt')
    prepared_txt = os.path.join(LANG.model_dir, 'prepared.txt')
    vocabulary_txt = os.path.join(LANG.model_dir, 'vocabulary.txt')
    unfiltered_arpa = os.path.join(LANG.model_dir, 'unfiltered.arpa')
    filtered_arpa = os.path.join(LANG.model_dir, 'filtered.arpa')
    lm_binary = os.path.join(LANG.model_dir, 'lm.binary')
    kenlm_scorer = os.path.join(LANG.model_dir, 'kenlm.scorer')
    temp_prefix = os.path.join(LANG.model_dir, 'tmp')

    section('Writing alphabet file', empty_lines_before=1)
    with open(alphabet_txt, 'w', encoding='utf-8') as alphabet_file:
        alphabet_file.write('\n'.join(LANG.alphabet) + '\n')

    redo = ARGS.force_download

    section('Downloading text data')
    redo = maybe_download(LANG.text_url, raw_txt_gz, force=redo)

    section('Unzipping text data')
    redo = maybe_ungzip(raw_txt_gz, unprepared_txt, force=redo)

    redo = redo or ARGS.force_prepare

    section('Preparing text and building vocabulary')
    if redo or not os.path.isfile(prepared_txt) or not os.path.isfile(vocabulary_txt):
        redo = True
        announce('Preparing {} shards of "{}"...'.format(ARGS.workers, unprepared_txt))
        counters = Queue(ARGS.workers)
        source_bytes = os.path.getsize(unprepared_txt)
        aggregator_process = Process(target=aggregate_counters,
                                     args=(vocabulary_txt, source_bytes, counters))
        aggregator_process.start()
        counter_processes = list(map(lambda index: Process(target=count_words,
                                                           args=(index, counters)),
                                     range(ARGS.workers)))
        try:
            for p in counter_processes:
                p.start()
            for p in counter_processes:
                p.join()
            counters.put(STOP_TOKEN)
            aggregator_process.join()
            print('')
            partials = list(map(lambda i: get_partial_path(i), range(ARGS.workers)))
            join_files(partials, prepared_txt)
            for partial in partials:
                os.unlink(partial)
        except KeyboardInterrupt:
            aggregator_process.terminate()
            for p in counter_processes:
                p.terminate()
            raise
    else:
        announce('Files "{}" and \n\t"{}" existing - not preparing'.format(prepared_txt, vocabulary_txt))

    redo = redo or ARGS.force_generate

    section('Building unfiltered language model')
    if redo or not os.path.isfile(unfiltered_arpa):
        redo = True
        lmplz_args = [
            KENLM_BIN + '/lmplz',
            '--temp_prefix', temp_prefix,
            '--memory', '80%',
            '--discount_fallback',
            '--limit_vocab_file', vocabulary_txt,
            '--text', prepared_txt,
            '--arpa', unfiltered_arpa,
            '--skip', 'symbols',
            '--order', str(LANG.order)
        ]
        if len(LANG.prune) > 0:
            lmplz_args.append('--prune')
            lmplz_args.extend(list(map(str, LANG.prune)))
        subprocess.check_call(lmplz_args)
    else:
        announce('File "{}" existing - not generating'.format(unfiltered_arpa))

    section('Filtering language model')
    if redo or not os.path.isfile(filtered_arpa):
        redo = True
        with open(vocabulary_txt, 'rb') as vocabulary_file:
            vocabulary_content = vocabulary_file.read()
        subprocess.run([
            KENLM_BIN + '/filter',
            'single',
            'model:' + unfiltered_arpa,
            filtered_arpa
        ], input=vocabulary_content, check=True)
    else:
        announce('File "{}" existing - not filtering'.format(filtered_arpa))

    section('Generating binary representation')
    if redo or not os.path.isfile(lm_binary):
        redo = True
        subprocess.check_call([
            KENLM_BIN + '/build_binary', '-a', '255', '-q', '8', '-v', 'trie',
            filtered_arpa, lm_binary
        ])
    else:
        announce('File "{}" existing - not generating'.format(lm_binary))

    section('Building scorer')
    if redo or not os.path.isfile(kenlm_scorer):
        redo = True
        words = set()
        vocab_looks_char_based = True
        with open(vocabulary_txt) as vocabulary_file:
            for line in vocabulary_file:
                for word in line.split():
                    words.add(word.encode())
                    if len(word) > 1:
                        vocab_looks_char_based = False
        announce("{} unique words read from vocabulary file.".format(len(words)))
        announce(
            "{} like a character based model.".format(
                "Looks" if vocab_looks_char_based else "Doesn't look"
            )
        )
        if ARGS.alphabet_mode == 'auto':
            use_utf8 = vocab_looks_char_based
        elif ARGS.alphabet_mode == 'utf8':
            use_utf8 = True
        else:
            use_utf8 = False
        serialized_alphabet = get_serialized_utf8_alphabet() if use_utf8 \
            else LANG.get_serialized_alphabet()
        from ds_ctcdecoder import Scorer, Alphabet
        alphabet = Alphabet()
        err = alphabet.deserialize(serialized_alphabet, len(serialized_alphabet))
        if err != 0:
            announce('Error loading alphabet: {}'.format(err))
            sys.exit(1)
        scorer = Scorer()
        scorer.set_alphabet(alphabet)
        scorer.set_utf8_mode(use_utf8)
        scorer.reset_params(LANG.alpha, LANG.beta)
        scorer.load_lm(lm_binary)
        scorer.fill_dictionary(list(words))
        shutil.copy(lm_binary, kenlm_scorer)
        scorer.save_dictionary(kenlm_scorer, True)  # append, not overwrite
        announce('Package created in {}'.format(kenlm_scorer))
        announce('Testing package...')
        scorer = Scorer()
        scorer.load_lm(kenlm_scorer)
    else:
        announce('File "{}" existing - not building'.format(kenlm_scorer))
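# A sketch of consuming the finished kenlm.scorer with the DeepSpeech Python
# client (deepspeech 0.7+ API assumed); the acoustic model filename is a
# placeholder, and the alpha/beta values would normally come from LANG as above.
from deepspeech import Model

model = Model("output_graph.pbmm")          # placeholder acoustic model path
model.enableExternalScorer("kenlm.scorer")  # the package built by main()
model.setScorerAlphaBeta(0.93, 1.18)        # example values; tune per language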