def create_missing_files(self):
    """Ensure derivable model files exist in ``self.model_dir``.

    Touches ``user_lexicon.txt`` so it always exists, then for each
    derivable file that is missing, creates it by copying its ``.base``
    counterpart.

    Raises:
        KaldiError: if a file is missing and its base file (the only
            source it could be regenerated from) is also missing.
    """
    utils.touch_file(os.path.join(self.model_dir, 'user_lexicon.txt'))

    def check_file(filename, src_filename):
        # Create missing file from its base file
        if not find_file(self.model_dir, filename):
            src = find_file(self.model_dir, src_filename)
            if not src:
                # Previously this crashed with AttributeError on None;
                # fail with a clear, actionable error instead.
                raise KaldiError("cannot find %r (nor %r to create it from) in model_dir: %r"
                    % (filename, src_filename, self.model_dir))
            dst = src.replace(src_filename, filename)
            shutil.copyfile(src, dst)

    check_file('words.txt', 'words.base.txt')
    check_file('align_lexicon.int', 'align_lexicon.base.int')
    check_file('lexiconp_disambig.txt', 'lexiconp_disambig.base.txt')
def __init__(self, model_dir=None, tmp_dir=None):
    """Initialize the model wrapper.

    Resolves and validates the executable, model, and temporary
    directories; checks the model's version marker; fills in any
    regenerable model files; builds the path dictionary used for
    command/format substitution; and (re)generates lexicon files when
    the cache indicates they are stale.

    Args:
        model_dir: path to the Kaldi model directory; defaults to
            ``defaults.DEFAULT_MODEL_DIR``.
        tmp_dir: path for cache/scratch files; defaults to the model
            directory path plus ``defaults.DEFAULT_TMP_DIR_SUFFIX``.

    Raises:
        KaldiError: if exec_dir/model_dir are missing, tmp_dir is an
            existing regular file, or the model version is incompatible,
            or required nonterminal symbols are absent.
    """
    show_donation_message()

    # os.path.join(..., '') normalizes each directory path to end with a
    # trailing separator.
    self.exec_dir = os.path.join(utils.exec_dir, '')
    self.model_dir = os.path.join(model_dir or defaults.DEFAULT_MODEL_DIR, '')
    self.tmp_dir = os.path.join(tmp_dir or (os.path.normpath(self.model_dir) + defaults.DEFAULT_TMP_DIR_SUFFIX), '')

    if not os.path.isdir(self.exec_dir):
        raise KaldiError("cannot find exec_dir: %r" % self.exec_dir,
            "are you sure you installed kaldi-active-grammar correctly?")
    if not os.path.isdir(self.model_dir):
        raise KaldiError("cannot find model_dir: %r" % self.model_dir)

    if not os.path.exists(self.tmp_dir):
        _log.warning("%s: creating tmp dir: %r" % (self, self.tmp_dir))
        os.mkdir(self.tmp_dir)
        # Marker file so users know the directory's contents are disposable.
        utils.touch_file(os.path.join(self.tmp_dir, "FILES_ARE_SAFE_TO_DELETE"))
    if os.path.isfile(self.tmp_dir):
        # tmp_dir exists but is a regular file, so it cannot be used.
        raise KaldiError("please specify an available tmp_dir, or remove %r" % self.tmp_dir)

    # Verify the model was built for this version of the library.
    version_file = os.path.join(self.model_dir, 'KAG_VERSION')
    if os.path.isfile(version_file):
        with open(version_file, 'r', encoding='utf-8') as f:
            model_version = f.read().strip()
            if model_version != REQUIRED_MODEL_VERSION:
                raise KaldiError("invalid model_dir version! please download a compatible model")
    else:
        _log.warning("model_dir has no version information; errors below may indicate an incompatible model")

    self.create_missing_files()
    self.check_user_lexicon()

    # Map of well-known file names to their resolved paths (plus the three
    # directories); used below for cache dependency tracking and for
    # str.format()-style substitution into command strings.
    self.files_dict = {
        'exec_dir': self.exec_dir,
        'model_dir': self.model_dir,
        'tmp_dir': self.tmp_dir,
        'words.txt': find_file(self.model_dir, 'words.txt', default=True),
        'words.base.txt': find_file(self.model_dir, 'words.base.txt', default=True),
        'phones.txt': find_file(self.model_dir, 'phones.txt', default=True),
        'align_lexicon.int': find_file(self.model_dir, 'align_lexicon.int', default=True),
        'align_lexicon.base.int': find_file(self.model_dir, 'align_lexicon.base.int', default=True),
        'disambig.int': find_file(self.model_dir, 'disambig.int', default=True),
        'L_disambig.fst': find_file(self.model_dir, 'L_disambig.fst', default=True),
        'tree': find_file(self.model_dir, 'tree', default=True),
        'final.mdl': find_file(self.model_dir, 'final.mdl', default=True),
        # 'g.irelabel': find_file(self.model_dir, 'g.irelabel', default=True),  # otf
        'user_lexicon.txt': find_file(self.model_dir, 'user_lexicon.txt', default=True),
        'left_context_phones.txt': find_file(self.model_dir, 'left_context_phones.txt', default=True),
        'nonterminals.txt': find_file(self.model_dir, 'nonterminals.txt', default=True),
        'wdisambig_phones.int': find_file(self.model_dir, 'wdisambig_phones.int', default=True),
        'wdisambig_words.int': find_file(self.model_dir, 'wdisambig_words.int', default=True),
        'lexiconp_disambig.txt': find_file(self.model_dir, 'lexiconp_disambig.txt', default=True),
        'lexiconp_disambig.base.txt': find_file(self.model_dir, 'lexiconp_disambig.base.txt', default=True),
    }
    self.files_dict.update({ k: '"%s"' % v for (k, v) in self.files_dict.items()
        if v and ' ' in v })  # Handle spaces in paths
    self.files_dict.update({ k.replace('.', '_'): v for (k, v) in self.files_dict.items() })  # For named placeholder access in str.format()

    self.fst_cache = utils.FSTFileCache(os.path.join(self.tmp_dir, defaults.FILE_CACHE_FILENAME), dependencies_dict=self.files_dict)

    # phone symbol -> integer id, from the model's phones.txt symbol table.
    self.phone_to_int_dict = { phone: i for phone, i in load_symbol_table(self.files_dict['phones.txt']) }
    self.lexicon = Lexicon(self.phone_to_int_dict.keys())

    # Nonterminal offsets are required for grammar support; their absence
    # indicates an incompatible/old model.
    self.nonterm_phones_offset = self.phone_to_int_dict.get('#nonterm_bos')
    if self.nonterm_phones_offset is None:
        raise KaldiError("missing nonterms in 'phones.txt'")
    self.nonterm_words_offset = symbol_table_lookup(self.files_dict['words.base.txt'], '#nonterm_begin')
    if self.nonterm_words_offset is None:
        raise KaldiError("missing nonterms in 'words.base.txt'")

    # Update files if needed, before loading words
    files = ['user_lexicon.txt', 'words.txt', 'align_lexicon.int', 'lexiconp_disambig.txt', 'L_disambig.fst',]
    if self.fst_cache.cache_is_new or not all(self.fst_cache.file_is_current(self.files_dict[file]) for file in files):
        self.generate_lexicon_files()
        self.fst_cache.update_dependencies()
        self.fst_cache.save()

    self.load_words(self.files_dict['words.txt'])  # sets self.lexicon_words, self.longest_word