Example #1
    def create_missing_files(self):
        utils.touch_file(os.path.join(self.model_dir, 'user_lexicon.txt'))  # Ensure a (possibly empty) user_lexicon.txt exists
        def check_file(filename, src_filename):
            # Create missing file from its base file
            if not find_file(self.model_dir, filename):
                src = find_file(self.model_dir, src_filename)
                dst = src.replace(src_filename, filename)
                shutil.copyfile(src, dst)
        check_file('words.txt', 'words.base.txt')
        check_file('align_lexicon.int', 'align_lexicon.base.int')
        check_file('lexiconp_disambig.txt', 'lexiconp_disambig.base.txt')
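This method recreates any derived model file that has gone missing by copying its pristine '.base' counterpart, relying on the library's find_file() and utils.touch_file() helpers. The same pattern can be sketched with only the standard library; restore_from_base below is a hypothetical stand-in for illustration, not part of kaldi-active-grammar:

import os
import shutil

def restore_from_base(model_dir, filename, base_filename):
    # Recreate a missing model file from its shipped '.base' copy (hypothetical sketch).
    dst = os.path.join(model_dir, filename)
    src = os.path.join(model_dir, base_filename)
    if not os.path.exists(dst) and os.path.exists(src):
        shutil.copyfile(src, dst)

# e.g. restore_from_base('kaldi_model', 'words.txt', 'words.base.txt')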
Example #2
    def __init__(self, model_dir=None, tmp_dir=None):
        show_donation_message()

        # os.path.join(dir, '') normalizes each directory path to end with a trailing separator
        self.exec_dir = os.path.join(utils.exec_dir, '')
        self.model_dir = os.path.join(model_dir or defaults.DEFAULT_MODEL_DIR, '')
        self.tmp_dir = os.path.join(tmp_dir or (os.path.normpath(self.model_dir) + defaults.DEFAULT_TMP_DIR_SUFFIX), '')

        if not os.path.isdir(self.exec_dir):
            raise KaldiError("cannot find exec_dir: %r" % self.exec_dir,
                "are you sure you installed kaldi-active-grammar correctly?")
        if not os.path.isdir(self.model_dir):
            raise KaldiError("cannot find model_dir: %r" % self.model_dir)
        if not os.path.exists(self.tmp_dir):
            _log.warning("%s: creating tmp dir: %r" % (self, self.tmp_dir))
            os.mkdir(self.tmp_dir)
            utils.touch_file(os.path.join(self.tmp_dir, "FILES_ARE_SAFE_TO_DELETE"))
        if os.path.isfile(self.tmp_dir): raise KaldiError("please specify an available tmp_dir, or remove %r" % self.tmp_dir)

        # Check that the model on disk matches the version this library requires
        version_file = os.path.join(self.model_dir, 'KAG_VERSION')
        if os.path.isfile(version_file):
            with open(version_file, 'r', encoding='utf-8') as f:
                model_version = f.read().strip()
                if model_version != REQUIRED_MODEL_VERSION:
                    raise KaldiError("invalid model_dir version! please download a compatible model")
        else:
            _log.warning("model_dir has no version information; errors below may indicate an incompatible model")

        self.create_missing_files()
        self.check_user_lexicon()

        self.files_dict = {
            'exec_dir': self.exec_dir,
            'model_dir': self.model_dir,
            'tmp_dir': self.tmp_dir,
            'words.txt': find_file(self.model_dir, 'words.txt', default=True),
            'words.base.txt': find_file(self.model_dir, 'words.base.txt', default=True),
            'phones.txt': find_file(self.model_dir, 'phones.txt', default=True),
            'align_lexicon.int': find_file(self.model_dir, 'align_lexicon.int', default=True),
            'align_lexicon.base.int': find_file(self.model_dir, 'align_lexicon.base.int', default=True),
            'disambig.int': find_file(self.model_dir, 'disambig.int', default=True),
            'L_disambig.fst': find_file(self.model_dir, 'L_disambig.fst', default=True),
            'tree': find_file(self.model_dir, 'tree', default=True),
            'final.mdl': find_file(self.model_dir, 'final.mdl', default=True),
            # 'g.irelabel': find_file(self.model_dir, 'g.irelabel', default=True),  # otf
            'user_lexicon.txt': find_file(self.model_dir, 'user_lexicon.txt', default=True),
            'left_context_phones.txt': find_file(self.model_dir, 'left_context_phones.txt', default=True),
            'nonterminals.txt': find_file(self.model_dir, 'nonterminals.txt', default=True),
            'wdisambig_phones.int': find_file(self.model_dir, 'wdisambig_phones.int', default=True),
            'wdisambig_words.int': find_file(self.model_dir, 'wdisambig_words.int', default=True),
            'lexiconp_disambig.txt': find_file(self.model_dir, 'lexiconp_disambig.txt', default=True),
            'lexiconp_disambig.base.txt': find_file(self.model_dir, 'lexiconp_disambig.base.txt', default=True),
        }
        self.files_dict.update({ k: '"%s"' % v for (k, v) in self.files_dict.items() if v and ' ' in v })  # Handle spaces in paths
        self.files_dict.update({ k.replace('.', '_'): v for (k, v) in self.files_dict.items() })  # For named placeholder access in str.format()
        self.fst_cache = utils.FSTFileCache(os.path.join(self.tmp_dir, defaults.FILE_CACHE_FILENAME), dependencies_dict=self.files_dict)

        # Map each phone symbol from phones.txt to its integer id
        self.phone_to_int_dict = { phone: i for phone, i in load_symbol_table(self.files_dict['phones.txt']) }
        self.lexicon = Lexicon(self.phone_to_int_dict.keys())
        self.nonterm_phones_offset = self.phone_to_int_dict.get('#nonterm_bos')
        if self.nonterm_phones_offset is None: raise KaldiError("missing nonterms in 'phones.txt'")
        self.nonterm_words_offset = symbol_table_lookup(self.files_dict['words.base.txt'], '#nonterm_begin')
        if self.nonterm_words_offset is None: raise KaldiError("missing nonterms in 'words.base.txt'")

        # Update files if needed, before loading words
        files = ['user_lexicon.txt', 'words.txt', 'align_lexicon.int', 'lexiconp_disambig.txt', 'L_disambig.fst',]
        if self.fst_cache.cache_is_new or not all(self.fst_cache.file_is_current(self.files_dict[file]) for file in files):
            self.generate_lexicon_files()
            self.fst_cache.update_dependencies()
            self.fst_cache.save()

        self.load_words(self.files_dict['words.txt'])  # sets self.lexicon_words, self.longest_word
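The final block regenerates the lexicon-derived files whenever the FST cache is new or any of the listed dependencies has changed since the cache was last saved (via fst_cache.cache_is_new and fst_cache.file_is_current). A minimal sketch of that kind of freshness check, assuming a simple JSON digest cache rather than the library's actual FSTFileCache implementation:

import hashlib
import json
import os

def _digest(path):
    # Hex digest of a file's contents, or '' if the file is missing.
    if not os.path.isfile(path):
        return ''
    with open(path, 'rb') as f:
        return hashlib.sha256(f.read()).hexdigest()

def files_are_current(cache_path, paths):
    # True only if a digest was saved for every path and still matches it.
    if not os.path.isfile(cache_path):
        return False
    with open(cache_path, 'r', encoding='utf-8') as f:
        saved = json.load(f)
    return all(saved.get(p) == _digest(p) for p in paths)

def save_digests(cache_path, paths):
    # Record the current digests so the next run can detect changes.
    with open(cache_path, 'w', encoding='utf-8') as f:
        json.dump({p: _digest(p) for p in paths}, f)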