def __init__(self, model_dir=None, tmp_dir=None):
        """Resolve the working directories, validate the model, and load the lexicon data.

        Args:
            model_dir: Directory holding the model files. Falls back to
                ``defaults.DEFAULT_MODEL_DIR`` when falsy.
            tmp_dir: Directory for temporary/cache files. When falsy, it is
                derived from the normalized ``model_dir`` plus
                ``defaults.DEFAULT_TMP_DIR_SUFFIX``. It is created if it does
                not exist yet.

        Raises:
            KaldiError: If ``exec_dir`` or ``model_dir`` is not a directory, if
                ``tmp_dir`` names a regular file, if the ``KAG_VERSION`` file
                content differs from ``REQUIRED_MODEL_VERSION``, or if the
                nonterminal symbols are missing from the symbol tables.
        """
        show_donation_message()

        # os.path.join(path, '') leaves every directory path ending with a separator.
        self.exec_dir = os.path.join(utils.exec_dir, '')
        self.model_dir = os.path.join(model_dir or defaults.DEFAULT_MODEL_DIR, '')
        self.tmp_dir = os.path.join(tmp_dir or (os.path.normpath(self.model_dir) + defaults.DEFAULT_TMP_DIR_SUFFIX), '')

        if not os.path.isdir(self.exec_dir):
            raise KaldiError("cannot find exec_dir: %r" % self.exec_dir,
                "are you sure you installed kaldi-active-grammar correctly?")
        if not os.path.isdir(self.model_dir):
            raise KaldiError("cannot find model_dir: %r" % self.model_dir)

        # Reject a regular file *before* trying to create the directory, and test
        # the path without its trailing separator (dirname() of "x/" is "x").
        # With the separator attached, exists()/isfile() can both report False for
        # a regular file (e.g. on POSIX), so os.mkdir() would fail with a raw
        # OSError instead of this explanatory error.
        if os.path.isfile(os.path.dirname(self.tmp_dir)):
            raise KaldiError("please specify an available tmp_dir, or remove %r" % self.tmp_dir)
        if not os.path.exists(self.tmp_dir):
            _log.warning("%s: creating tmp dir: %r" % (self, self.tmp_dir))
            os.mkdir(self.tmp_dir)
            # Marker file telling users the directory contents are disposable.
            utils.touch_file(os.path.join(self.tmp_dir, "FILES_ARE_SAFE_TO_DELETE"))

        # A model may ship a KAG_VERSION file; when present it must match exactly.
        version_file = os.path.join(self.model_dir, 'KAG_VERSION')
        if os.path.isfile(version_file):
            with open(version_file, 'r', encoding='utf-8') as f:
                model_version = f.read().strip()
            if model_version != REQUIRED_MODEL_VERSION:
                raise KaldiError("invalid model_dir version! please download a compatible model")
        else:
            _log.warning("model_dir has no version information; errors below may indicate an incompatible model")

        self.create_missing_files()
        self.check_user_lexicon()

        # Model files located via find_file(), in lookup order.
        model_filenames = (
            'words.txt',
            'words.base.txt',
            'phones.txt',
            'align_lexicon.int',
            'align_lexicon.base.int',
            'disambig.int',
            'L_disambig.fst',
            'tree',
            'final.mdl',
            # 'g.irelabel',  # otf
            'user_lexicon.txt',
            'left_context_phones.txt',
            'nonterminals.txt',
            'wdisambig_phones.int',
            'wdisambig_words.int',
            'lexiconp_disambig.txt',
            'lexiconp_disambig.base.txt',
        )
        self.files_dict = {
            'exec_dir': self.exec_dir,
            'model_dir': self.model_dir,
            'tmp_dir': self.tmp_dir,
        }
        self.files_dict.update(
            (filename, find_file(self.model_dir, filename, default=True))
            for filename in model_filenames)
        self.files_dict.update({ k: '"%s"' % v for (k, v) in self.files_dict.items() if v and ' ' in v })  # Handle spaces in paths
        self.files_dict.update({ k.replace('.', '_'): v for (k, v) in self.files_dict.items() })  # For named placeholder access in str.format()
        self.fst_cache = utils.FSTFileCache(os.path.join(self.tmp_dir, defaults.FILE_CACHE_FILENAME), dependencies_dict=self.files_dict)

        self.phone_to_int_dict = { phone: i for phone, i in load_symbol_table(self.files_dict['phones.txt']) }
        self.lexicon = Lexicon(self.phone_to_int_dict.keys())
        self.nonterm_phones_offset = self.phone_to_int_dict.get('#nonterm_bos')
        if self.nonterm_phones_offset is None:
            raise KaldiError("missing nonterms in 'phones.txt'")
        self.nonterm_words_offset = symbol_table_lookup(self.files_dict['words.base.txt'], '#nonterm_begin')
        if self.nonterm_words_offset is None:
            raise KaldiError("missing nonterms in 'words.base.txt'")

        # Update files if needed, before loading words: regenerate when the cache
        # is new or when any of the listed files is not current in the cache.
        files = ['user_lexicon.txt', 'words.txt', 'align_lexicon.int', 'lexiconp_disambig.txt', 'L_disambig.fst',]
        if self.fst_cache.cache_is_new or not all(self.fst_cache.file_is_current(self.files_dict[file]) for file in files):
            self.generate_lexicon_files()
            self.fst_cache.update_dependencies()
            self.fst_cache.save()

        self.load_words(self.files_dict['words.txt'])  # sets self.lexicon_words, self.longest_word
Example #2
0
    def __init__(self, model_dir=None, tmp_dir=None):
        """Resolve the working directories, validate the model, and load the word list.

        Args:
            model_dir: Directory holding the model files. Falls back to
                ``DEFAULT_MODEL_DIR`` when falsy.
            tmp_dir: Directory for temporary/cache files. When falsy, it is
                derived from the normalized ``model_dir`` plus
                ``DEFAULT_TMP_DIR_SUFFIX``. It is created if it does not exist
                yet.

        Raises:
            KaldiError: If ``exec_dir`` or ``model_dir`` is not a directory, if
                ``tmp_dir`` names a regular file, or if the ``KAG_VERSION``
                file content differs from ``REQUIRED_MODEL_VERSION``.
            KeyError: If ``'#nonterm_bos'`` is absent from the phone table
                loaded from ``phones.txt``.
        """
        # os.path.join(path, '') leaves every directory path ending with a separator.
        self.exec_dir = os.path.join(utils.exec_dir, '')
        self.model_dir = os.path.join(model_dir or DEFAULT_MODEL_DIR, '')
        self.tmp_dir = os.path.join(
            tmp_dir
            or (os.path.normpath(self.model_dir) + DEFAULT_TMP_DIR_SUFFIX), '')

        if not os.path.isdir(self.exec_dir):
            raise KaldiError("cannot find exec_dir: %r" % self.exec_dir)
        if not os.path.isdir(self.model_dir):
            raise KaldiError("cannot find model_dir: %r" % self.model_dir)

        # Reject a regular file *before* trying to create the directory, and test
        # the path without its trailing separator (dirname() of "x/" is "x").
        # With the separator attached, exists()/isfile() can both report False for
        # a regular file (e.g. on POSIX), so os.mkdir() would fail with a raw
        # OSError instead of this explanatory error.
        if os.path.isfile(os.path.dirname(self.tmp_dir)):
            raise KaldiError(
                "please specify an available tmp_dir, or remove %r" %
                self.tmp_dir)
        if not os.path.exists(self.tmp_dir):
            _log.warning("%s: creating tmp dir: %r" % (self, self.tmp_dir))
            os.mkdir(self.tmp_dir)
            # Marker file telling users the directory contents are disposable.
            utils.touch(os.path.join(self.tmp_dir, "FILES_ARE_SAFE_TO_DELETE"))

        # A model may ship a KAG_VERSION file; when present it must match exactly.
        version_file = os.path.join(self.model_dir, 'KAG_VERSION')
        if os.path.isfile(version_file):
            with open(version_file) as f:
                model_version = f.read().strip()
            if model_version != REQUIRED_MODEL_VERSION:
                raise KaldiError(
                    "invalid model_dir version! please download a compatible model"
                )
        else:
            _log.warning(
                "model_dir has no version information; errors below may indicate an incompatible model"
            )

        # Touch the user lexicon before it is looked up below
        # (presumably creates it when missing -- confirm against utils.touch).
        utils.touch(os.path.join(self.model_dir, 'user_lexicon.txt'))

        # Model files located via find_file(), in lookup order.
        model_filenames = (
            'words.txt',
            'phones.txt',
            'align_lexicon.int',
            'disambig.int',
            'L_disambig.fst',
            'tree',
            '1.mdl',
            'final.mdl',
            'g.irelabel',  # otf
            'user_lexicon.txt',
            'left_context_phones.txt',
            'nonterminals.txt',
            'wdisambig_phones.int',
            'wdisambig_words.int',
            'lexiconp_disambig.txt',
        )
        self.files_dict = {
            'exec_dir': self.exec_dir,
            'model_dir': self.model_dir,
            'tmp_dir': self.tmp_dir,
        }
        self.files_dict.update(
            (filename, find_file(self.model_dir, filename))
            for filename in model_filenames)
        self.files_dict.update({
            k: '"%s"' % v
            for (k, v) in self.files_dict.items() if v and ' ' in v
        })  # Handle spaces in paths
        self.files_dict.update({
            k.replace('.', '_'): v
            for (k, v) in self.files_dict.items()
        })  # For named placeholder access in str.format()
        self.fst_cache = utils.FSTFileCache(
            os.path.join(self.tmp_dir, FILE_CACHE_FILENAME),
            dependencies_dict=self.files_dict)

        self.phone_to_int_dict = {
            phone: i
            for phone, i in load_symbol_table(self.files_dict['phones.txt'])
        }
        # Plain subscript: a phone table without '#nonterm_bos' raises KeyError.
        self.nonterm_phones_offset = self.phone_to_int_dict['#nonterm_bos']
        self.nonterm_words_offset = symbol_table_lookup(
            self.files_dict['words.txt'], '#nonterm_begin')

        # Update files if needed, before loading words
        if not self.fst_cache.file_is_current(
                self.files_dict['user_lexicon.txt']):
            self.generate_lexicon_files()

        # sets self.lexicon_words, self.longest_word
        self.load_words(self.files_dict['words.txt'])