import glob

# Searcher, ViterbiNode, FileMappedInputStream, and the util helpers are
# provided by sibling modules of this package; their import paths are not
# shown in this snippet.


class WordDic:
    def __init__(self, dataDir, bigendian=False, splitted=False):
        self.trie = Searcher(dataDir + "/word2id", bigendian)
        if splitted:
            paths = sorted(glob.glob(dataDir + "/word.dat.*"))
            self.data = util.getCharArrayMulti(paths, bigendian)
        else:
            self.data = util.getCharArray(dataDir + "/word.dat", bigendian)
        self.indices = util.getIntArray(dataDir + "/word.ary.idx", bigendian)
        fmis = FileMappedInputStream(dataDir + "/word.inf", bigendian)
        try:
            # Each word record is one int (4 bytes) plus three shorts
            # (2 bytes each), so the file size fixes the word count.
            wordCount = fmis.size() // (4 + 2 + 2 + 2)
            self.dataOffsets = fmis.getIntArray(wordCount)
            """dataOffsets[word ID] = start offset of the word's feature data"""
            self.leftIds = fmis.getShortArray(wordCount)
            """leftIds[word ID] = left context ID of the word"""
            self.rightIds = fmis.getShortArray(wordCount)
            """rightIds[word ID] = right context ID of the word"""
            self.costs = fmis.getShortArray(wordCount)
            """costs[word ID] = cost of the word"""
        finally:
            fmis.close()

    def search(self, text, start, callback):
        costs = self.costs
        leftIds = self.leftIds
        rightIds = self.rightIds
        indices = self.indices

        def fn(start, offset, trieId):
            end = indices[trieId + 1]
            for i in range(indices[trieId], end):
                callback(ViterbiNode(i, start, offset, costs[i],
                                     leftIds[i], rightIds[i], False))

        self.trie.eachCommonPrefix(text, start, fn)

    def searchFromTrieId(self, trieId, start, wordLength, isSpace, callback):
        costs = self.costs
        leftIds = self.leftIds
        rightIds = self.rightIds
        end = self.indices[trieId + 1]
        for i in range(self.indices[trieId], end):
            callback(ViterbiNode(i, start, wordLength, costs[i],
                                 leftIds[i], rightIds[i], isSpace))

    def wordData(self, wordId):
        return self.data[self.dataOffsets[wordId]:self.dataOffsets[wordId + 1]]
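# The lookup above is a CSR-style layout: the trie maps a surface string to a
# trie ID, `indices` turns that ID into a half-open range of word IDs, and
# parallel flat arrays hold each word's cost and context IDs. Below is a
# minimal, self-contained sketch of that indexing scheme with invented toy
# data (no real dictionary files or ViterbiNode needed).

indices = [0, 2, 3]      # trie ID 0 -> word IDs 0..1, trie ID 1 -> word ID 2
costs = [100, 250, 80]   # costs[word ID]
left_ids = [5, 5, 9]     # left context IDs
right_ids = [5, 6, 9]    # right context IDs

def lookup(trie_id, callback):
    # Word IDs for trie_id form the half-open range
    # [indices[trie_id], indices[trie_id + 1]).
    for word_id in range(indices[trie_id], indices[trie_id + 1]):
        callback((word_id, costs[word_id], left_ids[word_id], right_ids[word_id]))

lookup(0, print)  # prints (0, 100, 5, 5) then (1, 250, 5, 6)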
# Later revision of WordDic: file access goes through DictReader (optionally
# mmap-backed via use_mmap), names follow snake_case, and release() frees the
# mappings explicitly. Searcher, DictReader, ViterbiNode, tobytes, and the
# util helpers again come from sibling modules of this package.
class WordDic:
    __slots__ = ['splitted', 'trie', 'data', 'wd_rd', 'wa_rd', 'indices',
                 'wi_rd', 'offsets', 'left_ids', 'right_ids', 'costs']

    def __init__(self, path, bigendian=False, splitted=False, use_mmap=None):
        self.splitted = splitted
        self.trie = Searcher(path + "/word2id", bigendian, use_mmap)
        if splitted:
            paths = sorted(glob.glob(path + "/word.dat.*"))
            self.data = util.get_chararray_multi(paths, bigendian)
        else:
            self.wd_rd = DictReader(path + "/word.dat", bigendian, use_mmap)
            with self.wd_rd as r:
                self.data = r.get_chararray()
        self.wa_rd = DictReader(path + "/word.ary.idx", bigendian, use_mmap)
        with self.wa_rd as r:
            self.indices = r.get_intarray()
        self.wi_rd = DictReader(path + "/word.inf", bigendian, use_mmap)
        with self.wi_rd as r:
            # Each word record is one int (4 bytes) plus three shorts
            # (2 bytes each), so the file size fixes the word count.
            wc = r.size() // (4 + 2 + 2 + 2)
            self.offsets = r.get_intarray(wc)
            """offsets[word ID] = start offset of the word's feature data"""
            self.left_ids = r.get_shortarray(wc)
            """left_ids[word ID] = left context ID of the word"""
            self.right_ids = r.get_shortarray(wc)
            """right_ids[word ID] = right context ID of the word"""
            self.costs = r.get_shortarray(wc)
            """costs[word ID] = cost of the word"""

    def release(self):
        del self.data
        del self.indices
        del self.offsets
        del self.left_ids
        del self.right_ids
        del self.costs
        self.trie.release()
        del self.trie
        if not self.splitted:
            # wd_rd exists only when the word data was read from a single file.
            self.wd_rd.release()
            del self.wd_rd
        self.wa_rd.release()
        del self.wa_rd
        self.wi_rd.release()
        del self.wi_rd

    def search(self, text, start, callback):
        costs = self.costs
        left_ids = self.left_ids
        right_ids = self.right_ids
        indices = self.indices

        def fn(start, offset, trieId):
            end = indices[trieId + 1]
            for i in range(indices[trieId], end):
                callback(ViterbiNode(i, start, offset, costs[i],
                                     left_ids[i], right_ids[i], False))

        self.trie.commonprefix_search(text, start, fn)

    def search_from_trie(self, trie_id, start, length, isspace, callback):
        costs = self.costs
        left_ids = self.left_ids
        right_ids = self.right_ids
        end = self.indices[trie_id + 1]
        for i in range(self.indices[trie_id], end):
            callback(ViterbiNode(i, start, length, costs[i],
                                 left_ids[i], right_ids[i], isspace))

    def word_data(self, word_id):
        return tobytes(self.data[self.offsets[word_id]:self.offsets[word_id + 1]])
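# In both revisions the feature strings of all words are concatenated into one
# flat character array; consecutive entries of `offsets` (`dataOffsets` in the
# older revision) bound each word's slice, which is why word_data() is a plain
# two-index slice. A self-contained toy illustration with invented data:

data = "名詞,一般動詞,自立助詞,係助詞"   # three feature strings, concatenated
offsets = [0, 5, 10, 16]                 # offsets[i]..offsets[i+1] delimits word i

def word_data(word_id):
    return data[offsets[word_id]:offsets[word_id + 1]]

print(word_data(1))  # -> 動詞,自立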