class WordDic:
    """Word dictionary for the igo morphological analyzer.

    Loads the dictionary files under *dataDir* (trie, surface/feature data,
    index arrays, and per-word info records) and offers common-prefix word
    lookup that feeds ``ViterbiNode`` instances to a caller-supplied callback.
    """

    def __init__(self, dataDir, bigendian=False, splitted=False):
        """Open and read the word dictionary files under *dataDir*.

        :param dataDir:   directory holding word2id / word.dat / word.ary.idx / word.inf
        :param bigendian: interpret the binary files as big-endian when True
        :param splitted:  load ``word.dat.*`` fragments instead of a single ``word.dat``
        """
        # Trie that maps surface strings to trie node ids.
        self.trie = Searcher(dataDir + "/word2id", bigendian)

        if splitted:
            fragments = sorted(glob.glob(dataDir + "/word.dat.*"))
            self.data = util.getCharArrayMulti(fragments, bigendian)
        else:
            self.data = util.getCharArray(dataDir + "/word.dat", bigendian)

        # indices[trieId] .. indices[trieId+1] delimits the word ids for a trie node.
        self.indices = util.getIntArray(dataDir + "/word.ary.idx", bigendian)

        stream = FileMappedInputStream(dataDir + "/word.inf", bigendian)
        try:
            # Each word record is one 4-byte int plus three 2-byte shorts;
            # the four arrays are stored back to back, so read order matters.
            count = stream.size() // (4 + 2 + 2 + 2)
            self.dataOffsets = stream.getIntArray(count)  # start of each word's feature data
            self.leftIds = stream.getShortArray(count)    # left context id per word
            self.rightIds = stream.getShortArray(count)   # right context id per word
            self.costs = stream.getShortArray(count)      # cost per word
        finally:
            stream.close()

    def search(self, text, start, callback):
        """Invoke *callback* with a ViterbiNode for every dictionary word that
        is a prefix of *text* beginning at index *start*."""
        def emit(begin, length, trieId):
            lo = self.indices[trieId]
            hi = self.indices[trieId + 1]
            for wid in range(lo, hi):
                callback(ViterbiNode(wid, begin, length,
                                     self.costs[wid],
                                     self.leftIds[wid],
                                     self.rightIds[wid],
                                     False))

        self.trie.eachCommonPrefix(text, start, emit)

    def searchFromTrieId(self, trieId, start, wordLength, isSpace, callback):
        """Invoke *callback* with a ViterbiNode for every word id attached to
        the given trie node, using the supplied position/length/space flag."""
        first = self.indices[trieId]
        last = self.indices[trieId + 1]
        for wid in range(first, last):
            callback(ViterbiNode(wid, start, wordLength,
                                 self.costs[wid],
                                 self.leftIds[wid],
                                 self.rightIds[wid],
                                 isSpace))

    def wordData(self, wordId):
        """Return the raw feature-data slice for *wordId*."""
        begin = self.dataOffsets[wordId]
        end = self.dataOffsets[wordId + 1]
        return self.data[begin:end]
def __init__(self, dataDir, bigendian=False, splitted=False):
    """Load the word-dictionary files found under *dataDir*.

    :param dataDir:   directory holding word2id / word.dat / word.ary.idx / word.inf
    :param bigendian: interpret the binary files as big-endian when True
    :param splitted:  load ``word.dat.*`` fragments instead of one ``word.dat``
    """
    # Trie mapping surface strings to trie node ids.
    self.trie = Searcher(dataDir + "/word2id", bigendian)

    if not splitted:
        self.data = util.getCharArray(dataDir + "/word.dat", bigendian)
    else:
        datFiles = sorted(glob.glob(dataDir + "/word.dat.*"))
        self.data = util.getCharArrayMulti(datFiles, bigendian)

    self.indices = util.getIntArray(dataDir + "/word.ary.idx", bigendian)

    fin = FileMappedInputStream(dataDir + "/word.inf", bigendian)
    try:
        recordSize = 4 + 2 + 2 + 2  # one int offset plus three shorts per word
        total = fin.size() // recordSize
        # The four arrays are stored back to back; read order matters.
        self.dataOffsets = fin.getIntArray(total)  # feature-data start per word id
        self.leftIds = fin.getShortArray(total)    # left context id per word id
        self.rightIds = fin.getShortArray(total)   # right context id per word id
        self.costs = fin.getShortArray(total)      # cost per word id
    finally:
        fin.close()