Example #1
import glob  # needed by the "splitted" branch below

# Searcher, util, FileMappedInputStream, and ViterbiNode are defined
# elsewhere in the enclosing package; this excerpt omits those imports.
class WordDic:
    def __init__(self, dataDir, bigendian=False, splitted=False):
        self.trie = Searcher(dataDir + "/word2id", bigendian)
        if splitted:
            paths = sorted(glob.glob(dataDir + "/word.dat.*"))
            self.data = util.getCharArrayMulti(paths, bigendian)
        else:
            self.data = util.getCharArray(dataDir + "/word.dat", bigendian)
        self.indices = util.getIntArray(dataDir + "/word.ary.idx", bigendian)

        fmis = FileMappedInputStream(dataDir + "/word.inf", bigendian)
        try:
            wordCount = fmis.size() // (4 + 2 + 2 + 2)
            self.dataOffsets = fmis.getIntArray(wordCount)
            """ dataOffsets[単語ID] = 単語の素性データの開始位置 """
            self.leftIds = fmis.getShortArray(wordCount)
            """ leftIds[単語ID] = 単語の左文脈ID """
            self.rightIds = fmis.getShortArray(wordCount)
            """ rightIds[単語ID] = 単語の右文脈ID """
            self.costs = fmis.getShortArray(wordCount)
            """ consts[単語ID] = 単語のコスト """
        finally:
            fmis.close()

    def search(self, text, start, callback):
        costs = self.costs
        leftIds = self.leftIds
        rightIds = self.rightIds
        indices = self.indices

        def fn(start, offset, trieId):
            end = indices[trieId + 1]
            for i in range(indices[trieId], end):
                callback(
                    ViterbiNode(i, start, offset, costs[i], leftIds[i],
                                rightIds[i], False))

        self.trie.eachCommonPrefix(text, start, fn)

    def searchFromTrieId(self, trieId, start, wordLength, isSpace, callback):
        costs = self.costs
        leftIds = self.leftIds
        rightIds = self.rightIds
        end = self.indices[trieId + 1]
        for i in range(self.indices[trieId], end):
            callback(
                ViterbiNode(i, start, wordLength, costs[i], leftIds[i],
                            rightIds[i], isSpace))

    def wordData(self, wordId):
        return self.data[self.dataOffsets[wordId]:self.dataOffsets[wordId + 1]]
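
A minimal usage sketch for this class; the dictionary path and the input text are illustrative, and the callback simply collects the ViterbiNode objects that search() produces:

dic = WordDic("/path/to/ipadic")  # hypothetical dictionary directory

nodes = []
dic.search("すもももももももものうち", 0, nodes.append)
for node in nodes:
    print(node)  # one ViterbiNode per dictionary word starting at position 0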
Example #2
class WordDic:
    def __init__(self, dataDir, bigendian=False, splitted=False):
        self.trie = Searcher(dataDir + "/word2id", bigendian)
        if splitted:
            paths = sorted(glob.glob(dataDir + "/word.dat.*"))
            self.data = util.getCharArrayMulti(paths, bigendian)
        else:
            self.data = util.getCharArray(dataDir + "/word.dat", bigendian)
        self.indices = util.getIntArray(dataDir + "/word.ary.idx", bigendian)

        fmis = FileMappedInputStream(dataDir + "/word.inf", bigendian)
        try:
            wordCount = fmis.size() // (4 + 2 + 2 + 2)
            self.dataOffsets = fmis.getIntArray(wordCount)
            """ dataOffsets[単語ID] = 単語の素性データの開始位置 """
            self.leftIds = fmis.getShortArray(wordCount)
            """ leftIds[単語ID] = 単語の左文脈ID """
            self.rightIds = fmis.getShortArray(wordCount)
            """ rightIds[単語ID] = 単語の右文脈ID """
            self.costs = fmis.getShortArray(wordCount)
            """ consts[単語ID] = 単語のコスト """
        finally:
            fmis.close()

    def search(self, text, start, callback):
        costs = self.costs
        leftIds = self.leftIds
        rightIds = self.rightIds
        indices = self.indices

        def fn(start, offset, trieId):
            end = indices[trieId + 1]
            for i in range(indices[trieId], end):
                callback(ViterbiNode(i, start, offset, costs[i],
                                     leftIds[i], rightIds[i], False))

        self.trie.eachCommonPrefix(text, start, fn)

    def searchFromTrieId(self, trieId, start, wordLength, isSpace, callback):
        costs = self.costs
        leftIds = self.leftIds
        rightIds = self.rightIds
        end = self.indices[trieId + 1]
        for i in range(self.indices[trieId], end):
            callback(ViterbiNode(i, start, wordLength, costs[i],
                                 leftIds[i], rightIds[i], isSpace))

    def wordData(self, wordId):
        return self.data[self.dataOffsets[wordId]:self.dataOffsets[wordId + 1]]
Example #3
    def __init__(self, dataDir, bigendian=False, splitted=False):
        self.trie = Searcher(dataDir + "/word2id", bigendian)
        if splitted:
            paths = sorted(glob.glob(dataDir + "/word.dat.*"))
            self.data = util.getCharArrayMulti(paths, bigendian)
        else:
            self.data = util.getCharArray(dataDir + "/word.dat", bigendian)
        self.indices = util.getIntArray(dataDir + "/word.ary.idx", bigendian)

        fmis = FileMappedInputStream(dataDir + "/word.inf", bigendian)
        try:
            wordCount = fmis.size() // (4 + 2 + 2 + 2)
            self.dataOffsets = fmis.getIntArray(wordCount)
            """ dataOffsets[単語ID] = 単語の素性データの開始位置 """
            self.leftIds = fmis.getShortArray(wordCount)
            """ leftIds[単語ID] = 単語の左文脈ID """
            self.rightIds = fmis.getShortArray(wordCount)
            """ rightIds[単語ID] = 単語の右文脈ID """
            self.costs = fmis.getShortArray(wordCount)
            """ consts[単語ID] = 単語のコスト """
        finally:
            fmis.close()
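
The divisor in wordCount = fmis.size() // (4 + 2 + 2 + 2) reflects the fixed record layout of word.inf: one 4-byte int (the feature-data start position) plus three 2-byte shorts (left context ID, right context ID, cost) per word. A short sketch of that arithmetic, using a hypothetical file size:

INT_SIZE, SHORT_SIZE = 4, 2
RECORD_SIZE = INT_SIZE + 3 * SHORT_SIZE  # 10 bytes per word entry
file_size = 1_000_000                    # hypothetical size of word.inf in bytes
word_count = file_size // RECORD_SIZE    # -> 100000 words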
Example #4
    def __init__(self, dataDir, bigendian=False, splitted=False):
        self.trie = Searcher(dataDir + "/word2id", bigendian)
        if splitted:
            paths = sorted(glob.glob(dataDir + "/word.dat.*"))
            self.data = util.getCharArrayMulti(paths, bigendian)
        else:
            self.data = util.getCharArray(dataDir + "/word.dat", bigendian)
        self.indices = util.getIntArray(dataDir + "/word.ary.idx", bigendian)

        fmis = FileMappedInputStream(dataDir + "/word.inf", bigendian)
        try:
            wordCount = fmis.size() // (4 + 2 + 2 + 2)
            self.dataOffsets = fmis.getIntArray(wordCount)
            """ dataOffsets[単語ID] = 単語の素性データの開始位置 """
            self.leftIds = fmis.getShortArray(wordCount)
            """ leftIds[単語ID] = 単語の左文脈ID """
            self.rightIds = fmis.getShortArray(wordCount)
            """ rightIds[単語ID] = 単語の右文脈ID """
            self.costs = fmis.getShortArray(wordCount)
            """ consts[単語ID] = 単語のコスト """
        finally:
            fmis.close()
Example #5
    def __init__(self, path, bigendian=False, splitted=False, use_mmap=None):
        self.splitted = splitted
        self.trie = Searcher(path + "/word2id", bigendian, use_mmap)
        if splitted:
            paths = sorted(glob.glob(path + "/word.dat.*"))
            self.data = util.get_chararray_multi(paths, bigendian)
        else:
            self.wd_rd = DictReader(path + "/word.dat", bigendian, use_mmap)
            with self.wd_rd as r:
                self.data = r.get_chararray()
        self.wa_rd = DictReader(path + "/word.ary.idx", bigendian, use_mmap)
        with self.wa_rd as r:
            self.indices = r.get_intarray()
        self.wi_rd = DictReader(path + "/word.inf", bigendian, use_mmap)
        with self.wi_rd as r:
            wc = r.size() // (4 + 2 + 2 + 2)
            self.offsets = r.get_intarray(wc)
            """ offsets[word ID] = start position of the word's feature data """
            self.left_ids = r.get_shortarray(wc)
            """ left_ids[word ID] = left context ID of the word """
            self.right_ids = r.get_shortarray(wc)
            """ right_ids[word ID] = right context ID of the word """
            self.costs = r.get_shortarray(wc)
            """ costs[word ID] = cost of the word """
Example #6
import glob  # needed by the "splitted" branch below

# Searcher, util, DictReader, ViterbiNode, and tobytes are likewise defined
# elsewhere in the enclosing package and omitted from this excerpt.
class WordDic:
    __slots__ = ['splitted', 'trie', 'data', 'wd_rd', 'wa_rd', 'indices',
                 'wi_rd', 'offsets', 'left_ids', 'right_ids', 'costs']

    def __init__(self, path, bigendian=False, splitted=False, use_mmap=None):
        self.splitted = splitted
        self.trie = Searcher(path + "/word2id", bigendian, use_mmap)
        if splitted:
            paths = sorted(glob.glob(path + "/word.dat.*"))
            self.data = util.get_chararray_multi(paths, bigendian)
        else:
            self.wd_rd = DictReader(path + "/word.dat", bigendian, use_mmap)
            with self.wd_rd as r:
                self.data = r.get_chararray()
        self.wa_rd = DictReader(path + "/word.ary.idx", bigendian, use_mmap)
        with self.wa_rd as r:
            self.indices = r.get_intarray()
        self.wi_rd = DictReader(path + "/word.inf", bigendian, use_mmap)
        with self.wi_rd as r:
            wc = r.size() // (4 + 2 + 2 + 2)
            self.offsets = r.get_intarray(wc)
            """ dataOffsets[単語ID] = 単語の素性データの開始位置 """
            self.left_ids = r.get_shortarray(wc)
            """ leftIds[単語ID] = 単語の左文脈ID """
            self.right_ids = r.get_shortarray(wc)
            """ rightIds[単語ID] = 単語の右文脈ID """
            self.costs = r.get_shortarray(wc)
            """ consts[単語ID] = 単語のコスト """

    def release(self):
        del self.data
        del self.indices
        del self.offsets
        del self.left_ids
        del self.right_ids
        del self.costs
        self.trie.release()
        del self.trie
        if not self.splitted:
            self.wd_rd.release()
            del self.wd_rd
        self.wa_rd.release()
        del self.wa_rd
        self.wi_rd.release()
        del self.wi_rd

    def search(self, text, start, callback):
        costs = self.costs
        left_ids = self.left_ids
        right_ids = self.right_ids
        indices = self.indices

        def fn(start, offset, trieId):
            end = indices[trieId + 1]
            for i in range(indices[trieId], end):
                callback(ViterbiNode(i, start, offset, costs[i], left_ids[i],
                                     right_ids[i], False))

        self.trie.commonprefix_search(text, start, fn)

    def search_from_trie(self, trie_id, start, length, isspace, callback):
        costs = self.costs
        left_ids = self.left_ids
        right_ids = self.right_ids
        end = self.indices[trie_id + 1]
        for i in range(self.indices[trie_id], end):
            callback(ViterbiNode(i, start, length, costs[i], left_ids[i],
                                 right_ids[i], isspace))

    def word_data(self, word_id):
        return tobytes(
            self.data[self.offsets[word_id]:self.offsets[word_id + 1]])