Example #1
class CharCategory:
    """
    Class that handles character category information
    """
    __slots__ = ['cc_rd', 'cat', 'c2c_rd', 'char2id', 'eql_masks']

    def __init__(self, path, bigendian=False, use_mmap=None):
        self.cc_rd = DictReader(path + "/char.category", bigendian, use_mmap)
        with self.cc_rd as r:
            self.cat = self.convert_categories(r.get_intarray())
        self.c2c_rd = DictReader(path + "/code2category", bigendian, use_mmap)
        with self.c2c_rd as r:
            # the file holds two int arrays of equal length back to back;
            # size() is in bytes and each int is 4 bytes wide
            self.char2id = r.get_intarray(r.size() // 4 // 2)
            self.eql_masks = r.get_intarray(r.size() // 4 // 2)

    def release(self):
        del self.cat
        del self.char2id
        del self.eql_masks
        self.cc_rd.release()
        del self.cc_rd
        self.c2c_rd.release()
        del self.c2c_rd

    def category(self, code):
        return self.cat[self.char2id[code]]

    def is_compatible(self, code1, code2):
        return (self.eql_masks[code1] & self.eql_masks[code2]) != 0

    def convert_categories(self, d):
        return [
            Category(d[i], d[i + 1], d[i + 2], d[i + 3])
            for i in range(0, len(d), 4)
        ]
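
char.category stores the category table as one flat int array, and convert_categories regroups it four ints at a time into one record per category. A minimal sketch of that regrouping, with a hypothetical namedtuple standing in for igo's Category class (the field names here are assumptions):

from collections import namedtuple

# Stand-in for igo's Category class (field names are assumptions);
# the constructor order matches the four ints consumed above.
Category = namedtuple('Category', ['id', 'length', 'invoke', 'group'])

def convert_categories(d):
    # same 4-at-a-time regrouping as CharCategory.convert_categories
    return [Category(d[i], d[i + 1], d[i + 2], d[i + 3])
            for i in range(0, len(d), 4)]

flat = [0, 2, 1, 1,    # ints describing category 0
        1, 0, 0, 1]    # ints describing category 1
print(convert_categories(flat))
# -> [Category(id=0, length=2, invoke=1, group=1), Category(id=1, length=0, invoke=0, group=1)]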
Example #2
class Matrix:
    """
    Class that handles the morpheme connection-cost matrix
    """
    __slots__ = ['rd', 'left_size', 'matrix']

    def __init__(self, path, bigendian=False, use_mmap=None):
        self.rd = DictReader(path + "/matrix.bin", bigendian, use_mmap)
        with self.rd as r:
            self.left_size = r.get_int()
            right_size = r.get_int()
            self.matrix = r.get_shortarray(self.left_size * right_size)

    def release(self):
        del self.matrix
        self.rd.release()

    def linkcost(self, left_id, right_id):
        """
        Return the connection cost between two morphemes
        """
        return self.matrix[right_id * self.left_size + left_id]
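
Matrix.linkcost treats the cost table as one flat short array with the left-context ID varying fastest, so the cost for (left_id, right_id) sits at right_id * left_size + left_id. A self-contained sketch of that indexing with a made-up 3x2 table (a real matrix.bin holds the dictionary's connection costs):

# Made-up connection costs laid out exactly as Matrix.matrix is:
# index = right_id * left_size + left_id
left_size, right_size = 3, 2
matrix = [
    10, 11, 12,   # right_id == 0, left_id == 0..2
    20, 21, 22,   # right_id == 1, left_id == 0..2
]

def linkcost(left_id, right_id):
    return matrix[right_id * left_size + left_id]

assert linkcost(0, 0) == 10
assert linkcost(2, 1) == 22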
Example #3
class Searcher:
    """
    Class for searching a DoubleArray
    """
    __slots__ = ['rd', 'num_keys', 'begs', 'base', 'lens', 'chck', 'tail']

    def __init__(self, path, bigendian=False, use_mmap=None):
        """
        instantiate a DoubleArray Searcher

        @param path path of the DoubleArray file
        @param bigendian whether to read the dictionary as big-endian
        @param use_mmap use mmap or not; None: depends on environment
        """
        self.rd = DictReader(path, bigendian, use_mmap)
        with self.rd as r:
            node_size = r.get_int()
            tind_size = r.get_int()
            tail_size = r.get_int()
            self.num_keys = tind_size
            self.begs = r.get_intarray(tind_size)
            self.base = r.get_intarray(node_size)
            self.lens = r.get_shortarray(tind_size)
            self.chck = r.get_chararray(node_size)
            self.tail = r.get_chararray(tail_size)

    def release(self):
        del self.begs
        del self.base
        del self.lens
        del self.chck
        del self.tail
        self.rd.release()
        del self.rd

    def size(self):
        """
        Return the number of keys stored in the DoubleArray
        @return the number of keys stored in the DoubleArray
        """
        return self.num_keys

    def search(self, key):
        """
        Search for a key

        @param key the key string to search for
        @return the ID of the key if it is found, -1 otherwise
        """
        begs = self.begs
        tail = self.tail
        lens = self.lens
        base = self.base
        chck = self.chck
        node = base[0]

        def exists(kin, node):
            node_id = base_id(node)
            beg = begs[node_id]
            s = tail[beg:beg + lens[node_id]]
            return kin.rest() == s

        kin = KeyStream(key)
        while 1:
            code = kin.read()  # one character code per transition
            idx = node + code
            node = base[idx]
            if chck[idx] == code:
                if node >= 0:
                    continue
                elif kin.eos() or exists(kin, node):
                    return base_id(node)
            return -1

# with, iterator
    def commonprefix_search(self, key, start, fn):
        """
        Perform a common-prefix search
        Each time a key matching the condition is found, the callback fn is invoked

        @param key the key string to search
        @param start index of the first character of key to search from
        @param fn callback invoked whenever a match is found
        """
        base = self.base
        chck = self.chck
        begs = self.begs
        tail = self.tail
        lens = self.lens
        node = base[0]
        offset = -1
        kin = KeyStream(key, start)

        def call_if_key_including(kin, node, start, offset, fn):
            node_id = base_id(node)
            l = lens[node_id]
            beg = begs[node_id]
            prefix = tail[beg:beg+l]
            if kin.startswith(prefix):
                fn(start, offset + l + 1, node_id)

        while 1:
            code = kin.read()
            offset += 1
            # chck_TERMINATE_CODE is the end-of-key marker code defined outside
            # this snippet; a terminator transition here means the prefix
            # consumed so far is itself a stored key
            terminal_idx = node + chck_TERMINATE_CODE
            if chck[terminal_idx] == chck_TERMINATE_CODE:
                fn(start, offset, base_id(base[terminal_idx]))
                if code == chck_TERMINATE_CODE:
                    return
            idx = node + code
            node = base[idx]
            if chck[idx] == code:
                if node >= 0:
                    continue
                else:
                    call_if_key_including(kin, node, start, offset, fn)
            return
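
A hypothetical usage sketch of the Searcher above, assuming a DoubleArray file already built by igo's dictionary tooling; the path "ipadic/word2id" is only an illustration. The callback receives the start index, the length of the match, and the key's ID, the same triple WordDic consumes in the next example.

# Hypothetical usage; "ipadic/word2id" stands for a DoubleArray file
# produced by igo's dictionary build step, not a path this snippet creates.
trie = Searcher("ipadic/word2id")

def on_match(start, length, key_id):
    # invoked once for every dictionary key that prefixes the input at `start`
    print(start, length, key_id)

trie.commonprefix_search(u"すもももももももものうち", 0, on_match)
print(trie.search(u"すもも"))   # key ID if present, -1 otherwise
trie.release()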
Example #4
class WordDic:
    __slots__ = ['splitted', 'trie', 'data', 'wd_rd', 'wa_rd', 'indices',
                 'wi_rd', 'offsets', 'left_ids', 'right_ids', 'costs']

    def __init__(self, path, bigendian=False, splitted=False, use_mmap=None):
        self.splitted = splitted
        self.trie = Searcher(path + "/word2id", bigendian, use_mmap)
        if splitted:
            paths = sorted(glob.glob(path + "/word.dat.*"))
            self.data = util.get_chararray_multi(paths, bigendian)
        else:
            self.wd_rd = DictReader(path + "/word.dat", bigendian, use_mmap)
            with self.wd_rd as r:
                self.data = r.get_chararray()
        self.wa_rd = DictReader(path + "/word.ary.idx", bigendian, use_mmap)
        with self.wa_rd as r:
            self.indices = r.get_intarray()
        self.wi_rd = DictReader(path + "/word.inf", bigendian, use_mmap)
        with self.wi_rd as r:
            # word count: each word stores one 4-byte int and three 2-byte shorts
            wc = r.size() // (4 + 2 + 2 + 2)
            self.offsets = r.get_intarray(wc)
            """ dataOffsets[word ID] = start position of the word's feature data """
            self.left_ids = r.get_shortarray(wc)
            """ leftIds[word ID] = left context ID of the word """
            self.right_ids = r.get_shortarray(wc)
            """ rightIds[word ID] = right context ID of the word """
            self.costs = r.get_shortarray(wc)
            """ costs[word ID] = cost of the word """

    def release(self):
        del self.data
        del self.indices
        del self.offsets
        del self.left_ids
        del self.right_ids
        del self.costs
        self.trie.release()
        del self.trie
        if not self.splitted:
            self.wd_rd.release()
            del self.wd_rd
        self.wa_rd.release()
        del self.wa_rd
        self.wi_rd.release()
        del self.wi_rd

    def search(self, text, start, callback):
        costs = self.costs
        left_ids = self.left_ids
        right_ids = self.right_ids
        indices = self.indices

        def fn(start, offset, trieId):
            end = indices[trieId + 1]
            for i in range(indices[trieId], end):
                callback(ViterbiNode(i, start, offset, costs[i], left_ids[i],
                                     right_ids[i], False))

        self.trie.commonprefix_search(text, start, fn)

    def search_from_trie(self, trie_id, start, length, isspace, callback):
        costs = self.costs
        left_ids = self.left_ids
        right_ids = self.right_ids
        end = self.indices[trie_id + 1]
        for i in range(self.indices[trie_id], end):
            callback(ViterbiNode(i, start, length, costs[i], left_ids[i],
                                 right_ids[i], isspace))

    def word_data(self, word_id):
        return tobytes(self.data[self.offsets[word_id]:self.offsets[word_id + 1]])