    def test_darts_with_values(self):
        keys = ['test', 'テスト', 'テストケース']
        darts = DoubleArray()
        darts.build(sorted([key.encode() for key in keys]), values=[3, 5, 1])
        self.assertEqual(5, darts.exact_match_search('テスト'.encode(), pair_type=False))
        self.assertEqual(3, darts.common_prefix_search('testcase'.encode(), pair_type=False)[0])
        self.assertEqual(1, darts.exact_match_search('テストケース'.encode(), pair_type=False))
        self.assertEqual(1, darts.common_prefix_search('テストケース'.encode(), pair_type=False)[1])
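# A minimal standalone sketch of the build/search pattern used in the test above,
# assuming the `dartsclone` package name (darts-clone-python): keys must be passed
# in ascending byte order, and each value is paired with the key at the same index.
from dartsclone import DoubleArray

keys = ['test', 'テスト', 'テストケース']
values = [3, 5, 1]
pairs = sorted(zip((k.encode() for k in keys), values))  # sort by encoded key, keep value alignment
darts = DoubleArray()
darts.build([k for k, _ in pairs], values=[v for _, v in pairs])
assert darts.exact_match_search('test'.encode(), pair_type=False) == 3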
    def write_lexicon(self, io_out):
        trie = DoubleArray()
        wordid_table = JTypedByteBuffer()
        keys = []
        vals = []
        for key, word_ids in self.trie_keys.items():
            keys.append(key)
            vals.append(wordid_table.tell())
            wordid_table.write_int(len(word_ids), 'byte')
            for wid in word_ids:
                wordid_table.write_int(wid, 'int')

        self.logger.info('building the trie...')

        trie.build(keys, lengths=[len(k) for k in keys], values=vals)

        self.logger.info('done\n')
        self.logger.info('writing the trie...')
        self.byte_buffer.clear()
        self.byte_buffer.write_int(trie.size(), 'int')
        self.byte_buffer.seek(0)
        io_out.write(self.byte_buffer.read())
        self.byte_buffer.clear()

        io_out.write(trie.array())
        self.__logging_size(trie.size() * 4 + 4)
        trie.clear()
        del trie

        self.logger.info('writing the word-ID table...')
        self.byte_buffer.write_int(wordid_table.tell(), 'int')
        self.byte_buffer.seek(0)
        io_out.write(self.byte_buffer.read())
        self.byte_buffer.clear()

        wordid_table.seek(0)
        io_out.write(wordid_table.read())
        self.__logging_size(wordid_table.tell() + 4)
        del wordid_table

        self.logger.info('writing the word parameters...')
        self.byte_buffer.write_int(len(self.entries), 'int')
        for entry in self.entries:
            self.byte_buffer.write_int(entry.parameters[0], 'short')
            self.byte_buffer.write_int(entry.parameters[1], 'short')
            self.byte_buffer.write_int(entry.parameters[2], 'short')
            self.byte_buffer.seek(0)
            io_out.write(self.byte_buffer.read())
            self.byte_buffer.clear()
        self.__logging_size(len(self.entries) * 6 + 4)
        self.write_wordinfo(io_out)
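# For reference, write_lexicon above produces this layout (little-endian, as the
# readers further down decode it with int.from_bytes(..., 'little')): a 4-byte trie
# size, the trie array (size * 4 bytes), a 4-byte word-ID-table length plus its bytes,
# then a 4-byte entry count followed by one 6-byte (3 x int16) record per entry.
# A minimal sketch that walks those sizes back out of an open binary file:
import struct

def read_lexicon_sizes(f):
    trie_size, = struct.unpack('<i', f.read(4))   # number of 4-byte trie units
    f.seek(trie_size * 4, 1)                      # skip the trie array
    table_len, = struct.unpack('<i', f.read(4))   # word-ID table length in bytes
    f.seek(table_len, 1)                          # skip the word-ID table
    n_entries, = struct.unpack('<i', f.read(4))   # number of word parameter records
    return trie_size, table_len, n_entries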
Example #3
    def __init__(self, bytes_: mmap.mmap, offset: int, has_synonym_gid: bool):
        self.trie = DoubleArray()
        bytes_.seek(offset)
        size = int.from_bytes(bytes_.read(4), 'little')
        offset += 4

        array = memoryview(bytes_)[offset:offset + size * 4]
        self.trie.set_array(array, size)
        offset += self.trie.total_size()

        self.word_id_table = wordidtable.WordIdTable(bytes_, offset)
        offset += self.word_id_table.storage_size()

        self.word_params = wordparameterlist.WordParameterList(bytes_, offset)
        offset += self.word_params.storage_size()

        self.word_infos = wordinfolist.WordInfoList(bytes_, offset, self.word_params.get_size(), has_synonym_gid)
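# The constructor above attaches the trie to the mmap'ed dictionary through a
# memoryview slice, so the trie array is never copied. A minimal sketch of the same
# set_array() pattern over an in-memory buffer; the assumption here is that
# set_array() accepts the bytes returned by array(), just as it accepts the
# mmap slice above.
from dartsclone import DoubleArray

source = DoubleArray()
source.build([b'test'], values=[42])

clone = DoubleArray()
clone.set_array(source.array(), source.size())  # reuse the raw unit array without rebuilding
assert clone.exact_match_search(b'test', pair_type=False) == 42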
    def __init__(self, bytes_, offset):
        """Constructs a new double-array trie

        Args:
            bytes_ (mmap.mmap): a memory-mapped dictionary
            offset (int): byte offset
        """
        position = offset
        self.trie = DoubleArray()
        bytes_.seek(position)

        # trie size
        size = int.from_bytes(bytes_.read(4), 'little')
        position += 4

        # trie array
        array = memoryview(bytes_)[position:position + size * 4]
        self.trie.set_array(array, size)
        position += self.trie.total_size()

        self.group_id_table = idtable.IdTable(bytes_, position)
        position += self.group_id_table.storage_size()

        self.storage_size = position - offset
    def write_trie(self, io_out):
        """Writes ``headword``-``group_id`` pairs to the specified output file.

        Args:
            io_out (BufferedWriter): an output stream
        """
        trie = DoubleArray()
        keys = []
        vals = []
        id_table = JTypedByteBuffer()
        for key, ids in self.trie_keys.items():
            keys.append(key)
            vals.append(id_table.tell())
            id_table.write_int(len(ids), 'byte')
            for _id in ids:
                id_table.write_int(_id, 'int')

        self.logger.info('building the trie...')
        trie.build(keys, lengths=[len(k) for k in keys], values=vals)
        self.logger.info('done\n')
        self.logger.info('writing the trie...')
        self.byte_buffer.clear()
        self.byte_buffer.write_int(trie.size(), 'int')
        self.byte_buffer.seek(0)
        io_out.write(self.byte_buffer.read())
        self.byte_buffer.clear()
        io_out.write(trie.array())
        self.__logging_size(trie.size() * 4 + 4)
        trie.clear()
        del trie

        self.logger.info('writing the word-ID table...')
        self.byte_buffer.write_int(id_table.tell(), 'int')
        self.byte_buffer.seek(0)
        io_out.write(self.byte_buffer.read())
        self.byte_buffer.clear()
        id_table.seek(0)
        io_out.write(id_table.read())
        self.__logging_size(id_table.tell() + 4)
        del id_table
    def test_darts_pickle(self):
        keys = ['test', 'テスト', 'テストケース']
        darts = DoubleArray()
        darts.build(sorted([key.encode() for key in keys]), values=[3, 5, 1])
        with tempfile.NamedTemporaryFile('wb') as output_file:
            pickle.dump(darts, output_file)
            output_file.flush()
            with open(output_file.name, 'rb') as input_file:
                darts = pickle.load(input_file)
        self.assertEqual(5, darts.exact_match_search('テスト'.encode(), pair_type=False))
        self.assertEqual(3, darts.common_prefix_search('testcase'.encode(), pair_type=False)[0])
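# If DoubleArray pickles as in the test above, the round trip also works in memory
# with pickle.dumps/loads, without a temporary file. A minimal sketch under that
# assumption:
import pickle
from dartsclone import DoubleArray

darts = DoubleArray()
darts.build(sorted(k.encode() for k in ['test', 'テスト']), values=[3, 5])
restored = pickle.loads(pickle.dumps(darts))
assert restored.exact_match_search('test'.encode(), pair_type=False) == 3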
Example #7
    @classmethod
    def setUpClass(cls):
        cls.darts = DoubleArray()
class DoubleArrayTrie(object):

    def __init__(self, bytes_, offset):
        """Constructs a new double-array trie

        Args:
            bytes_ (mmap.mmap): a memory-mapped dictionary
            offset (int): byte offset
        """
        position = offset
        self.trie = DoubleArray()
        bytes_.seek(position)

        # trie size
        size = int.from_bytes(bytes_.read(4), 'little')
        position += 4

        # trie array
        array = memoryview(bytes_)[position:position + size * 4]
        self.trie.set_array(array, size)
        position += self.trie.total_size()

        self.group_id_table = idtable.IdTable(bytes_, position)
        position += self.group_id_table.storage_size()

        self.storage_size = position - offset

    def lookup_by_common_prefix(self, text, offset):
        """Searches group IDs with the `text` by common prefix.

        Args:
            text (bytes): a memory-mapped dictionary
            offset (int): byte offset

        Yields:
            tuple[int, int]: a group ID and
        """
        key = text[offset:]
        result = self.trie.common_prefix_search(key, length=len(key))
        for index, length in result:
            group_ids = self.group_id_table.get(index)
            length += offset
            for group_id in group_ids:
                yield group_id, length

    def lookup_by_exact_match(self, text):
        """Searches group IDs with the ``text`` by exact match.

        Args:
            text (bytes): a head word to search for

        Returns:
            list[int]: a list of synonym group IDs
        """
        results = self.trie.exact_match_search(text)
        if results[0] < 0:
            return []
        else:
            return list(self.group_id_table.get(results[0]))

    def get_storage_size(self):
        """int: a storage size of the double-array trie"""
        return self.storage_size
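# Hypothetical usage of the class above; 'system_synonym.dic' and the offset 0 are
# placeholders, since the real byte offset depends on the header that precedes the
# trie block in the dictionary file.
import mmap

with open('system_synonym.dic', 'rb') as f:
    bytes_ = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
    trie = DoubleArrayTrie(bytes_, 0)
    text = 'テスト'.encode('utf-8')
    for group_id, end in trie.lookup_by_common_prefix(text, 0):
        print(group_id, text[:end].decode('utf-8'))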
Example #9
class DoubleArrayLexicon(Lexicon):

    __SIGNED_SHORT_MIN = -32768
    __SIGNED_SHORT_MAX = 32767
    __USER_DICT_COST_PER_MORPH = -20

    trie = None
    word_id_table = None
    word_params = None

    def __init__(self, bytes_: mmap.mmap, offset: int, has_synonym_gid: bool):
        self.trie = DoubleArray()
        bytes_.seek(offset)
        size = int.from_bytes(bytes_.read(4), 'little')
        offset += 4

        array = memoryview(bytes_)[offset:offset + size * 4]
        self.trie.set_array(array, size)
        offset += self.trie.total_size()

        self.word_id_table = wordidtable.WordIdTable(bytes_, offset)
        offset += self.word_id_table.storage_size()

        self.word_params = wordparameterlist.WordParameterList(bytes_, offset)
        offset += self.word_params.storage_size()

        self.word_infos = wordinfolist.WordInfoList(bytes_, offset, self.word_params.get_size(), has_synonym_gid)

    def __del__(self):
        del self.word_params

    def lookup(self, text: bytes, offset: int) -> Lexicon.Itr:
        key = text[offset:]
        result = self.trie.common_prefix_search(key, length=len(key))
        for index, length in result:
            word_ids = self.word_id_table.get(index)
            length += offset
            for word_id in word_ids:
                yield (word_id, length)

    def get_left_id(self, word_id: int) -> int:
        return self.word_params.get_left_id(word_id)

    def get_right_id(self, word_id: int) -> int:
        return self.word_params.get_right_id(word_id)

    def get_cost(self, word_id: int) -> int:
        return self.word_params.get_cost(word_id)

    def get_word_info(self, word_id: int) -> 'WordInfo':  # noqa: F821
        return self.word_infos.get_word_info(word_id)

    def size(self) -> int:
        return self.word_params.size

    def get_word_id(self, headword: str, pos_id: int, reading_form: str) -> int:
        for wid in range(self.word_infos.size()):
            info = self.word_infos.get_word_info(wid)
            if info.surface == headword \
                    and info.pos_id == pos_id \
                    and info.reading_form == reading_form:
                return wid
        return -1

    def get_dictionary_id(self, word_id: int) -> int:
        return 0

    def calculate_cost(self, tokenizer) -> None:
        for wid in range(self.word_params.size):
            if self.get_cost(wid) != self.__SIGNED_SHORT_MIN:
                continue
            surface = self.get_word_info(wid).surface
            ms = tokenizer.tokenize(surface, None)
            cost = ms.get_internal_cost() + self.__USER_DICT_COST_PER_MORPH * len(ms)
            cost = min(cost, self.__SIGNED_SHORT_MAX)
            cost = max(cost, self.__SIGNED_SHORT_MIN)
            self.word_params.set_cost(wid, cost)
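# calculate_cost above only re-estimates entries whose cost is the signed-short
# minimum (used as a "to be calculated" marker) and clamps the result to the
# int16 range of the word parameter table. A minimal sketch of that clamping,
# with the tokenizer's internal cost passed in directly:
SIGNED_SHORT_MIN = -32768
SIGNED_SHORT_MAX = 32767
USER_DICT_COST_PER_MORPH = -20

def clamp_user_cost(internal_cost: int, num_morphemes: int) -> int:
    cost = internal_cost + USER_DICT_COST_PER_MORPH * num_morphemes
    return max(SIGNED_SHORT_MIN, min(cost, SIGNED_SHORT_MAX))

assert clamp_user_cost(100000, 2) == SIGNED_SHORT_MAX  # large costs clamp to int16 max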