def test_darts_with_values(self):
    keys = ['test', 'テスト', 'テストケース']
    darts = DoubleArray()
    darts.build(sorted([key.encode() for key in keys]), values=[3, 5, 1])
    self.assertEqual(5, darts.exact_match_search('テスト'.encode(), pair_type=False))
    self.assertEqual(3, darts.common_prefix_search('testcase'.encode(), pair_type=False)[0])
    self.assertEqual(1, darts.exact_match_search('テストケース'.encode(), pair_type=False))
    self.assertEqual(1, darts.common_prefix_search('テストケース'.encode(), pair_type=False)[1])
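
# A minimal standalone sketch (not part of the test above) of the same build/search
# calls, assuming only what these snippets already use: DoubleArray importable from
# the dartsclone package, build() over lexicographically sorted byte keys with values
# aligned to that sorted order, and pair_type toggling between raw values and
# (value, length) pairs.
from dartsclone import DoubleArray

keys = sorted(s.encode() for s in ['test', 'テスト', 'テストケース'])
darts = DoubleArray()
darts.build(keys, values=[3, 5, 1])  # values follow the sorted key order

# pair_type=False returns the stored value alone.
print(darts.exact_match_search('テスト'.encode(), pair_type=False))  # 5
# The default pair form yields one (value, matched-byte-length) pair per key
# that is a prefix of the query.
for value, length in darts.common_prefix_search('テストケース'.encode()):
    print(value, length)  # expected: (5, 9) then (1, 18)
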
def write_lexicon(self, io_out):
    """Writes the lexicon (trie, word-ID table and word parameters) to the specified output file.

    Args:
        io_out (BufferedWriter): an output stream
    """
    trie = DoubleArray()
    wordid_table = JTypedByteBuffer()
    keys = []
    vals = []
    for key, word_ids in self.trie_keys.items():
        keys.append(key)
        vals.append(wordid_table.tell())
        # each record: a 1-byte count followed by the word IDs as 4-byte ints
        wordid_table.write_int(len(word_ids), 'byte')
        for wid in word_ids:
            wordid_table.write_int(wid, 'int')
    self.logger.info('building the trie...')
    trie.build(keys, lengths=[len(k) for k in keys], values=vals)
    self.logger.info('done\n')

    self.logger.info('writing the trie...')
    self.byte_buffer.clear()
    self.byte_buffer.write_int(trie.size(), 'int')
    self.byte_buffer.seek(0)
    io_out.write(self.byte_buffer.read())
    self.byte_buffer.clear()
    io_out.write(trie.array())
    self.__logging_size(trie.size() * 4 + 4)
    trie.clear()
    del trie

    self.logger.info('writing the word-ID table...')
    self.byte_buffer.write_int(wordid_table.tell(), 'int')
    self.byte_buffer.seek(0)
    io_out.write(self.byte_buffer.read())
    self.byte_buffer.clear()
    wordid_table.seek(0)
    io_out.write(wordid_table.read())
    self.__logging_size(wordid_table.tell() + 4)
    del wordid_table

    self.logger.info('writing the word parameters...')
    self.byte_buffer.write_int(len(self.entries), 'int')
    for entry in self.entries:
        self.byte_buffer.write_int(entry.parameters[0], 'short')
        self.byte_buffer.write_int(entry.parameters[1], 'short')
        self.byte_buffer.write_int(entry.parameters[2], 'short')
    self.byte_buffer.seek(0)
    io_out.write(self.byte_buffer.read())
    self.byte_buffer.clear()
    self.__logging_size(len(self.entries) * 6 + 4)
    self.write_wordinfo(io_out)
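
# A hedged sketch of walking the section written by write_lexicon() with nothing but
# struct.  The layout mirrors the write order above (int32 trie unit count, the raw
# double-array, int32 word-ID table length, the table bytes, int32 entry count, then
# three int16 parameters per entry); little-endian is inferred from the readers below,
# which use int.from_bytes(..., 'little').  This is an illustrative helper, not part
# of the builder.
import struct

def scan_lexicon_section(buf: bytes, pos: int = 0):
    """Return (trie_units, table_len, entry_count) and the offset just past the parameters."""
    (trie_units,) = struct.unpack_from('<i', buf, pos)
    pos += 4 + trie_units * 4            # skip the double-array units
    (table_len,) = struct.unpack_from('<i', buf, pos)
    pos += 4 + table_len                 # skip the word-ID table
    (entry_count,) = struct.unpack_from('<i', buf, pos)
    pos += 4 + entry_count * 6           # skip the 3 * int16 parameters per entry
    return (trie_units, table_len, entry_count), pos
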
def write_trie(self, io_out):
    """Writes ``headword``-``group_id`` pairs to the specified output file.

    Args:
        io_out (BufferedWriter): an output stream
    """
    trie = DoubleArray()
    keys = []
    vals = []
    id_table = JTypedByteBuffer()
    for key, ids in self.trie_keys.items():
        keys.append(key)
        vals.append(id_table.tell())
        # each record: a 1-byte count followed by the group IDs as 4-byte ints
        id_table.write_int(len(ids), 'byte')
        for _id in ids:
            id_table.write_int(_id, 'int')
    self.logger.info('building the trie...')
    trie.build(keys, lengths=[len(k) for k in keys], values=vals)
    self.logger.info('done\n')

    self.logger.info('writing the trie...')
    self.byte_buffer.clear()
    self.byte_buffer.write_int(trie.size(), 'int')
    self.byte_buffer.seek(0)
    io_out.write(self.byte_buffer.read())
    self.byte_buffer.clear()
    io_out.write(trie.array())
    self.__logging_size(trie.size() * 4 + 4)
    trie.clear()
    del trie

    self.logger.info('writing the word-ID table...')
    self.byte_buffer.write_int(id_table.tell(), 'int')
    self.byte_buffer.seek(0)
    io_out.write(self.byte_buffer.read())
    self.byte_buffer.clear()
    id_table.seek(0)
    io_out.write(id_table.read())
    self.__logging_size(id_table.tell() + 4)
    del id_table
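
# A hedged sketch of decoding one record of the ID table written above.  A record is
# assumed to be a 1-byte count followed by that many 4-byte integers, matching the
# write_int(..., 'byte') / write_int(..., 'int') calls; little-endian is inferred from
# the corresponding readers.  Illustrative only.
import struct

def decode_id_record(table: bytes, offset: int) -> list:
    count = table[offset]
    return list(struct.unpack_from('<%di' % count, table, offset + 1))
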
def test_darts_pickle(self):
    keys = ['test', 'テスト', 'テストケース']
    darts = DoubleArray()
    darts.build(sorted([key.encode() for key in keys]), values=[3, 5, 1])
    with tempfile.NamedTemporaryFile('wb') as output_file:
        pickle.dump(darts, output_file)
        output_file.flush()
        with open(output_file.name, 'rb') as input_file:
            darts = pickle.load(input_file)
        self.assertEqual(5, darts.exact_match_search('テスト'.encode(), pair_type=False))
        self.assertEqual(3, darts.common_prefix_search('testcase'.encode(), pair_type=False)[0])
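
# A hedged sketch of the other round trip these snippets rely on: instead of pickling
# the DoubleArray, dump its raw units with array() and restore them with set_array(),
# which is how the dictionary readers below reattach a trie to a memory-mapped buffer.
# Passing plain bytes to set_array() is an assumption; the readers pass a memoryview.
from dartsclone import DoubleArray

darts = DoubleArray()
darts.build(sorted(s.encode() for s in ['test', 'テスト']), values=[3, 5])

raw = darts.array()                      # raw double-array units
restored = DoubleArray()
restored.set_array(raw, darts.size())    # size is the unit count, as written to disk above
assert restored.exact_match_search('テスト'.encode(), pair_type=False) == 5
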
@classmethod
def setUpClass(cls):
    cls.darts = DoubleArray()
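
# A minimal, hypothetical test-class sketch showing how a DoubleArray shared via
# setUpClass (as above) can be built once and queried from several tests.  The class
# name, keys and values here are illustrative and not taken from the real test data.
import unittest
from dartsclone import DoubleArray


class SharedDartsTestCase(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.darts = DoubleArray()
        cls.darts.build(sorted(s.encode() for s in ['a', 'ab']), values=[1, 2])

    def test_exact_match(self):
        self.assertEqual(1, self.darts.exact_match_search(b'a', pair_type=False))

    def test_common_prefix(self):
        result = self.darts.common_prefix_search(b'abc', pair_type=False)
        self.assertEqual(1, result[0])
        self.assertEqual(2, result[1])
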
class DoubleArrayTrie(object):

    def __init__(self, bytes_, offset):
        """Constructs a new double-array trie.

        Args:
            bytes_ (mmap.mmap): a memory-mapped dictionary
            offset (int): byte offset
        """
        position = offset
        self.trie = DoubleArray()
        bytes_.seek(position)
        # trie size
        size = int.from_bytes(bytes_.read(4), 'little')
        position += 4
        # trie array
        array = memoryview(bytes_)[position:position + size * 4]
        self.trie.set_array(array, size)
        position += self.trie.total_size()
        self.group_id_table = idtable.IdTable(bytes_, position)
        position += self.group_id_table.storage_size()
        self.storage_size = position - offset

    def lookup_by_common_prefix(self, text, offset):
        """Searches group IDs with the ``text`` by common prefix.

        Args:
            text (bytes): an input text
            offset (int): byte offset in ``text`` at which the search starts

        Yields:
            tuple[int, int]: a group ID and the end offset of the matched headword in ``text``
        """
        key = text[offset:]
        result = self.trie.common_prefix_search(key, length=len(key))
        for index, length in result:
            group_ids = self.group_id_table.get(index)
            length += offset
            for group_id in group_ids:
                yield group_id, length

    def lookup_by_exact_match(self, text):
        """Searches group IDs with the ``text`` by exact match.

        Args:
            text (bytes): a headword to search for

        Returns:
            list[int]: a list of synonym group IDs
        """
        results = self.trie.exact_match_search(text)
        if results[0] < 0:
            return []
        else:
            return list(self.group_id_table.get(results[0]))

    def get_storage_size(self):
        """int: a storage size of the double-array trie"""
        return self.storage_size
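
# A hedged usage sketch for DoubleArrayTrie: map a synonym dictionary file read-only
# and hand the buffer to the constructor.  ``dict_path`` and ``trie_offset`` are
# placeholders; in the real dictionary the trie offset is determined by the sections
# that precede it, which this sketch does not parse.
import mmap

def open_synonym_trie(dict_path: str, trie_offset: int) -> DoubleArrayTrie:
    with open(dict_path, 'rb') as f:
        buf = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
    return DoubleArrayTrie(buf, trie_offset)

# trie = open_synonym_trie('system_synonym.dic', 16)   # placeholder path and offset
# group_ids = trie.lookup_by_exact_match('検索'.encode('utf-8'))
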
class DoubleArrayLexicon(Lexicon):
    __SIGNED_SHORT_MIN = -32768
    __SIGNED_SHORT_MAX = 32767
    __USER_DICT_COST_PER_MORPH = -20

    trie = None
    word_id_table = None
    word_params = None

    def __init__(self, bytes_: mmap.mmap, offset: int, has_synonym_gid: bool):
        self.trie = DoubleArray()
        bytes_.seek(offset)
        size = int.from_bytes(bytes_.read(4), 'little')
        offset += 4
        array = memoryview(bytes_)[offset:offset + size * 4]
        self.trie.set_array(array, size)
        offset += self.trie.total_size()
        self.word_id_table = wordidtable.WordIdTable(bytes_, offset)
        offset += self.word_id_table.storage_size()
        self.word_params = wordparameterlist.WordParameterList(bytes_, offset)
        offset += self.word_params.storage_size()
        self.word_infos = wordinfolist.WordInfoList(bytes_, offset, self.word_params.get_size(), has_synonym_gid)

    def __del__(self):
        del self.word_params

    def lookup(self, text: bytes, offset: int) -> Lexicon.Itr:
        key = text[offset:]
        result = self.trie.common_prefix_search(key, length=len(key))
        for index, length in result:
            word_ids = self.word_id_table.get(index)
            length += offset
            for word_id in word_ids:
                yield (word_id, length)

    def get_left_id(self, word_id: int) -> int:
        return self.word_params.get_left_id(word_id)

    def get_right_id(self, word_id: int) -> int:
        return self.word_params.get_right_id(word_id)

    def get_cost(self, word_id: int) -> int:
        return self.word_params.get_cost(word_id)

    def get_word_info(self, word_id: int) -> 'WordInfo':  # noqa: F821
        return self.word_infos.get_word_info(word_id)

    def size(self) -> int:
        return self.word_params.size

    def get_word_id(self, headword: str, pos_id: int, reading_form: str) -> int:
        for wid in range(self.word_infos.size()):
            info = self.word_infos.get_word_info(wid)
            if info.surface == headword \
                    and info.pos_id == pos_id \
                    and info.reading_form == reading_form:
                return wid
        return -1

    def get_dictionary_id(self, word_id: int) -> int:
        return 0

    def calculate_cost(self, tokenizer) -> None:
        for wid in range(self.word_params.size):
            if self.get_cost(wid) != self.__SIGNED_SHORT_MIN:
                continue
            surface = self.get_word_info(wid).surface
            ms = tokenizer.tokenize(surface, None)
            cost = ms.get_internal_cost() + self.__USER_DICT_COST_PER_MORPH * len(ms)
            cost = min(cost, self.__SIGNED_SHORT_MAX)
            cost = max(cost, self.__SIGNED_SHORT_MIN)
            self.word_params.set_cost(wid, cost)
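
# A hedged usage sketch for DoubleArrayLexicon.lookup: given an already constructed
# lexicon (the mmap plumbing is the same as in __init__ above), enumerate every
# dictionary word that starts at byte ``offset`` of the UTF-8 input and pull a few
# fields for it.  ``surfaces_starting_at`` is an illustrative helper, not part of the
# lexicon API.
def surfaces_starting_at(lexicon: DoubleArrayLexicon, text: str, offset: int = 0):
    encoded = text.encode('utf-8')
    results = []
    for word_id, end in lexicon.lookup(encoded, offset):
        info = lexicon.get_word_info(word_id)
        results.append((info.surface, lexicon.get_cost(word_id), end))
    return results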