def test_ignore_case_match_longest(self):
    # "İ".lower() behaves differently on Python 2, so skip there
    if sys.version_info.major < 3:
        return
    trie = Trie(ignore_case=True)
    ids = {w: trie.insert(w) for w in [u"aİİ", u"aai̇", u"aai̇bİ"]}
    matches = list(trie.match_longest(u"aaİ aai̇bİaa"))
    self.assertEqual(matches, [
        (ids[u"aai̇"], 0, len(u"aaİ")),
        (ids[u"aai̇bİ"], len(u"aaİ "), len(u"aaİ aai̇bİ")),
    ])
    sep = set([ord(" ")])  # space as separator
    matches = list(trie.match_longest(u"aaİ aai̇bİaa", sep))
    self.assertEqual(matches, [
        (ids[u"aai̇"], 0, len(u"aaİ")),
    ])
def test_match_longest(self):
    trie = Trie()
    ids = {
        w: trie.insert(w)
        for w in [u"New York", u"New", u"York", u"York City", u"City", u"City is"]
    }
    matches = list(trie.match_longest(u"New York City isA"))
    self.assertEqual(matches, [
        (ids[u"New York"], 0, len(u"New York")),
        (ids[u"City is"], len(u"New York "), len(u"New York City is")),
    ])
    sep = set([ord(" ")])  # space as separator
    matches = list(trie.match_longest(u"New York City isA", sep))
    self.assertEqual(matches, [
        (ids[u"New York"], 0, len(u"New York")),
        (ids[u"City"], len(u"New York "), len(u"New York City")),
    ])
def test_match_words(self):
    dir_ = os.path.dirname(__file__)
    trie = Trie()
    ids = []
    with open(os.path.join(dir_, "../bench/words.txt")) as fi:
        for l in fi:
            l = l.strip()
            if isinstance(l, bytes):
                l = l.decode("utf8")
            if len(l) > 0:
                ids.append(trie.insert(l))
    with open(os.path.join(dir_, "../bench/words.txt")) as fi:
        txt = fi.read()
        if isinstance(txt, bytes):
            txt = txt.decode("utf8")
    sep = set([ord("\n")])
    matched = []
    for v, start, end in trie.match_longest(txt, sep):
        matched.append(v)
        self.assertEqual(txt[start:end], trie[v])
    self.assertEqual(matched, ids)
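# A minimal, standalone sketch (not part of the test suite) of the Trie behaviour the
# tests above exercise: insert() returns an integer id, match_longest() yields
# (id, start, end) tuples, and passing a set of separator code points restricts matches
# to spans that end at a separator or at the end of the text. The import path below is
# an assumption; adjust it to wherever Trie actually lives.
if __name__ == "__main__":
    from cyac import Trie  # assumed import path

    demo = Trie()
    demo_ids = {w: demo.insert(w) for w in (u"New", u"New York")}
    # without separators the longest entry wins, even inside a larger word
    assert list(demo.match_longest(u"New Yorker")) == [(demo_ids[u"New York"], 0, 8)]
    # with a space separator, "New York" is rejected because it ends mid-word,
    # so the shorter entry "New" (which ends right before a space) matches instead
    assert list(demo.match_longest(u"New Yorker", {ord(" ")})) == [(demo_ids[u"New"], 0, 3)]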
class WikiPageDetector:
    def __init__(self, pages: Iterable[str] = None):
        self._map = None
        self._trie = None
        if pages is not None:
            self.build(pages)

    @staticmethod
    def load(path: Path):
        wpd = WikiPageDetector()
        wpd._map = pickle_load(path / "wpd_map.gz")
        with (path / "wpd_trie").open("r+b") as bf:
            # memory-map the serialized trie instead of copying it into RAM
            wpd._trie = Trie.from_buff(mmap(bf.fileno(), 0), copy=False)
        return wpd

    def dump(self, path: Path):
        self._trie.save(str(path / "wpd_trie"))
        pickle_dump(self._map, path / "wpd_map.gz", compress=True)

    def build(self, pages: Iterable[str]):
        # group page titles by their cleaned, lower-cased lookup key
        key2titles = {}
        for page in pages:
            if not page:
                continue
            key = _clean_title(page).lower()
            if not key:
                key = page
            titles = key2titles.setdefault(key, [])
            titles.append(page)
        mapping = {}
        self._trie = Trie(ignore_case=True)
        for key in key2titles:
            id_ = self._trie.insert(key)
            mapping.setdefault(id_, tuple(key2titles[key]))
        # dense tuple indexed by trie id -> tuple of page titles sharing that key
        self._map = tuple(mapping.get(i) for i in range(max(mapping) + 1))

    def find_pages(self, text: str):
        def iter_matches(source):
            ac_seps = set(ord(p) for p in _XP_SEPS.findall(source))
            for id_, start_idx, end_idx in self._trie.match_longest(source, ac_seps):
                yield (start_idx, end_idx, self._map[id_])

        for match in iter_matches(text):
            yield match
            # also look inside the matched span: split it into tokens and re-run the
            # matcher on every contiguous sub-span except the full span itself
            match_text = text[match[0]:match[1]]
            seps = list(_XP_SEPS.finditer(match_text))
            if len(seps) < 1:
                continue
            tokens = []
            last_end = 0
            for sep in seps:
                token = match_text[last_end:sep.start()]
                start = last_end
                last_end = sep.end()
                if len(token) < 2 and not token.isalnum():
                    continue
                tokens.append((start, token))
            tokens.append((last_end, match_text[last_end:]))
            num_tokens = len(tokens)
            # deduplicate coordinates seen across overlapping sub-spans
            submatches = set()
            for s, e in combinations(range(num_tokens + 1), 2):
                if s == 0 and e == num_tokens:
                    continue
                e -= 1
                start = tokens[s][0]
                end = tokens[e][0] + len(tokens[e][1])
                subtext = match_text[start:end]
                start += match[0]
                for sidx, eidx, pages in iter_matches(subtext):
                    coords = (sidx + start, eidx + start)
                    if coords in submatches:
                        continue
                    submatches.add(coords)
                    yield (*coords, pages)
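# A minimal usage sketch (not part of the original module): the page titles and the
# query sentence below are made up for illustration. find_pages() yields
# (start, end, pages) tuples, where pages is the tuple of original titles that share
# the matched lookup key.
if __name__ == "__main__":
    detector = WikiPageDetector([
        "New York City",
        "New York",
        "City",
    ])
    text = "She moved to New York City last spring."
    for start, end, pages in detector.find_pages(text):
        print(text[start:end], pages)

    # Round-trip through dump()/load(); the target directory must already exist.
    # out_dir = Path("wpd_index")
    # detector.dump(out_dir)
    # detector = WikiPageDetector.load(out_dir)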