def test_startswith(self):
    """Keys sharing a prefix come back from startswith() in sorted order."""
    prefixes = PrefixSet()
    all_keys = [''.join(chars) for chars in itertools.product('abc', repeat=3)]
    # Insert in reverse to confirm ordering is independent of insertion order.
    for word in reversed(all_keys):
        prefixes.add(word)
    expected = [word for word in all_keys if word.startswith('ab')]
    self.assertSequenceEqual(expected, list(prefixes.startswith('ab')))
def test_startswith(self):
    """startswith("ab") yields exactly the stored keys beginning with "ab"."""
    ps = PrefixSet()
    universe = ["".join(triple) for triple in itertools.product("abc", repeat=3)]
    # Reverse-order insertion: result order must still be sorted.
    for key in reversed(universe):
        ps.add(key)
    matching = [key for key in universe if key.startswith("ab")]
    self.assertSequenceEqual(matching, list(ps.startswith("ab")))
def insert_search_delete(self, keys):
    """Round-trip check: add every key, verify membership, then remove all."""
    trie = PrefixSet()
    for item in keys:
        trie.add(item)
    # Duplicates collapse, so size equals the number of distinct keys.
    self.assertEqual(len(set(keys)), len(trie))
    for item in keys:
        self.assertIn(item, trie)
    for item in keys:
        trie.discard(item)
    self.assertEqual(0, len(trie))
    for item in keys:
        self.assertNotIn(item, trie)
    # Removing every key must also prune all internal trie nodes.
    self.assertEqual(0, len(trie._root))
def test_pickle(self):
    """A non-empty PrefixSet serializes with the highest pickle protocol."""
    ps = PrefixSet()
    ps.add('a')
    # Smoke test: only checks that dumps() does not raise.
    pickle.dumps(ps, pickle.HIGHEST_PROTOCOL)
def test_reversed(self):
    """reversed(set) walks keys in descending lexicographic order."""
    ps = PrefixSet()
    ordered = [''.join(combo) for combo in itertools.product('abc', repeat=3)]
    for word in ordered:
        ps.add(word)
    # Slice-reversal of the sorted input is the expected reverse iteration.
    self.assertSequenceEqual(ordered[::-1], list(reversed(ps)))
def test_sort_order(self):
    """Forward iteration yields keys sorted, regardless of insertion order."""
    trie = PrefixSet()
    expected = ['', 'a', 'aa', 'ab', 'b', 'ba']
    # Add in reverse so a pass proves the structure sorts, not the caller.
    for word in reversed(expected):
        trie.add(word)
    self.assertSequenceEqual(expected, list(iter(trie)))
def test_startswith_empty(self):
    """A prefix matching no stored key yields an empty sequence."""
    prefixes = PrefixSet()
    prefixes.add('a')
    hits = list(prefixes.startswith('b'))
    self.assertSequenceEqual([], hits)
def test_pickle(self):
    """Pickling a populated PrefixSet must succeed without raising."""
    subject = PrefixSet()
    subject.add("a")
    # No assertion on the payload: success of dumps() is the whole test.
    pickle.dumps(subject, pickle.HIGHEST_PROTOCOL)
def test_reversed(self):
    """Iterating in reverse returns keys in descending sort order."""
    trie = PrefixSet()
    words = ["".join(combo) for combo in itertools.product("abc", repeat=3)]
    for word in words:
        trie.add(word)
    expected = list(reversed(words))
    self.assertSequenceEqual(expected, list(reversed(trie)))
def test_sort_order(self):
    """Keys come back in lexicographic order even when added in reverse."""
    ps = PrefixSet()
    ordered = ["", "a", "aa", "ab", "b", "ba"]
    for key in ordered[::-1]:
        ps.add(key)
    self.assertSequenceEqual(ordered, list(iter(ps)))
def test_startswith_empty(self):
    """startswith() on an absent prefix produces no results."""
    trie = PrefixSet()
    trie.add("a")
    self.assertSequenceEqual([], list(trie.startswith("b")))
# -*- coding: utf-8 -*-
"""Extract place names from areas.json and emit a tokenizer userdict file.

Reads every double-quoted token out of ../dictionaries/areas.json, decodes it
with ast.literal_eval (resolving any \\uXXXX escapes), stores it in a PrefixSet
and writes one "<word> 30000 ns" line per place to ../dictionaries/places.txt.
"""
import ast
import json
import os
import re

from prefixtree import PrefixSet

# Paths are resolved relative to this script so it works from any CWD.
# NOTE: the old name `file` shadowed the builtin; renamed.
areas_path = os.path.join(os.path.dirname(__file__), "../dictionaries/areas.json")
places_path = os.path.join(os.path.dirname(__file__), "../dictionaries/places.txt")

ps = PrefixSet()
with open(areas_path, 'r') as f, \
        open(places_path, 'w') as out:
    content = f.read()
    # Grab raw quoted tokens from the JSON text instead of parsing the full
    # document structure; literal_eval turns each token into a str.
    quoted_tokens = re.findall('"[^"]+"', content)
    for token in quoted_tokens:
        place = ast.literal_eval(token)
        ps.add(place)
        # Userdict line format: word, frequency, POS tag — e.g. "<Beijing> 34488 ns".
        out.write("%s 30000 ns\n" % place)

# "大连" (Dalian) is not itself a stored key, but it is a prefix of stored keys.
assert "大连" not in ps
# NOTE(review): if startswith() returns a generator this assert is always
# truthy regardless of matches — confirm against PrefixSet's API.
assert ps.startswith("大连")
for x in ps.startswith("大连"):
    print(x)
# Template for emitting single-line records.
new_line = "%s\n"
ps = PrefixSet()
pos_sentences = set()
neg_sentences = set()
# Userdict produced by the companion areas.json extraction script.
places = os.path.join(os.path.dirname(__file__), "../dictionaries/places.txt")
with open(places) as f:
    tokenizer.load_userdict(f)
    # NOTE(review): if load_userdict() reads the whole file, the handle is at
    # EOF here and this loop sees no lines, leaving `ps` empty — confirm, and
    # if so reopen the file (or seek(0)) before iterating.
    for line in f:
        # First whitespace-separated field of each userdict line is the word.
        s = line.strip().split()[0]
        ps.add(s)
# print(pseg.lcut("大连"))
# x, y = pseg.lcut("大连")[0]
# assert y == "ns"


def is_space(word):
    # Returns the number of stored keys having `word` as a prefix
    # (used as a truthy "is this a known place prefix" check).
    l = list(ps.startswith(word))
    return len(l)


# NOTE(review): name is presumably a typo for "common_ignore"; definition is
# truncated at the end of the visible source.
def common_igrnoe(word, tag, text_len):
    word_len = len(word)
    if word_len == 1:
        # before accuracy: 0.663919
        return None
    elif tag.startswith('u'):
        # 'u' POS tag: auxiliary word (particle)