forked from zed/trie-benchmark
-
Notifications
You must be signed in to change notification settings - Fork 0
/
longest_match.py
executable file
·155 lines (126 loc) · 4.8 KB
/
longest_match.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/usr/bin/env python
import importlib
import inspect
import os
import string
import sys
import urllib
import zipfile
from itertools import islice
from timeit import default_timer as timer
def init_hosts(trim):
    """Populate the global `hosts` list with up to `trim` host names.

    On first use, downloads the Alexa top-1M CSV archive (cached as
    top-1m.csv.zip in the current directory), then extracts the host
    column from "rank,host" rows.

    NOTE(review): hosts keep the trailing newline of each CSV line
    (`line.split(',')[1]` is never stripped) -- downstream matchers appear
    to rely on byte-for-byte values, so this is documented, not changed.
    NOTE(review): the Alexa S3 URL has been retired for years -- confirm a
    cached copy of top-1m.csv.zip exists before running.
    """
    global hosts
    filename = "top-1m.csv.zip"
    if not os.path.exists(filename):
        # Progress hook for urlretrieve: wipe the console line, then print
        # whatever urlretrieve passes (block count, block size, total size).
        def report(*args):
            sys.stdout.write('\r'+ ' '*60 + '\r')
            sys.stdout.write(' '.join(map(str, args)))
            sys.stdout.flush()
        # Python 2 API; in Python 3 this lives at urllib.request.urlretrieve.
        urllib.urlretrieve(
            "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip",
            reporthook=report, filename=filename)
    with zipfile.ZipFile(filename, compression=zipfile.ZIP_DEFLATED) as z:
        # The archive holds a single CSV member; islice caps the row count
        # at `trim` without reading the whole file.
        with z.open(z.namelist()[0]) as f:
            hosts = [line.split(',')[1]
                     for line in islice(f, trim)
                     if ',' in line]
    print "init_hosts(%d) -> len(hosts)=%d" % (trim, len(hosts))
def longest_match_suffixtree(url_prefix):
    """Return the longest host starting with `url_prefix`, or '' if none.

    Backed by a lazily built SuffixTree.SubstringDict.  Every host is
    stored with a leading '\n' sentinel so that a substring query for
    '\n' + url_prefix can only match at the very start of a host.
    """
    if longest_match_suffixtree.trie is None:
        from SuffixTree import SubstringDict
        # to install `SuffixTree`:
        #
        #   $ wget http://hkn.eecs.berkeley.edu/~dyoo/python/suffix_trees/SuffixTree-0.7.tar.gz
        #   $ pip install SuffixTree-0.7.tar.gz
        index = SubstringDict()
        for host in hosts:
            index['\n' + host] = host
        longest_match_suffixtree.trie = index
    found = longest_match_suffixtree.trie['\n' + url_prefix]
    if not found:
        return ''
    return max(found, key=len)
longest_match_suffixtree.trie = None  # built lazily on first call
def longest_match_startswith(search):
    """Return the longest host starting with `search`, or '' if none.

    Brute-force reference implementation: a single linear scan over the
    global `hosts` list, keeping the first longest match seen.
    """
    best = ''
    for candidate in hosts:
        if candidate.startswith(search) and len(candidate) > len(best):
            best = candidate
    return best
def longest_match_pytrie(search):
    """Return the longest host starting with `search`, or '' if none.

    Backed by a pytrie.StringTrie built lazily over the global `hosts`
    list, so timing code can measure construction separately.
    """
    if longest_match_pytrie.trie is None:
        from pytrie import StringTrie
        longest_match_pytrie.trie = StringTrie.fromkeys(hosts)
    found = longest_match_pytrie.trie.keys(prefix=search)
    if not found:
        return ''
    return max(found, key=len)
longest_match_pytrie.trie = None  # built lazily on first call
def longest_match_trie(search):
    """Return the longest host starting with `search`, or '' if none.

    Backed by the pure-python `trie` package, built lazily over the
    global `hosts` list.  Uses the package's private `_getnode` to jump
    to the subtree for the prefix, then walks it for all stored values.
    """
    if longest_match_trie.trie is None:
        from trie import Trie
        root = longest_match_trie.trie = Trie()
        for host in hosts:
            root[host] = host
    try:
        # _getnode raises KeyError when no host has this prefix.
        subtree = longest_match_trie.trie._getnode(search)
    except KeyError:
        return ''
    values = [node.value for node in subtree.walk()]
    if not values:
        return ''
    return max(values, key=len)
longest_match_trie.trie = None  # built lazily on first call
def longest_match_datrie(search):
    """Return the longest host starting with `search`, or '' if none.

    Backed by a datrie built lazily over `string.printable`.
    NOTE(review): `datrie.new(...)` is the old datrie API; newer releases
    spell it `datrie.Trie(alphabet)` -- confirm the installed version.
    """
    if longest_match_datrie.trie is None:
        import datrie
        index = longest_match_datrie.trie = datrie.new(alphabet=string.printable)
        for host in hosts:
            # datrie keys must be unicode; hosts are Python 2 byte strings.
            index[host.decode('ascii')] = 1
    found = longest_match_datrie.trie.keys(search.decode('ascii'))
    if not found:
        return ''
    return max(found, key=len)
longest_match_datrie.trie = None  # built lazily on first call
def test(func, keyword):
    """Cross-check `func` against every longest_match_* implementation.

    For each sibling implementation found on this module and each probe
    prefix (the user's keyword plus a fixed set including empty and
    whitespace prefixes), assert that `func` returns a match of the same
    length as the reference.  Only lengths are compared, since several
    distinct hosts may tie for the maximal length.

    Fix vs. original: the local `me` was bound to the module and then
    clobbered inside the loop by each call's result -- renamed both for
    clarity; behavior is unchanged.
    """
    module = importlib.import_module(__name__)
    # Probes: note split(' ') deliberately yields '' entries (empty prefix).
    prefixes = [keyword] + "google youtube abcdef \n ".split(' ')
    for name, reference in inspect.getmembers(module, inspect.isfunction):
        if not name.startswith('longest_match_'):
            continue
        for url_prefix in prefixes:
            expected = reference(url_prefix)
            got = func(url_prefix)
            assert len(expected) == len(got), (
                url_prefix, reference.__name__, expected, got)
        reference.trie = None  # drop the lazily built index to free memory
def timef(f, keyword, N):
    """Benchmark `f(keyword)` against the top-`N` hosts.

    Runs 3 repetitions of `number` calls each; prints progress plus the
    min/max per-call time, and returns the minimum average time of one
    call in microseconds.  `f.trie` is reset between repetitions, so the
    first call of each repetition pays the index-construction cost.
    """
    init_hosts(trim=N)
    number = 1000
    r = []  # per-repetition average microseconds per call
    for _ in range(3): # repeat to get minimum time
        print '.',
        sys.stdout.flush()
        assert getattr(f, 'trie', None) is None # make sure trie is
                                                # not created yet
        start = timer() #NOTE: avoid using timeit.Timer() due to f.trie
        for _ in range(number): #NOTE: for small N it introduces
                                #noticeable overhead
            f(keyword)
        r.append((timer()-start) * 1e6 / number)
        f.trie = None # force trie construction on the next repetition
    t = min(r)
    print "%s %5d microseconds, max %d" % (f.__name__, t, max(r))
    return t
def main():
    """Command-line entry point.

    With --test, cross-checks the chosen implementation against all
    others; otherwise benchmarks it with timef() and prints the result.
    """
    import argparse
    p = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    p.add_argument('--test', action='store_true',
                   help='test all `longest_match_*` functions')
    p.add_argument('--suffix', default='pytrie',
                   help='which `longest_match_`suffix`() to use')
    p.add_argument('--keyword', default='abc',
                   help='keyword to search')
    p.add_argument('--n', type=int, default=1000,
                   help='number of hosts to use for the search')
    args = p.parse_args()
    # Resolve the implementation by name on this very module,
    # e.g. --suffix=pytrie -> longest_match_pytrie.
    me = importlib.import_module(__name__)
    func = getattr(me, "longest_match_"+args.suffix)
    if args.test:
        init_hosts(trim=args.n)
        test(func, args.keyword)
    else:
        print me.timef(func, args.keyword, args.n)
# Run only when executed as a script, not when imported for testing.
if __name__=="__main__":
    main()