/
searcher.py
207 lines (185 loc) · 6.91 KB
/
searcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
'''
Searcher class
#BUG: http connection failed and _todo_queue is empty.
'''
import threading
from queue import Queue
import networkx as nx
from crawler import Crawler
from log import log, debug_log
import settings # debug
# Module-level lock shared by every Searcher worker thread. It guards updates
# to the shared search state: the node counter, the stop flags, the target
# name, the reached map and the graph.
LOCK = threading.Lock()
class Searcher(object):
    '''Multi-threaded breadth-first search between two page names.

    Starting from ``start``, worker threads pull pages from a FIFO queue,
    crawl their links with ``Crawler`` and enqueue every page not seen
    before, until a page whose name contains ``end`` is reached, the page
    budget is used up, the start page cannot be crawled, or there is no
    work left.
    '''

    # class variables
    _NUM_WORKER_THREADS = 20
    nodes_counter = 0
    # Seconds an idle worker waits on the queue before re-checking whether
    # the search is over (keeps workers from blocking forever on get()).
    _QUEUE_POLL_SECONDS = 0.5

    def __init__(self, start, end, max_pages=1000):
        '''Prepare a search from ``start`` to ``end``.

        start     -- name of the page the search begins at.
        end       -- name (or part of the name) of the page to look for.
        max_pages -- maximum number of pages the workers may visit.

        Both names are lower-cased and stripped before use.
        '''
        self._start = start.lower().strip()
        self._end = end.lower().strip()
        self._max_page = max_pages
        # pages (name, url) waiting to be visited
        self._todo_queue = Queue()
        # maps every discovered page name to the page it was reached from
        self._reached = dict()
        self._path = None
        self._nx_digraph = nx.DiGraph()
        self.found_target = False
        self._max_limit_reached = False
        self._invalid_start_point = False
        # pages that were enqueued but whose processing has not finished;
        # zero together with an empty queue means the search is exhausted
        self._pending = 0
        # TODO: optionally preload cached graphs from the graph bank
        # to search faster.

    def _should_stop(self):
        '''Return True once any worker has signalled the end of the search.
        '''
        return (self.found_target or
                self._max_limit_reached or
                self._invalid_start_point)

    def _mark_target_found(self, tar):
        '''Record ``tar`` as the page that matched the target.

        Only the first match wins; ``self._end`` is replaced by the full
        name of the matching page so the path can be traced back from it.
        '''
        with LOCK:
            if self.found_target:
                return
            wanted = self._end
            self._end = tar
            self.found_target = True
        log('$$found target {}({})!$$'.format(wanted, tar))

    def _visit(self, curr_vertex_info):
        '''Process one page taken from the queue.

        ``curr_vertex_info`` is a ``(name, url)`` pair. Sets the relevant
        stop flag when the page budget is exceeded, the target is found or
        the start page cannot be crawled; otherwise enqueues every newly
        discovered neighbour.
        '''
        curr_name = curr_vertex_info[0]

        # count this page and take the snapshot under the same lock
        with LOCK:
            self.nodes_counter += 1
            visited = self.nodes_counter

        if visited > self._max_page:
            self._max_limit_reached = True
            log('Maximum page limit reached. Terminating...')
            return

        if settings.debug:
            debug_log('##now at ({}), total {}, queue size {}##'
                      .format(curr_vertex_info,
                              visited,
                              self._todo_queue.qsize()))

        if self._end in curr_name:
            self._mark_target_found(curr_name)
            return

        crawler = Crawler(start=curr_name, relurl=curr_vertex_info[1])
        if not crawler.has_soup():
            # A failed page with nothing else queued is treated as an
            # invalid start point; otherwise be tolerant and move on.
            if self._todo_queue.empty():
                with LOCK:
                    self._invalid_start_point = True
            return

        # each element of get_all_links() is a (name, url) pair
        for neighbour in crawler.get_all_links():
            name = neighbour[0]
            # Check and insert atomically so that two workers cannot both
            # claim the same page and overwrite its parent link.
            with LOCK:
                is_new = (name not in self._reached and
                          name not in curr_name)
                if is_new:
                    self._nx_digraph.add_edge(curr_name, name)
                    self._reached[name] = curr_name
                    self._pending += 1
            if not is_new:
                continue
            self._todo_queue.put(neighbour)
            if self._end in name:
                self._mark_target_found(name)
                return

    def worker(self):
        '''Worker loop: breadth-first expansion of the page graph.

        Repeatedly takes a page from the shared FIFO queue and visits it.
        The loop ends when a stop flag is set (target found, page limit
        reached, invalid start point) or when the queue is empty and no
        other worker is still processing a page.
        '''
        # local import: the file header only imports Queue from queue
        from queue import Empty

        while not self._should_stop():
            try:
                curr_vertex_info = self._todo_queue.get(
                    timeout=self._QUEUE_POLL_SECONDS)
            except Empty:
                # Nothing queued right now: stop if nobody can add more,
                # otherwise loop to re-check the stop flags.
                with LOCK:
                    exhausted = self._pending <= 0
                if exhausted:
                    break
                continue

            try:
                self._visit(curr_vertex_info)
            finally:
                # exactly one task_done() per successful get()
                self._todo_queue.task_done()
                with LOCK:
                    self._pending -= 1

        if settings.debug:
            debug_log('killing worker...')

    def run_search(self):
        '''Master: creates workers to do the graph search.

        Seeds the queue with the start page, runs the worker threads,
        waits for all of them and, if the target was found, builds the
        path (available through get_result()).
        '''
        # initialize bfs
        with LOCK:
            self._reached[self._start] = self._start
            self._pending += 1
        self._todo_queue.put((self._start, None))

        # apply BFS with the help of threads
        threads = []
        for _ in range(self._NUM_WORKER_THREADS):
            thr = threading.Thread(target=self.worker)
            # daemon threads never keep the interpreter alive on exit
            thr.daemon = True
            threads.append(thr)
            thr.start()

        # block until all workers are done
        for thr in threads:
            if settings.debug:
                debug_log('join thread [{}]'.format(thr.name))
            thr.join()

        # program goes here iff graph search is done
        if self.found_target:
            try:
                if self._reached[self._end]:
                    self.generate_path()
            except KeyError:
                log('Cannot reach {} while generating path'.format(self._end))

    def generate_path(self):
        '''Trace back from the target to the start and store the path.

        Follows the parent links in ``self._reached`` and saves the
        resulting list (start first) in ``self._path``.
        Will raise a KeyError if a page on the way has no parent link.
        '''
        curr_vertex = self._end
        path = []
        while curr_vertex != self._start:
            old_vertex = curr_vertex
            if settings.debug:
                debug_log(curr_vertex)
            path.append(curr_vertex)
            # TODO: change edge color of the graph such that the path is
            # different from other edges.
            curr_vertex = self._reached[old_vertex]
            # a page that is its own parent would loop forever
            if curr_vertex == old_vertex:
                if settings.debug:
                    debug_log('current vertex[{}] == old vertex[{}]'.
                              format(curr_vertex, old_vertex))
                break
        path.append(self._start)
        self._path = path[::-1]

    def get_result(self):
        '''Return a dict with the outcome of the search.

        Keys: 'path' (list of page names from start to target, or None if
        no path was generated), 'degree' (number of hops on the path, 0
        when there is no path) and 'graph' (the explored digraph).
        '''
        res = dict()
        res['path'] = self._path
        if self._path is None:
            res['degree'] = 0
        else:
            res['degree'] = len(self._path) - 1
        res['graph'] = self._nx_digraph
        return res