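"""A simple Wikipedia crawler supporting BFS, DFS, and keyword-focused BFS.

Crawled page content is stored on disk, and url metadata is recorded in a
SQLite database through the helpers in utils (make_soup, store_doc,
store_url, ...). Crawler state can be pickled and restored via
crawler_factory.
"""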
import os
from time import time

import utils
def crawler_factory(seed_url=None, method='BFS', load_path=None):
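    """Return a Crawler: restored from a pickled state file when load_path is
    given, otherwise freshly constructed from seed_url and a crawl method.
    Note that a restored crawler has no open file handle or database
    connection; init_storage() would need to be called before crawling again.
    """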
if load_path:
return utils.pickle_load(load_path)
else:
return Crawler(seed_url, method)
class Crawler:
def __init__(self, seed_url, method='BFS', if_store_doc=True, if_focused=False):
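        """Set crawl limits, storage paths, and the url frontier; the keyword
        list drives relevance filtering for the focused crawl."""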
self.seed_url = seed_url
self.depth = 1
self.depth_reached = 1
self.url_count = 0
self.duplicate_count = 0
self.level_end_str = '__level_ends__' #for BFS only
self.state_path = 'storage/state.pickle'
self.max_depth = 6
self.max_url_count = 1000
self.crawl_method = method
self.doc_paths = {
'BFS':'storage/docs_BFS/',
'DFS':'storage/docs_DFS/',
'BFS_focused':'storage/docs_BFS_focused/'
}
self.if_store_doc = if_store_doc
self.store_docs_at = self.doc_paths[self.crawl_method]
self.db_path = 'urls.db'
self.frontier = []
self.dfs_tree = {0:[self.seed_url]}
        self.focused = if_focused or self.crawl_method == 'BFS_focused' #the focused variant must filter pages by keywords
self.conn = None
self.url_file = None
        self.tname_lookup = {
            'BFS':'url_lookup_BFS',
            'DFS':'url_lookup_DFS',
            'BFS_focused':'url_lookup_BFS_f'
        }
self.table_name = self.tname_lookup[self.crawl_method]
self.url_prefix = 'https://en.wikipedia.org'
self.content_class = 'mw-parser-output'
self.sleep_time = 1
self.keywords = [
'Mars',
'Rover',
'Orbiter',
'Pathfinder',
'Mars Mission',
'Mars Exploration',
'Martian',
'explore',
'orbit',
'red planet']
self.init_storage()
if self.seed_url:
self.init_seed()
else:
print('no seed url')
def init_storage(self):
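        """Create the storage directories, open the url log file, and connect
        to the SQLite database that tracks visited urls."""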
        #create the storage/ directory and the per-method doc directory before opening files in them
        utils.create_doc_dir(self.crawl_method, self.doc_paths)
self.url_file = open('storage/urls_{}.txt'.format(self.crawl_method), 'a')
#create/connect to db file
os.chdir('storage')
self.conn = utils.create_conn(self.db_path)
os.chdir('..')
utils.create_table(self.conn, self.table_name) #create url table
def init_seed(self):
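        """Fetch the seed page, fill the frontier with its outgoing links,
        and store the seed document and url."""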
#get frontier
seed_soup = utils.make_soup(self.seed_url, class_val=self.content_class)
self.frontier = utils.get_page_urls(seed_soup, url_prefix=self.url_prefix)
#store seed page
seed_hash_val = utils.hash_url(self.seed_url)
seed_doc_path = utils.store_doc(seed_hash_val, seed_soup, self.store_docs_at, if_store=self.if_store_doc)
utils.store_url(self.conn,
self.table_name,
seed_hash_val,
self.seed_url,
seed_doc_path,
url_file=self.url_file)
        self.depth += 1
def pickle_self(self):
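        """Write summary statistics to the url log, release unpicklable
        resources (the open file handle and database connection), and pickle
        the crawler state to disk."""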
        self.total_time = time() - self.t0
        self.url_file.write('\nUrl count: {}\nDuplicate count: {}'.format(self.url_count, self.duplicate_count))
        self.url_file.write('\nTime taken: {}'.format(self.total_time))
        if self.depth_reached > self.max_depth:
            self.depth_reached -= 1
        self.url_file.write('\nDepth reached: {}'.format(self.depth_reached))
        self.url_file.close()
        self.url_file = None
        self.conn.close()
        self.conn = None #sqlite connections cannot be pickled, so drop the reference
utils.pickle_dump(self.state_path, self)
def BFS(self):
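        """Breadth-first crawl: pop urls from the front of the frontier and
        append newly discovered links to the back. A sentinel string marks
        the end of each depth level so depth can be tracked."""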
self.frontier.append(self.level_end_str)
while True:
#get current url
if len(self.frontier) != 0:
url = self.frontier.pop(0)
else:
self.pickle_self()
break #end if no more url in frontier
#check to break
if self.depth > self.max_depth or self.url_count >= self.max_url_count:
self.pickle_self()
break #end if reach max
#check to increment depth
if url == self.level_end_str:
self.frontier.append(self.level_end_str)
self.depth += 1
#track total depth
if self.depth > self.depth_reached:
self.depth_reached = self.depth
continue
#do crawl
hash_val = utils.hash_url(url)
            if utils.check_unique(self.conn, self.table_name, hash_val): #query db to check if the url is unique
doc_soup = utils.make_soup(url, class_val=self.content_class)
utils.delay(self.sleep_time)
if self.focused:
if_relevant = utils.check_relevant(doc_soup, self.keywords) #read the document and match key words
else:
if_relevant = True
if if_relevant:
doc_path = utils.store_doc(hash_val, doc_soup, self.store_docs_at, if_store=self.if_store_doc) #store document content on disk
utils.store_url(self.conn, self.table_name, hash_val, url, doc_path, url_file=self.url_file) #store url & path to doc content to db
self.frontier += utils.get_page_urls(doc_soup, url_prefix=self.url_prefix) #append urls in current page to frontier
self.url_count += 1
print('url count:', self.url_count)
else:
self.duplicate_count += 1
def DFS(self):
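        """Depth-first crawl: dfs_tree maps each depth to its list of pending
        urls. Crawling a page descends one level; an exhausted level
        backtracks one level up."""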
self.dfs_tree[1] = self.frontier
while True:
#check to break
if self.url_count >= self.max_url_count or self.depth < 0:
self.pickle_self()
break #end if reach max
#check to go back up a level
if self.depth > self.max_depth:
self.depth -= 1
            #get current url
            if len(self.dfs_tree[self.depth]) != 0:
                url = self.dfs_tree[self.depth].pop(0)
            else:
                self.depth -= 1 #if the current level is done, go back up a level
                continue #otherwise the loop would fall through and reprocess the previous url
#do crawl
hash_val = utils.hash_url(url)
            if utils.check_unique(self.conn, self.table_name, hash_val): #query db to check if the url is unique
doc_soup = utils.make_soup(url, class_val=self.content_class)
utils.delay(self.sleep_time)
if self.focused:
if_relevant = utils.check_relevant(doc_soup, self.keywords) #read the document and match key words
else:
if_relevant = True
if if_relevant:
doc_path = utils.store_doc(hash_val, doc_soup, self.store_docs_at, if_store=self.if_store_doc) #store document content on disk
utils.store_url(self.conn, self.table_name, hash_val, url, doc_path, url_file=self.url_file) #store url & path to doc content to db
self.depth += 1 #go down a level
#track total depth
if self.depth > self.depth_reached:
self.depth_reached = self.depth
self.dfs_tree[self.depth] = utils.get_page_urls(doc_soup, url_prefix=self.url_prefix) #create url list for lower level
self.url_count += 1
print('url count:', self.url_count)
else:
self.duplicate_count += 1
#implement for fun
def BFS_recursive(self):
pass
#implement for fun
def DFS_recursive(self):
pass
def crawl(self):
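        """Run the configured crawl method, timing the run and pickling the
        crawler state if an exception escapes."""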
try:
self.t0 = time()
if self.crawl_method == 'BFS':
self.BFS()
elif self.crawl_method == 'DFS':
self.DFS()
elif self.crawl_method == 'BFS_focused':
self.BFS()
print('finished {}\n'.format(self.crawl_method))
except Exception as e:
print(e)
self.pickle_self()
'''
_ooOoo_
o8888888o
88" . "88
(| -_- |)
O| = /O
____/`---'|____
.' ||| ||| `.
/ ||||| : ||||| \
/ _||||| -:- |||||- \
| | ||| - ||| | |
| |_| ''|---|'' | |
| .-|__ `-` ___|-. /
___`. .' /--.--| `. . __
."" '< `.___|_<|>_/___.' >'"".
| | : `- |`.;`| _ /`;.`/ - ` : | |
| | `-. |_ __| /__ _/ .-` / /
======`-.____`-.___|_____/___.-`____.-'======
`=---='
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
              Buddha bless: no bugs forever
'''
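
# main() is called below but was never defined in this file. A minimal sketch,
# assuming a Wikipedia article as the seed (the url below is only an example):
def main():
    crawler = crawler_factory(seed_url='https://en.wikipedia.org/wiki/Mars', method='BFS')
    crawler.crawl()
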
if __name__ == '__main__':
main()