/
crawler.py
161 lines (140 loc) · 5.28 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
'''
A crawler used for fetching and parsing pages on Wikipedia.
'''
import http
import http.client
import re
from urllib.error import URLError
from urllib.parse import quote
from urllib.request import urlopen, Request

from bs4 import BeautifulSoup

from log import debug_log, log
import settings
class Crawler(object):
    '''Fetch one Wikipedia page and expose its title and outgoing links.

    The page is requested and parsed as soon as the object is created;
    call `has_soup()` to find out whether that succeeded before using
    `get_all_links()`.
    '''

    # class variables
    _BASE_URL = 'https://en.wikipedia.org'
    # Original (misspelled) name, kept so existing references keep working.
    _BASE_RUL = _BASE_URL
    _HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    # Valid article links start with /wiki/ and do not contain `:`.
    #   invalid: aaa/wiki/
    #            /wiki/aaa:bbb:
    #   valid  : /wiki/apple_pie
    #   valid  : /wiki/fruit/apple (not likely to occur)
    # Compiled once here instead of on every `get_all_links()` call.
    _ARTICLE_HREF = re.compile('^(/wiki/)((?!:).)*$')

    def __init__(self, start=None, relurl=None):
        '''Create the crawler and immediately download and parse the page.

        `relurl` can be None if the url is not known yet. Crawler
        will automatically generate the url from `start`.

        start: name of lemma at the starting point
        relurl: @NOTE: RELATIVE url, appended to the Wikipedia base url
        '''
        self._start = start
        self._url = None if relurl is None else self._BASE_URL + relurl
        if settings.debug:
            debug_log(self._url)
        self._soup = self._make_soup()

    def has_soup(self):
        '''check if the crawler has a non-empty soup'''
        return self._soup is not None

    def generate_full_url(self):
        '''Build the absolute article url for the start name.

        The name is percent-encoded (UTF-8), so non-ascii characters and
        spaces are kept in a form that is valid inside a url instead of
        being dropped, which would point at a different page.

        Return: the full url as an ascii-only string.
        '''
        return self._BASE_URL + '/wiki/' + quote(self._start)

    def _log_error(self, err_msg):
        '''Write `err_msg` to the log, and to the debug log in debug mode.'''
        if settings.debug:
            debug_log(err_msg)
        log(err_msg)

    def _make_soup(self):
        '''send request, handle response, and get soup object

        @NOTE: It will automatically generate url if url was
        originally passed in as `None`.

        Return:
            soup object if the page could be fetched and parsed.
            Otherwise the problem is logged and `None` is returned.
        '''
        if self._url is None:
            if self._start is None:
                # Nothing to build a url from; report it the same way as
                # any other page that cannot be opened.
                self._log_error('Cannot open the page: neither a name '
                                'nor a relative url was given')
                return None
            self._url = self.generate_full_url()
        if settings.debug:
            debug_log('get request...')
            debug_log('url is {}'.format(self._url))
        try:
            req = Request(self._url, headers=self._HEADERS)
            # The `with` block closes the connection once parsing is done.
            with urlopen(req) as response:
                return BeautifulSoup(response, "html5lib")
        except (URLError, UnicodeError, http.client.BadStatusLine) as e:
            # UnicodeError example: https://en.wikipedia.org/wiki/Kronkåsa
            # A url passed in with raw non-ascii characters cannot be sent;
            # both the encode and the decode flavour of the error land here.
            self._log_error('Cannot open the page with name {}: {}'.format(
                self._start, e))
            return None

    def get_all_links(self):
        '''get all the links in one page, aka get all edges of one node

        Return: a list of tuples where the first element is the lower-cased
                link text and the second element is the RELATIVE url
                (the `href` value). Empty if the page has no
                `bodyContent` element.
        Raise: AssertionError if the crawler has no soup.
        '''
        # Raised explicitly so the check still runs under `python -O`.
        if not self.has_soup():
            raise AssertionError('self._soup is None')
        body = self._soup.find(id='bodyContent')
        if body is None:
            self._log_error('Cannot find bodyContent in the page with '
                            'name {}'.format(self._start))
            return []
        links = []
        # The href filter only matches tags that have an href attribute.
        for link_tag in body.find_all('a', href=self._ARTICLE_HREF):
            text = link_tag.get_text()
            if text != 'ISBN':
                links.append((text.lower(), link_tag.get('href')))
        if settings.debug:
            debug_log('number of links: {}'.format(len(links)))
        return links

    def get_node_name(self):
        '''Get the page title, to be used as the name of this node.

        Return: the stripped text of the `firstHeading` element, or `None`
                (after logging) if it is missing, empty, or any other
                error occurs.
        '''
        try:
            title_node = self._soup.find(id="firstHeading")
            # A missing heading is a DOM change, not an internal error.
            if title_node is None:
                raise ValueError('Cannot find title')
            title = title_node.get_text().strip()
            if not title:
                raise ValueError('Cannot find title')
        except ValueError as e:
            log('Wikipedia change their DOM!')
            if settings.debug:
                debug_log(e)
            return None
        except Exception as e:
            # Deliberate catch-all: callers rely on getting `None` back
            # (e.g. when there is no soup at all).
            log('iternal error detected')
            if settings.debug:
                debug_log(e)
            return None
        else:
            return title
def _test(test_cases=('Mug',)):
    '''Smoke test: crawl each page in `test_cases` and print its title.

    This performs real network requests through `Crawler`.

    test_cases: iterable of lemma names to crawl, e.g.
                ('Mug', 'Star', 'Elephant', 'Cat', 'iphone', 'samsung').
                Defaults to the single page 'Mug'.
    '''
    for start in test_cases:
        test_crawler = Crawler(start)
        # get_all_links() refuses to run without a soup, so report a
        # failed download instead of crashing the whole run.
        if not test_crawler.has_soup():
            print('Cannot fetch the page with name {}'.format(start))
            continue
        test_crawler.get_all_links()
        name = test_crawler.get_node_name()
        print(name)
# Run the smoke test only when this file is executed as a script.
if __name__ == "__main__":
    _test()