crawler.py
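"""A focused web crawler.

Seeds the frontier with Google Custom Search results for a user-supplied query,
then crawls either best-first (a max-priority queue ordered by link promise plus
page relevance) or breadth-first (a plain FIFO queue), while respecting
robots.txt, a per-site page limit, and a global page limit.
"""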
import pprint
import logging
import urltools
import urllib
import queue
from urllib import robotparser
from urllib.parse import urlparse
from loggingconfig import LOGGING
from googleapiclient.discovery import build
import pqdict
from pqdict import maxpq
from downloader import Downloader
from calculator import Calculator
from Parser import Parser
from w2v import Relevance
# parameters for the Google Custom Search API:
API_KEY = "AIzaSyBsx7wuJfoHIA9VsWayDFZW-w7APu-4gps"
SEARCH_ENGINE_ID = '012502276015408778302:aanpptkeffi'
class Crawler():

    def __init__(self):
        self.query = input("Enter search query: ")
        self.webpages_limit = input("Set total number of webpages to be crawled: ")
        self.limit = input("Set a limit on how many webpages can be crawled from a single site: ")
        self.priority_queue = maxpq()
        self.queue = queue.Queue()
        self.downloader = Downloader()
        self.parser = Parser(self.query)
        self.calculator = Calculator(self.query)
        self.relevance = Relevance()
        self.webpages_crawled = 0
        self.logger = logging.getLogger(__name__)
        self.visited_urls = set()
        self.sites_times = {}
    # fetch the top 10 results from Google search:
    def __fetch_google_results(self):
        service = build("customsearch", "v1", developerKey=API_KEY)
        res = service.cse().list(
            q=self.query,
            cx=SEARCH_ENGINE_ID).execute()
        return res
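    # The Custom Search response is a plain dict; enqueue_seeds() below relies on
    # res['items'] being a list of results whose 'link' field is the result URL.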
    # enqueue the 10 google search results as seeds
    def enqueue_seeds(self):
        res = self.__fetch_google_results()
        for item in res['items']:
            self.priority_queue.additem(item['link'], 10)
            self.queue.put(item['link'])
            self.logger.debug("Enqueued: " + item['link'])
    # check whether this url has been visited before,
    # whether its site has reached the per-site limit,
    # and whether the Robots Exclusion Protocol allows crawling it
    def urlchecker(self, url):
        if url is None:
            return False
        normalized_url = urltools.normalize(url)
        robot_parser = urllib.robotparser.RobotFileParser()
        try:
            url_comp = urlparse(normalized_url)
            base_url = url_comp.scheme + "://" + url_comp.netloc + "/"
        except Exception:
            self.logger.error("Cannot parse: " + url)
            return False
        try:
            robot_parser.set_url(base_url + "robots.txt")
            robot_parser.read()
            if not robot_parser.can_fetch("*", normalized_url):
                self.logger.error(url + " is excluded by the robots exclusion protocol")
                return False
        except Exception:
            self.logger.error("Cannot determine robots exclusion protocol: " + url)
        if normalized_url in self.visited_urls:
            self.logger.debug(url + " has been visited before")
            return False
        elif base_url in self.sites_times and self.sites_times[base_url] > int(self.limit):
            self.logger.debug(url + " has reached the per-site crawl limit")
            return False
        elif 'cgi' in normalized_url:
            return False
        else:
            return True
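    # urltools.normalize is expected to canonicalize URLs (scheme/host case,
    # default ports, etc.) so that equivalent URLs collapse to one visited_urls entry.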
    # the crawling process: best-first search driven by the max-priority queue
    def crawl(self):
        try:
            harvest_rate_accum = 0
            while self.webpages_crawled < int(self.webpages_limit):
                print(self.webpages_crawled)
                try:
                    url = self.priority_queue.pop()
                except KeyError:
                    print("cannot pop")
                    raise
                print(url)
                if self.urlchecker(url):
                    rel = 0
                    content = None
                    try:
                        content = self.downloader.download(url).decode('utf-8')
                        if content is not None:
                            self.webpages_crawled += 1
                            rel = self.relevance.relevance(content, self.query)
                            harvest_rate_accum += rel
                            self.crawled_log(" Harvest rate: " + str(harvest_rate_accum / self.webpages_crawled))
                    except Exception:
                        print("Failed in downloading")
                    normalized_url = urltools.normalize(url)
                    try:
                        url_comp = urlparse(normalized_url)
                        base_url = url_comp.scheme + "://" + url_comp.netloc + "/"
                    except Exception:
                        self.logger.error("Cannot parse: " + url)
                        continue
                    if base_url in self.sites_times:
                        self.sites_times[base_url] += 1
                    else:
                        self.sites_times[base_url] = 1
                    self.visited_urls.add(normalized_url)
                    # skip link extraction on pages with low relevance to the query
                    if rel < 0.2:
                        continue
                    for link in self.parser.extract_all_links(content):
                        full_link = self.parser.parse_links(url, link)
                        if full_link is not None:
                            # priority = promise of the link itself + relevance of the parent page
                            link_promise = self.calculator.link_promise(full_link) + rel
                            try:
                                self.priority_queue.additem(full_link, link_promise)
                            except KeyError:
                                # the link is already in the priority queue
                                pass
        except KeyError:
            print("Queue is empty now")
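    # Note: the "harvest rate" logged above is accumulated relevance divided by the
    # number of pages crawled, i.e. the running average relevance of fetched pages.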
    # the crawling process: breadth-first search driven by the FIFO queue
    def bfs_crawl(self):
        try:
            harvest_rate_accum = 0
            while self.webpages_crawled < int(self.webpages_limit):
                print(self.webpages_crawled)
                try:
                    url = self.queue.get_nowait()
                except queue.Empty:
                    print("cannot pop")
                    break
                print(url)
                if self.urlchecker(url):
                    rel = 0
                    content = None
                    try:
                        content = self.downloader.download(url).decode('utf-8')
                        if content is not None:
                            self.webpages_crawled += 1
                            rel = self.relevance.relevance(content, self.query)
                            harvest_rate_accum += rel
                            self.crawled_log(" Harvest rate: " + str(harvest_rate_accum / self.webpages_crawled))
                    except Exception:
                        print("Failed in downloading")
                    normalized_url = urltools.normalize(url)
                    try:
                        url_comp = urlparse(normalized_url)
                        base_url = url_comp.scheme + "://" + url_comp.netloc + "/"
                    except Exception:
                        self.logger.error("Cannot parse: " + url)
                        continue
                    self.visited_urls.add(normalized_url)
                    if content is None:
                        continue
                    for link in self.parser.extract_all_links(content):
                        full_link = self.parser.parse_links(url, link)
                        if full_link is not None:
                            try:
                                # enforce the per-site limit before enqueueing
                                if base_url not in self.sites_times:
                                    self.sites_times[base_url] = 1
                                elif self.sites_times[base_url] < int(self.limit):
                                    self.sites_times[base_url] += 1
                                else:
                                    continue
                                self.queue.put(full_link)
                            except Exception:
                                pass
        except KeyError:
            print("Queue is empty now")
    def crawled_log(self, log):
        with open('demo.log', 'a') as file:
            file.write(log + '\n\n')
def main():
    crawler = Crawler()
    crawler.enqueue_seeds()
    bfs = input("BFS or not? (y/n): ")
    if bfs.lower().startswith('y'):
        crawler.bfs_crawl()
    else:
        crawler.crawl()
    statistics = ("Total requests: " + str(crawler.downloader.total_requests) +
                  " Total 404 encountered: " + str(crawler.downloader.total_failed))
    crawler.crawled_log(statistics)
if __name__ == '__main__':
    import logging.config
    logging.config.dictConfig(LOGGING)
    # logging.config.fileConfig('logging1.config', disable_existing_loggers=False)
    main()
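# Example session (hypothetical inputs):
#   $ python crawler.py
#   Enter search query: solar eclipse
#   Set total number of webpages to be crawled: 100
#   Set a limit on how many webpages can be crawled from a single site: 10
#   BFS or not? (y/n): n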