Simple_Crawler.py
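"""Simple multi-threaded web crawler coordinator.

SimpleCrawler seeds a shared work queue with a start URL and spawns
Crawler worker threads that share the queue, the visited/excluded URL
sets, the target domain, and the site's robots.txt parser.
"""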
from queue import Queue
from threading import Thread, Lock
from urllib import error
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

from Crawler import Crawler


class SimpleCrawler(Thread):
    def __init__(self, domain, no_crawlers=10):
        Thread.__init__(self)
        # Set the number of workers.
        if no_crawlers < 1:
            raise ValueError("Must have at least 1 crawler")
        self.no_crawlers = no_crawlers
        # Thread-safe FIFO queue used to schedule crawling.
        self.queue = Queue()
        # Keep a set of visited URLs to avoid recursive/duplicate crawling.
        self.visited_urls = set()
        # Keep a set of excluded URLs.
        self.excluded = set()
        # Lock used to synchronize access to the visited and excluded sets.
        self.mutex = Lock()
        self.crawlers = []
        # Check that the domain can be parsed and that robots.txt is reachable.
        if not domain:
            raise ValueError("Please provide a seed URL to crawl.")
        self.domain = domain
        try:
            self.target_domain = urlparse(domain).netloc
        except ValueError:
            raise ValueError("Incorrect URL")
        if not self.target_domain:
            raise ValueError("Incorrect URL")
        self.set_robot_parser()
        # Seed the queue with the start URL and mark it visited.
        self.queue.put(self.domain)
        self.visited_urls.add(self.domain)

    def set_robot_parser(self):
        """
        Given a domain, fetch its /robots.txt and record which URLs
        must not be visited by the crawler.
        """
        # Normalize the domain so "robots.txt" can be appended directly.
        if self.domain[-1] != '/':
            self.domain += '/'
        self.robotparser = RobotFileParser()
        # set_url() only stores the URL; no network I/O happens until read().
        self.robotparser.set_url(self.domain + "robots.txt")
        try:
            self.robotparser.read()
        except error.URLError:
            self.robotparser = None
            raise ValueError("Incorrect URL")
        except Exception:
            self.robotparser = None
            raise ValueError("Incorrect URL or no robots.txt exists.")

    def run(self):
        # Spawn the worker threads and start crawling.
        for i in range(self.no_crawlers):
            crawler = Crawler(i, self.queue, self.visited_urls, self.mutex,
                              self.excluded, self.target_domain, self.robotparser)
            self.crawlers.append(crawler)
            crawler.start()
        # Block until every queued URL has been processed.
        self.queue.join()
        # Enqueue one None sentinel per worker to signal shutdown.
        for _ in range(self.no_crawlers):
            self.queue.put(None)
        self.queue.join()
        # Wait for all worker threads to exit.
        for t in self.crawlers:
            t.join()
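

# Illustrative sketch (not part of the original module). The Crawler class
# lives in Crawler.py, which is not shown here; for the shutdown handshake
# in SimpleCrawler.run() to work, each worker presumably runs a loop of
# this shape, calling task_done() for every item it dequeues, including
# the None sentinel, so that both queue.join() calls can return:
#
#     def run(self):
#         while True:
#             url = self.queue.get()
#             if url is None:  # shutdown sentinel
#                 self.queue.task_done()
#                 break
#             try:
#                 self.crawl(url)  # hypothetical per-URL work
#             finally:
#                 self.queue.task_done()
#
# A minimal usage example, with a placeholder seed URL:
if __name__ == "__main__":
    crawler = SimpleCrawler("https://example.com/", no_crawlers=4)
    crawler.start()
    crawler.join()
    print(f"Visited {len(crawler.visited_urls)} URLs")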