Crawler.py
import sys
# Vendored dependencies shipped alongside this script
sys.path.append("crawler4py")
sys.path.append("psycopg2-2.6.1")

from Crawler4py.Crawler import Crawler
from Crawler4py.Config import Config
from UrlValidator import UrlValidator
import psycopg2
import traceback
import NetShelve


class CrawlerConfig(Config):
    def __init__(self):
        Config.__init__(self)
        self.UserAgentString = "UCI Inf141-CS121 crawler 63393716 32393047 22863530 82181685"
        # Politeness delay between successive requests (see the crawler4py
        # Config for the units it expects)
        self.PolitenessDelay = 600
        # Timeout (seconds) for trying to get the next url from the frontier
        self.FrontierTimeOut = 60
        # Timeout (seconds) for trying to get a free worker thread
        # (a worker may be taking too long)
        self.WorkerTimeOut = 60
        # Timeout (seconds) for getting data from the output queue
        self.OutBufferTimeOut = 60
        self.MaxQueueSize = 100
        self.urlValidator = UrlValidator(verbose=False)
        self.dbConf = open('db.conf').read()
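        # db.conf is assumed to hold a libpq/psycopg2 connection string; a
        # minimal sketch (placeholder values, not taken from this repository):
        #
        #   dbname=crawler user=crawler password=secret host=localhost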
        self.conn = self.connectDatabase()
        print "Using Postgres shelve implementation..."
        # A second, separate connection for the shelve (presumably so its
        # transactions do not interfere with page writes on self.conn)
        self.PersistenceObject = NetShelve.PgShelve(self.connectDatabase())

    def connectDatabase(self):
        try:
            conn = psycopg2.connect(self.dbConf)
            print "Connected to database..."
            return conn
        except Exception:
            traceback.print_exc()
            print "Could not connect to database, exiting."
            print "Please close manually if it doesn't exit..."
            sys.exit(1)

    def GetSeeds(self):
        '''Return the first set of urls to start crawling from.'''
        return ["http://www.ics.uci.edu/"]

    def HandleData(self, parsedData):
        '''Handle data for a fetched url. Guaranteed to be thread safe.
        parsedData = {"url" : "url", "text" : "text data from html", "html" : "raw html data"}
        Keep this function light: the data can be massaged later, and storing
        it promptly matters more.'''
        cur = None
        url = None
        try:
            # Clear any transaction left over from a previous failure
            self.conn.rollback()
            url = str(parsedData["url"])
            text = parsedData["text"].encode('utf-8')
            cur = self.conn.cursor()
            cur.execute("UPDATE PAGES SET TEXT = %s WHERE URL = %s", (text, url))
            self.conn.commit()
            print "Saved data: " + url
        except psycopg2.IntegrityError:
            traceback.print_exc()
            self.conn.rollback()
        except psycopg2.InterfaceError:
            print "Connection reset"
            self.conn = self.connectDatabase()
        except Exception:
            print "Error saving URL: " + str(url)
            traceback.print_exc()
            try:
                self.conn.rollback()
                print "Rolled back transaction"
            except Exception:
                print "Failed to rollback transaction"
        finally:
            if cur is not None:
                try:
                    cur.close()
                except Exception:
                    # Cursor may belong to a connection that has already died
                    pass
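
    # The UPDATE above assumes a matching PAGES row already exists for each
    # crawled url (presumably inserted elsewhere before the fetch). A minimal
    # schema consistent with that query might look like this (an assumption,
    # not taken from this repository):
    #
    #   CREATE TABLE PAGES (
    #       URL  TEXT PRIMARY KEY,
    #       TEXT TEXT
    #   );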

    def ValidUrl(self, url):
        '''Return True if the url should be fetched, False otherwise.'''
        return self.urlValidator.allows(url)


if __name__ == "__main__":
    crawler = Crawler(CrawlerConfig())
    print(crawler.StartCrawling())
    sys.exit(0)
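
# To run (assuming Python 2, a db.conf file in the working directory, and the
# vendored crawler4py and psycopg2-2.6.1 directories next to this script):
#
#   python Crawler.py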