import sys
import traceback

import psycopg2

# Project-local modules; import paths assumed from the surrounding repo layout.
from Config import Config
from UrlValidator import UrlValidator
import NetShelve


class CrawlerConfig(Config):
    def __init__(self):
        Config.__init__(self)
        self.UserAgentString = "UCI Inf141-CS121 crawler 63393716 32393047 22863530 82181685"

        # Politeness delay between successive requests to the same host.
        self.PolitenessDelay = 600

        # Timeout (seconds) for trying to get the next url from the frontier.
        self.FrontierTimeOut = 60

        # Timeout (seconds) for trying to get a free worker thread (worker is taking too long maybe?).
        self.WorkerTimeOut = 60

        # Timeout (seconds) for getting data from the output queue.
        self.OutBufferTimeOut = 60

        self.MaxQueueSize = 100
        self.urlValidator = UrlValidator(verbose=False)

        # Connection string read from db.conf; one connection for HandleData,
        # and a second one dedicated to the persistence shelve.
        self.dbConf = open('db.conf').read()
        self.conn = self.connectDatabase()
        print "Using Postgres shelve implementation..."
        self.PersistenceObject = NetShelve.PgShelve(self.connectDatabase())

    def connectDatabase(self):
        try:
            conn = psycopg2.connect(self.dbConf)
            print "Connected to database..."
            return conn
        except Exception:
            traceback.print_exc()
            print "Could not connect to database, exiting."
            print "Please close manually if it doesn't exit..."
            sys.exit(1)

    def GetSeeds(self):
        '''Returns the first set of urls to start crawling from.'''
        return ["http://www.ics.uci.edu/"]

    def HandleData(self, parsedData):
        '''Handles url data. Guaranteed to be thread safe.

        parsedData = {"url": "url", "text": "text data from html", "html": "raw html data"}

        Keep this function light; the data can be massaged later. Storing it is
        the priority.'''
        cur = None
        try:
            # Discard any failed transaction left on the connection before writing.
            self.conn.rollback()
            url = str(parsedData["url"])
            text = str(parsedData["text"].encode('utf-8'))
            cur = self.conn.cursor()
            query = cur.mogrify("UPDATE PAGES SET TEXT = %s WHERE URL = %s", (text, url))
            cur.execute(query)
            self.conn.commit()
            print "Saved data: " + parsedData["url"]
        except psycopg2.IntegrityError:
            traceback.print_exc()
            self.conn.rollback()
        except psycopg2.InterfaceError:
            # Connection was dropped; reconnect so later calls can retry.
            print "Connection reset"
            self.conn = self.connectDatabase()
        except Exception:
            print "Error saving URL: " + url
            traceback.print_exc()
            try:
                self.conn.rollback()
                print "Rolled back transaction"
            except Exception:
                print "Failed to rollback transaction"
        finally:
            if cur is not None:
                cur.close()

    def ValidUrl(self, url):
        '''Determines whether the url is a valid url that should be fetched.'''
        return self.urlValidator.allows(url)
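
# HandleData above only issues "UPDATE PAGES SET TEXT = %s WHERE URL = %s", so
# it assumes a PAGES table already exists and that rows for the crawled URLs
# were inserted by an earlier step. A minimal sketch of that schema, inferred
# from the query and not part of this file, might look like:
#
#   CREATE TABLE pages (
#       url  TEXT PRIMARY KEY,
#       text TEXT
#   );
#
# db.conf is read verbatim and handed to psycopg2.connect(), so it should hold
# a libpq-style connection string, for example (values are placeholders):
#
#   dbname=crawler user=crawler password=secret host=localhost port=5432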