import re
import time

# DownloadManager, WebPage, QueueDB, WebpageDB and FeedsDB are provided by the
# project's other modules (the downloader, page parser and database helpers).


class Crawler(object):
    def __init__(self):
        self.downloader = DownloadManager()
        self.webpage = None
        self.init_database()
        self.rules = {}

    def uninit_database(self):
        # Close all three databases once the crawl is finished.
        self.queue.close_db()
        self.webpagedb.close_db()
        self.feedsdb.close_db()

    def init_database(self):
        # Open the URL queue, the page store and the duplicate-check database.
        self.queue = QueueDB('queue.db', 0)
        self.webpagedb = WebpageDB('webpage.db')
        self.feedsdb = FeedsDB('duplcheck.db')

    def add_seeds(self, links):
        # Record the links for duplicate checking and push them onto the queue.
        self.feedsdb.add_urls(links)
        self.queue.push_urls(links)

    def add_rules(self, rules):
        # rules maps a URL pattern to the list of link patterns to follow on
        # pages whose URL matches it; compile everything up front.
        self.rules = {}
        for url, inurls in rules.items():
            print url
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                print u
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn
            print

    def get_patterns_from_rules(self, url):
        # Collect the link patterns of every rule whose URL pattern matches url.
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))

    def start(self):
        while 1:
            url = self.queue.pop_url()
            print url
            if url is None:
                print "crawling task is done."
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            #print error_msg, url, redirected_url, html
            if html is not None:
                # Store the page, extract its links and queue the ones that
                # match the rules for this URL.
                self.webpage = WebPage(url, html)
                self.webpagedb.html2db(url, html)
                self.webpage.parse_links()
                ruptn = self.get_patterns_from_rules(url)
                print ruptn
                links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
                self.add_seeds(links)
            # Pause between requests so the crawler does not hammer the server.
            self.mysleep(3)

    def mysleep(self, n):
        for i in range(n):
            time.sleep(1)
            print "sleep", i, "of", n
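A minimal driver sketch for the class above: it wires up a Crawler, seeds the queue, registers one rule and runs the crawl loop before closing the databases. The seed URL and the regular expressions are illustrative placeholders, not values from the project.

if __name__ == '__main__':
    crawler = Crawler()
    # Seed the queue with a starting URL (placeholder value).
    crawler.add_seeds(['http://example.com/'])
    # On pages whose URL matches the key, follow links matching the listed patterns
    # (both patterns here are assumptions for the sake of the example).
    crawler.add_rules({r'http://example\.com/.*': [r'http://example\.com/\d+\.html']})
    crawler.start()
    crawler.uninit_database()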