class Crawler(): def __init__(self ): self.downloader = DownloadManager() self.webpage = None self.init_database() self.rules = {} def init_database(self): self.queue = QueueDB('queue.db') self.webpagedb = WebpageDB('webpage.db') self.duplcheck = DuplCheckDB('duplcheck.db') def add_seeds(self, links): new_links = self.duplcheck.filter_dupl_urls(links) self.duplcheck.add_urls(new_links) self.queue.push_urls(new_links) def add_rules(self, rules): self.rules = {} for url, inurls in rules.items(): reurl = re.compile(url) repatn = [] for u in inurls: repatn.append(re.compile(u)) self.rules[reurl] = repatn def get_patterns_from_rules(self,url): patns = [] for purl,ru in self.rules.items(): if purl.match(url)!= None: patns.extend(ru) return list(set(patns)) def start(self): while 1: url = self.queue.pop_url() print url if url == None: print "crawling task is done." break error_msg, url, redirected_url, html = self.downloader.download(url) #print error_msg, url, redirected_url, html if html !=None: self.webpagedb.html2db(url,html) self.webpage = WebPage(url,html) self.webpage.parse_links() ruptn = self.get_patterns_from_rules(url) print ruptn links = self.webpage.filter_links(tags = ['a'], patterns= ruptn) self.add_seeds(links) self.mysleep(3) def mysleep(self, n): for i in range(n): time.sleep(1) print "sleep",i,"of",n
def init_database(self):
    """Open the three persistent stores used by the crawler."""
    self.queue = QueueDB('queue.db')              # frontier of URLs awaiting download
    self.webpagedb = WebpageDB('webpage.db')      # archive of fetched HTML
    self.duplcheck = DuplCheckDB('duplcheck.db')  # set of already-seen URLs
class Crawler(object): def __init__(self): super(Crawler, self).__init__() self.downloader = DownloadManager() self.webpage = None self.init_database() self.rules = {} def init_database(self): self.queue = QueueDB('queue.db') self.webpagedb = WebpageDB('webpage.db') self.duplcheck = DuplCheckDB('duplcheck.db') def add_seeds(self, links): new_links = self.duplcheck.filter_dupl_urls(links) self.duplcheck.add_urls(new_links) self.queue.push_urls(new_links) def add_rules(self, rules): self.rules = {} for url, inurls in rules.items(): reurl = re.compile(url) repatn = [] for u in inurls: repatn.append(re.compile(u)) self.rules[reurl] = repatn def get_patterns_from_rules(self, url): patns = [] for purl, ru in self.rules.items(): if purl.match(url) != None: patns.extend(ru) return list(set(patns)) def getlinks(self, url, html): self.webpage = WebPage(url, html) self.webpage.parse_links() ruptn = self.get_patterns_from_rules(url) #print ruptn links = self.webpage.filter_links(tags=['a'], patterns=ruptn) return links def start(self): while 1: url = self.queue.pop_url() print url if url == None: print "crawling task is done." break error_msg, url, redirected_url, html = self.downloader.download( url) #print error_msg, url, redirected_url, html if html != None: self.webpagedb.html2db(url, html) links = self.getlinks(url, html) self.add_seeds(links) self.mysleep(3) def mysleep(self, n): for i in range(n): time.sleep(1) print "sleep", i, "of", n
def init_database(self):
    """Open the persistent stores used by this crawler variant."""
    # Second argument's meaning is not visible from here -- TODO confirm
    # against the QueueDB constructor.
    self.queue = QueueDB('queue.db', 0)
    self.webpagedb = WebpageDB('webpage.db')  # archive of fetched HTML
    # NOTE(review): a FeedsDB opened on 'duplcheck.db' -- that filename
    # suggests the duplicate-check store; confirm the intended file name
    # for the feeds database.
    self.feedsdb = FeedsDB('duplcheck.db')
class Crawler(): def __init__(self ): self.downloader = DownloadManager() self.webpage = None self.init_database() self.rules = {} self.files = [] self.file_rule = ".+" def init_database(self): self.queue = QueueDB('queue.db') self.webpagedb = WebpageDB('webpage.db') self.duplcheck = DuplCheckDB('duplcheck.db') self.repodb = RepoStateDB() def add_seeds(self, links): new_links = self.duplcheck.filter_dupl_urls(links) self.duplcheck.add_urls(new_links) self.queue.push_urls(new_links) def add_rules(self, rules): self.rules = {} for url, inurls in rules.items(): reurl = re.compile(url) repatn = [] for u in inurls: repatn.append(re.compile(u)) self.rules[reurl] = repatn def set_file_rule(self, rule): self.file_rule = rule def get_patterns_from_rules(self,url): patns = [] for purl,ru in self.rules.items(): if purl.match(url)!= None: patns.extend(ru) return list(set(patns)) def download_files(self, files): for f in files: #cmd = "wget --force-directories -c " + f + " -P " + config.repos_dir cmd = "wget -c " + f + " -P " + config.repos_dir ret_code = os.system(cmd) self.repodb.update(f, ret_code == 0) def start(self): while 1: url = self.queue.pop_url() print url if url == None: print "crawling task is done." break error_msg, url, redirected_url, html = self.downloader.download(url) # print error_msg, url, redirected_url, html if html !=None: self.webpagedb.html2db(url,html) self.webpage = WebPage(url,html) self.webpage.parse_links() ruptn = self.get_patterns_from_rules(url) #print ruptn links = self.webpage.filter_links(tags = ['a'], patterns= ruptn) print links self.add_seeds(links) file_pattern = [] file_pattern.append(re.compile(self.file_rule)) files = self.webpage.filter_links(tags = ['a'], patterns = file_pattern) self.files.append(files) #TODO: self.download_files(files) print files def mysleep(self, n): for i in range(n): time.sleep(1) print "sleep",i,"of",n