import re
import time
# DownloadManager, WebPage, QueueDB, WebpageDB and DuplCheckDB are the
# project's own helper classes (downloader, page parser and database wrappers).


class Crawler():
    def __init__(self):
        self.downloader = DownloadManager()
        self.webpage = None
        self.init_database()
        self.rules = {}

    def init_database(self):
        self.queue = QueueDB('queue.db')
        self.webpagedb = WebpageDB('webpage.db')
        self.duplcheck = DuplCheckDB('duplcheck.db')

    def add_seeds(self, links):
        # keep only unseen URLs, remember them, and queue them for crawling
        new_links = self.duplcheck.filter_dupl_urls(links)
        self.duplcheck.add_urls(new_links)
        self.queue.push_urls(new_links)

    def add_rules(self, rules):
        # rules maps a page-URL regex to the link patterns allowed on matching pages
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn

    def get_patterns_from_rules(self, url):
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) != None:
                patns.extend(ru)
        return list(set(patns))

    def start(self):
        while 1:
            url = self.queue.pop_url()
            print url
            if url == None:
                print "crawling task is done."
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            #print error_msg, url, redirected_url, html
            if html != None:
                self.webpagedb.html2db(url, html)
                self.webpage = WebPage(url, html)
                self.webpage.parse_links()
                ruptn = self.get_patterns_from_rules(url)
                print ruptn
                links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
                self.add_seeds(links)
            self.mysleep(3)

    def mysleep(self, n):
        for i in range(n):
            time.sleep(1)
            print "sleep", i, "of", n
# Crawler variant: all database work goes through a single OperatorDB, and
# extracted article fields are stored alongside the raw HTML.
class Crawler():
    def __init__(self):
        self.downloader = DownloadManager()
        self.webpage = None
        self.rules = {}
        self.dbop = OperatorDB()

    def add_seeds(self, links):
        self.dbop.add_seeds(links)

    def add_rules(self, rules):
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn

    def get_patterns_from_rules(self, url):
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) != None:
                patns.extend(ru)
        return list(set(patns))

    def start(self):
        while 1:
            try:
                url = self.dbop.pop_url()
                print "url: %s" % url
                if url == None:
                    print "crawling task is done."
                    break
                error_msg, url, redirected_url, html = self.downloader.download(url)
                #print error_msg, url, redirected_url, html
                if html != None:
                    self.webpage = WebPage(url, html)
                    article = self.webpage.extract()
                    if len(article) > 5:
                        addtime = "%s %s" % (article[1], article[2])
                        self.dbop.html2db(url, html, article[0], addtime,
                                          article[3], article[5])
                    else:
                        self.dbop.html2db(url, html)
                    print
                    self.webpage.parse_links()
                    ruptn = self.get_patterns_from_rules(url)
                    links = self.webpage.filter_links(tags=['a'], str_patterns=ruptn)
                    self.add_seeds(links)
                self.mysleep(3)
            except Exception, err:
                print "!!error!! Exception happened! %s %s" % (url, err)
        self.dbop.close()

    def mysleep(self, n):
        # same pacing helper as in the other variants
        for i in range(n):
            time.sleep(1)
            print "sleep", i, "of", n
# Variant with the link-extraction step factored out into getlinks().
class Crawler(object):
    def __init__(self):
        super(Crawler, self).__init__()
        self.downloader = DownloadManager()
        self.webpage = None
        self.init_database()
        self.rules = {}

    def init_database(self):
        self.queue = QueueDB('queue.db')
        self.webpagedb = WebpageDB('webpage.db')
        self.duplcheck = DuplCheckDB('duplcheck.db')

    def add_seeds(self, links):
        new_links = self.duplcheck.filter_dupl_urls(links)
        self.duplcheck.add_urls(new_links)
        self.queue.push_urls(new_links)

    def add_rules(self, rules):
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn

    def get_patterns_from_rules(self, url):
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) != None:
                patns.extend(ru)
        return list(set(patns))

    def getlinks(self, url, html):
        # parse the page and keep only links matching the rules for this URL
        self.webpage = WebPage(url, html)
        self.webpage.parse_links()
        ruptn = self.get_patterns_from_rules(url)
        #print ruptn
        links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
        return links

    def start(self):
        while 1:
            url = self.queue.pop_url()
            print url
            if url == None:
                print "crawling task is done."
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            #print error_msg, url, redirected_url, html
            if html != None:
                self.webpagedb.html2db(url, html)
                links = self.getlinks(url, html)
                self.add_seeds(links)
            self.mysleep(3)

    def mysleep(self, n):
        for i in range(n):
            time.sleep(1)
            print "sleep", i, "of", n
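# A minimal usage sketch, not part of the original source: it assumes the
# helper classes above are importable and that add_rules() receives a dict
# mapping a page-URL regex to the link patterns allowed on matching pages.
# The site and patterns below are purely illustrative.
if __name__ == '__main__':
    crawler = Crawler()
    crawler.add_rules({
        r'^(http|https)://www\.example\.com.*$':
            [r'^(http|https)://www\.example\.com/news/.+$'],
    })
    crawler.add_seeds(['http://www.example.com/'])
    crawler.start()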
import os
import config  # project configuration module providing repos_dir (download directory)

# Variant that additionally collects file links matching file_rule and can
# download them with wget, recording the result in a RepoStateDB.
class Crawler():
    def __init__(self):
        self.downloader = DownloadManager()
        self.webpage = None
        self.init_database()
        self.rules = {}
        self.files = []
        self.file_rule = ".+"  # by default, every link counts as a downloadable file

    def init_database(self):
        self.queue = QueueDB('queue.db')
        self.webpagedb = WebpageDB('webpage.db')
        self.duplcheck = DuplCheckDB('duplcheck.db')
        self.repodb = RepoStateDB()

    def add_seeds(self, links):
        new_links = self.duplcheck.filter_dupl_urls(links)
        self.duplcheck.add_urls(new_links)
        self.queue.push_urls(new_links)

    def add_rules(self, rules):
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn

    def set_file_rule(self, rule):
        self.file_rule = rule

    def get_patterns_from_rules(self, url):
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) != None:
                patns.extend(ru)
        return list(set(patns))

    def download_files(self, files):
        # fetch each file with wget and record whether the download succeeded
        for f in files:
            #cmd = "wget --force-directories -c " + f + " -P " + config.repos_dir
            cmd = "wget -c " + f + " -P " + config.repos_dir
            ret_code = os.system(cmd)
            self.repodb.update(f, ret_code == 0)

    def start(self):
        while 1:
            url = self.queue.pop_url()
            print url
            if url == None:
                print "crawling task is done."
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            #print error_msg, url, redirected_url, html
            if html != None:
                self.webpagedb.html2db(url, html)
                self.webpage = WebPage(url, html)
                self.webpage.parse_links()
                ruptn = self.get_patterns_from_rules(url)
                #print ruptn
                links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
                print links
                self.add_seeds(links)
                file_pattern = []
                file_pattern.append(re.compile(self.file_rule))
                files = self.webpage.filter_links(tags=['a'], patterns=file_pattern)
                self.files.append(files)
                #TODO: self.download_files(files)
                print files

    def mysleep(self, n):
        for i in range(n):
            time.sleep(1)
            print "sleep", i, "of", n
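# Illustrative sketch only (not from the original source): set_file_rule()
# stores a single regex string that start() compiles and hands to
# filter_links() to pick out downloadable file URLs. The archive pattern and
# the example.com seed below are assumptions for the example.
crawler = Crawler()
crawler.set_file_rule(r'^.+\.(tar\.gz|zip)$')
crawler.add_seeds(['http://www.example.com/downloads/'])
crawler.start()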
class Crawler():
    def __init__(self):
        self.downloader = DownloadManager()  # object that downloads pages
        self.webpage = None                  # object that parses pages
        self.initDatabase()
        self.rules = {}

    # initialize the databases
    def initDatabase(self):
        self.queue = QueueDB()  # the "todo" table of URLs still to crawl
        self.webpagedb = WebpageDB()
        self.duplcheck = DuplCheckDB()

    # add seed URLs
    # parameter: links, a list of URLs
    def addSeeds(self, links):
        new_links = self.duplcheck.filterDuplUrls(links)  # drop URLs we have already seen
        self.duplcheck.addUrls(new_links)                 # record the new ones as visited
        self.queue.pushUrls(new_links)                    # push them onto the todo queue

    def addRules(self, rules):
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn

    def get_patterns_from_rules(self, url):
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) != None:
                patns.extend(ru)
        return list(set(patns))

    # start crawling
    def start(self):
        while 1:
            url = self.queue.popUrl()
            print url
            if url == None:
                print "crawling task is done."
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            #print error_msg, url, redirected_url, html
            if html != None:
                self.webpagedb.storeHtmlToDb(url, html)  # store the page
                self.webpage = WebPage(url, html)        # parse the page
                self.webpage.parseLinks()                # collect all hyperlinks
                ruptn = self.get_patterns_from_rules(url)
                print ruptn
                links = self.webpage.filter_links(tags=['a'], patterns=ruptn)  # may return None
                if links:
                    self.addSeeds(links)
            self.mysleep(3)  # pause before crawling the next URL

    def mysleep(self, n):
        for i in range(1, n + 1):
            time.sleep(1)
            print "sleep", i, "of", n
import urlparse

url = "http://www.cnbeta.com/"
downloader = DownloadManager()
error_msg, url, redirected_url, html = downloader.download(url)
print "error_msg=%s" % error_msg
print "url=%s" % url
print "redirected_url=%s" % redirected_url

# save the raw page for inspection
f = open("www.cnbeta.com.html", 'w')
f.write(html)
f.close()

# parse the page and keep only links that stay on *.cnbeta.com
webpage = WebPage(url, html)
webpage.parse_links()
website = 'cnbeta\.com'
patnstr = '^(http|https)://(.*\.' + website + ')(.+)$'
links = webpage.filter_links(tags=['a'], str_patterns=[patnstr])
links.sort()

f_filter_links = open('filter_links_cnbeta.txt', 'w')
#print links
f = open('links_regged_cnbeta.txt', 'w')
for link in links:
    f_filter_links.write('%s\n' % link)
    f.write('%s\n' % link)
    # also record the anchor text of each matching link
    for elem, attr, lnk, pos in webpage.doc.iterlinks():
        absolute = urlparse.urljoin(webpage.url, lnk.strip())
        if absolute == link and elem.text:
            f.write('%s\n' % elem.text.encode('utf-8'))
f.close()
f_filter_links.close()