import time


class SpiderMan(object):
    # Initialize the class
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownLoader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and the crawl limit is not reached
        while (self.manager.has_new_url() and
               self.manager.old_url_size() < 20):
            time.sleep(1)
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # HTML downloader fetches the page
                html = self.downloader.download(new_url)
                # HTML parser extracts data from the page
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Data store saves the record
                self.output.store_data(data)
                print('%s links crawled so far' % self.manager.old_url_size())
            except Exception as e:  # Exception is the base class for ordinary errors
                print('crawl failed')
        # Data store writes the results out in the target format
        self.output.output_html()
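# A minimal sketch of the UrlManager interface every SpiderMan variant here
# relies on; only the method names come from those calls, while the set-based
# implementation below is an assumption.
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already crawled

    def has_new_url(self):
        return len(self.new_urls) > 0

    def get_new_url(self):
        # Move one URL from the new set to the old set and hand it out
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_url(self, url):
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls or []:
            self.add_new_url(url)

    def old_url_size(self):
        return len(self.old_urls)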
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self):
        # Seed the URL manager with the entry URLs
        self.manager.add_new_url(
            "https://www.amazon.cn/gp/bestsellers/books/ref=zg_bs_pg_1?ie=UTF8&pg=1"
        )
        self.manager.add_new_url(
            "https://www.amazon.cn/gp/bestsellers/books/ref=zg_bs_pg_2?ie=UTF8&pg=2"
        )
        while (self.manager.has_new_url() and
               self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # HTML downloader fetches the page
                html = self.downloader.download(new_url)
                # HTML parser extracts the book records from the page
                book_details = self.parser.parser(new_url, html)
                # Data store saves each record
                print(book_details)
                for book_detail in book_details:
                    self.output.store_book(book_detail)
            except Exception as e:
                print("crawl failed")
        self.output.output_html()
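# A hypothetical entry point (not in the original) showing how this variant,
# which seeds its own start URLs inside crawl(), would be launched.
if __name__ == "__main__":
    spider = SpiderMan()
    spider.crawl()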
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.output = DataOutput()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url() and
               self.manager.get_old_url_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                data = self.parser.parser(new_url, html)
                # self.manager.add_new_url(new_urls)
                self.output.store_data(data)
                # print('%s links crawled so far' % self.manager.get_old_url_size())
            except Exception as e:
                print('crawl exception %s' % e)
        self.output.output_html()
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and the crawl limit is not reached
        while (self.manager.has_new_url() and
               self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # HTML downloader fetches the page
                html = self.downloader.download(new_url)
                # HTML parser extracts the page data
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("%s links crawled so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed")
        # Data store writes the results out in the target format
        self.output.output_html()
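# A minimal sketch of the HtmlParser.parser() signature used above, returning
# (new_urls, data); the BeautifulSoup link/title extraction is an assumption,
# since the real selectors depend on the target site.
from urllib.parse import urljoin

from bs4 import BeautifulSoup


class HtmlParser(object):
    def parser(self, page_url, html):
        if page_url is None or html is None:
            return set(), None
        soup = BeautifulSoup(html, 'html.parser')
        # Collect absolute links found on the page
        new_urls = {urljoin(page_url, a['href'])
                    for a in soup.find_all('a', href=True)}
        # Use the page title as a stand-in for the real extracted record
        data = {'url': page_url,
                'title': soup.title.string if soup.title else ''}
        return new_urls, data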
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawlHTML(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager still has new URLs
        while self.manager.has_new_url():
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # HTML downloader fetches the page
                html = self.downloader.download(new_url)
                # HTML parser extracts the page data
                self.parser.parser(new_url, html)
            except Exception as e:
                print(e)
                print("crawl failed")
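# A minimal sketch of the HtmlDownloader.download() these snippets call; the
# requests-based body, headers, and timeout are assumptions, not original code.
import requests


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        # Send a browser-like User-Agent so the site serves normal pages
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            response.encoding = 'utf-8'
            return response.text
        return None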
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Drive the helper components: manage, download, parse, store
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url() and
               self.manager.old_url_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("%s links crawled so far" % self.manager.old_url_size())
            except Exception:
                print("crawl failed")
        self.output.output_html()
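# A minimal sketch of the DataOutput used above: store_data() buffers records
# and output_html() writes them into a table. The in-memory list and output
# file name are assumptions; the real column layout depends on the parser.
class DataOutput(object):
    def __init__(self):
        self.datas = []

    def store_data(self, data):
        if data is not None:
            self.datas.append(data)

    def output_html(self):
        with open('output.html', 'w', encoding='utf-8') as fout:
            fout.write('<html><body><table>')
            for data in self.datas:
                fout.write('<tr><td>%s</td></tr>' % data)
            fout.write('</table></body></html>')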
def __init__(self):
    self.manager = UrlManager()
    self.downloader = HtmlDownLoader()
    self.parser = HtmlParser()
    self.output = DataOutput()
import time


def url_manager_proc(self, url_q, conn_q, root_url):
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:
        while url_manager.has_new_url():
            new_url = url_manager.get_new_url()
            # Hand the new URL to the worker nodes
            url_q.put(new_url)
            print('old_url=', url_manager.old_url_size())
            if url_manager.old_url_size() > 2000:
                # Tell the worker nodes to finish
                url_q.put('end')
                print("control node issued the stop notification")
                # Persist both URL sets so the crawl can be resumed
                url_manager.save_progress('new_urls.txt', url_manager.new_urls)
                url_manager.save_progress('old_urls.txt', url_manager.old_urls)
                return
        # Add URLs received from the result handler back into the URL manager
        try:
            if not conn_q.empty():
                urls = conn_q.get()
                url_manager.add_new_urls(urls)
        except Exception:
            time.sleep(0.1)
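# A hypothetical wiring sketch (not in the original) showing how
# url_manager_proc could be started as the control-node process; NodeManager
# is an assumed class holding the method, and the queues stand in for
# whatever inter-node channel the real system uses.
from multiprocessing import Process, Queue

if __name__ == '__main__':
    url_q = Queue()   # control node -> workers: URLs to crawl
    conn_q = Queue()  # workers -> control node: newly discovered URLs
    node = NodeManager()
    manager_proc = Process(target=node.url_manager_proc,
                           args=(url_q, conn_q, 'http://example.com'))
    manager_proc.start()
    manager_proc.join()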