Example #1
import time

class SpiderMan(object):
    def __init__(self):
        # Initialize the four collaborating components
        self.manager = UrlManager()
        self.downloader = HtmlDownLoader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)

        # Keep going while the URL manager has new URLs and
        # fewer than 20 URLs have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 20:
            time.sleep(1)
            try:
                # Fetch a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract the page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Persist the data with the data store
                self.output.store_data(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
            except Exception:  # Exception is the base class of common errors
                print("crawl failed")
        # Have the data store write the output file in the target format
        self.output.output_html()
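All of these examples lean on the same small UrlManager interface: add_new_url, add_new_urls, has_new_url, get_new_url, and old_url_size (Example #3 names the size accessor get_old_url_size, and Example #7 additionally reads the new_urls/old_urls attributes and calls save_progress). A minimal two-set sketch of that interface; the implementation below is an assumption, not the original code:

# A minimal UrlManager sketch, assuming a two-set design (pending vs. crawled URLs).
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already crawled

    def add_new_url(self, url):
        # Only queue a URL that has never been seen before
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls or []:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        # Move a URL from the pending set to the crawled set and return it
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def old_url_size(self):
        return len(self.old_urls)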
Example #2
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self):
        # Add the entry URLs
        self.manager.add_new_url(
            "https://www.amazon.cn/gp/bestsellers/books/ref=zg_bs_pg_1?ie=UTF8&pg=1"
        )
        self.manager.add_new_url(
            "https://www.amazon.cn/gp/bestsellers/books/ref=zg_bs_pg_2?ie=UTF8&pg=2"
        )

        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:
                # Fetch a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract the book details with the HTML parser
                book_details = self.parser.parser(new_url, html)
                # Hand each record to the data store
                print(book_details)
                for book_detail in book_details:
                    self.output.store_book(book_detail)
            except Exception as e:
                print("crawl failed:", e)
        self.output.output_html()
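Each example also assumes an HtmlDownloader whose download(url) returns the page source. A requests-based sketch; the User-Agent header, timeout, and UTF-8 encoding are illustrative assumptions:

import requests

# A minimal HtmlDownloader sketch built on requests.
class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        headers = {"User-Agent": "Mozilla/5.0"}  # assumed header
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            response.encoding = "utf-8"
            return response.text
        return None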
Example #3
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.output = DataOutput()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.get_old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                data = self.parser.parser(new_url, html)
                # self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                # print("Crawled %s links so far" % self.manager.get_old_url_size())
            except Exception as e:
                print('crawl exception %s' % e)
        self.output.output_html()
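The parser interface varies: Examples #1, #4, and #6 expect parser(url, html) to return a (new_urls, data) pair, while #2 and #3 expect a single value. A BeautifulSoup-based sketch of the two-value variant; the link pattern and extracted fields are hypothetical placeholders, since the target pages are not shown here:

import re
from urllib.parse import urljoin
from bs4 import BeautifulSoup

# A minimal HtmlParser sketch returning (new_urls, data); selectors are placeholders.
class HtmlParser(object):
    def parser(self, page_url, html):
        if page_url is None or html is None:
            return set(), None
        soup = BeautifulSoup(html, "html.parser")
        # Collect follow-up links; the href pattern is a hypothetical example
        new_urls = set()
        for link in soup.find_all("a", href=re.compile(r"/view/\d+")):
            new_urls.add(urljoin(page_url, link["href"]))
        # Extract whatever fields the concrete crawler needs
        data = {"url": page_url,
                "title": soup.title.string if soup.title else None}
        return new_urls, data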
Example #4
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Keep going while the URL manager has new URLs and
        # fewer than 100 URLs have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Fetch a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract the page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
            except Exception:
                print("crawl failed")
        # Have the data store write the output file in the target format
        self.output.output_html()
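DataOutput only needs store_data(data) to buffer records and output_html() to flush them to a file. A minimal sketch; the output file name and table layout are assumptions:

# A minimal DataOutput sketch; "output.html" and the table markup are assumed.
class DataOutput(object):
    def __init__(self):
        self.datas = []

    def store_data(self, data):
        if data is not None:
            self.datas.append(data)

    def output_html(self):
        with open("output.html", "w", encoding="utf-8") as fout:
            fout.write("<html><body><table>")
            for data in self.datas:
                fout.write("<tr><td>%s</td></tr>" % data)
            fout.write("</table></body></html>")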
Example #5
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawlHTML(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)

        # Keep going while the URL manager still has new URLs
        # (unlike the other examples, this one sets no upper bound)
        while self.manager.has_new_url():
            try:
                # Fetch a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract the page data with the HTML parser
                self.parser.parser(new_url, html)
            except Exception as e:
                print(e)
                print("crawl failed")
Example #6
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Seed the URL manager with the entry URL
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
            except Exception:
                print("crawl failed")
        self.output.output_html()
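For the variants whose crawl takes a root URL (Examples #1, #3, #4, and #6), running the crawler is one instantiation and one call; the entry URL below is a placeholder:

if __name__ == "__main__":
    spider = SpiderMan()
    spider.crawl("http://example.com/start")  # placeholder entry URL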
Example #7
import time

class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownLoader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def url_manager_proc(self, url_q, conn_q, root_url):
        # Control-node loop: hand new URLs to the crawler nodes via url_q
        # and collect the URLs they extract via conn_q
        url_manager = UrlManager()
        url_manager.add_new_url(root_url)
        while True:
            while url_manager.has_new_url():
                new_url = url_manager.get_new_url()
                url_q.put(new_url)
                print('old_url=', url_manager.old_url_size())
                if url_manager.old_url_size() > 2000:
                    # Notify the crawler nodes to shut down
                    url_q.put('end')
                    print("Control node sent the shutdown notice")
                    url_manager.save_progress("new_urls.txt", url_manager.new_urls)
                    url_manager.save_progress('old_urls.txt', url_manager.old_urls)
                    return
            # Once the pending URLs are drained, poll conn_q for more
            try:
                if not conn_q.empty():
                    urls = conn_q.get()
                    url_manager.add_new_urls(urls)
            except Exception:
                time.sleep(0.1)
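Example #7 is the control-node half of a distributed crawler: url_manager_proc pushes URLs to the crawler nodes through url_q and drains the URLs they extract from conn_q. One possible wiring with multiprocessing.Queue, sketched from the parameter names alone; the crawler-node processes that would consume url_q and feed conn_q are omitted:

from multiprocessing import Process, Queue

if __name__ == "__main__":
    url_q = Queue()    # control node -> crawler nodes: URLs to fetch
    conn_q = Queue()   # crawler nodes -> control node: newly extracted URLs
    node = SpiderMan()
    p = Process(target=node.url_manager_proc,
                args=(url_q, conn_q, "http://example.com/start"))  # placeholder root URL
    p.start()
    p.join()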