class SpiderMan:
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # print(html)
                # Extract page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the data with the data store
                self.output.store_data(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed")
        self.output.output_html()

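# The scheduler above (and most of the variants below) assumes a URL manager
# with this small interface. A minimal sketch, using set-based deduplication;
# the method names come from the calls above, the bodies are an assumption:
class UrlManager(object):
    """Tracks URLs still to crawl (new) and URLs already crawled (old)."""

    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        # Move one URL from the new set to the old set and return it.
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_url(self, url):
        # Skip URLs already seen, whether pending or crawled.
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls or []:
            self.add_new_url(url)

    def new_url_size(self):
        return len(self.new_urls)

    def old_url_size(self):
        return len(self.old_urls)
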
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url() and
               self.manager.old_url_size() < 50):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                if html is None:
                    print('failed to get page')
                    continue  # fixed: skip parsing when the download failed
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('has scraped %s links' % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed')
        self.output.output_html()

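# The scheduler above checks for a None page, so its downloader evidently
# returns None on failure. A minimal requests-based sketch under that
# assumption (the User-Agent string is illustrative, not from the source):
import requests

class HtmlDownloader(object):
    """Fetch a page and return its text, or None on any failure."""

    def download(self, url):
        if url is None:
            return None
        headers = {'User-Agent': 'Mozilla/5.0'}  # hypothetical UA
        try:
            r = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException:
            return None
        if r.status_code == 200:
            r.encoding = 'utf-8'
            return r.text
        return None
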
class SpiderWorker:
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        server_addr = '127.0.0.1'
        print('connect to %s...' % server_addr)
        self.m = BaseManager(address=(server_addr, 8001), authkey=b'qiye')
        self.m.connect()
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.htmlparser = HtmlParser()
        self.dataoutput = DataOutput()

    def crawl(self):
        while True:
            try:
                # fixed: empty is a method, and the intent is to poll for work
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        return None
                    print('parsing %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.htmlparser.parser(url, content)
                    self.result.put({'new_urls': new_urls})
                    self.dataoutput.output_mongo({'data': data})
            except Exception as e:
                print(e)

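# The worker above only connects to an existing BaseManager server. A minimal
# sketch of the control-node side it presumes: same port (8001) and authkey
# (b'qiye'), exposing the two registered queues. The seed URL is hypothetical.
from multiprocessing.managers import BaseManager
import queue
import time

task_q = queue.Queue()    # URLs for the workers
result_q = queue.Queue()  # parsed results coming back

def get_task_queue():
    return task_q

def get_result_queue():
    return result_q

class QueueManager(BaseManager):
    pass

QueueManager.register('get_task_queue', callable=get_task_queue)
QueueManager.register('get_result_queue', callable=get_result_queue)

if __name__ == '__main__':
    manager = QueueManager(address=('127.0.0.1', 8001), authkey=b'qiye')
    manager.start()
    m_task = manager.get_task_queue()
    m_result = manager.get_result_queue()
    m_task.put('http://example.com/')  # hypothetical seed URL
    print(m_result.get())              # block until a worker reports back
    m_task.put('end')                  # tell the workers to stop
    time.sleep(2)                      # give workers a moment to see the sentinel
    manager.shutdown()
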
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def start(self, url, numMax=50):
        self.manager.addUrl(url)
        num = 0
        errorsNum = 0
        while self.manager.sizeofNew() != 0 and num < numMax:
            try:
                num = num + 1
                url = self.manager.getUrl()
                print('%d\n %s' % (num, url))
                html = self.downloader.download(url)
                newUrls, data = self.parser.parser(url, html)
                self.output.addData(data)
                if self.manager.sizeofNew() + self.manager.sizeofOld() < numMax:
                    self.manager.addUrls(newUrls)
                print(data['title'])
            except:
                num = num - 1
                errorsNum = errorsNum + 1
                print('crawl failed %d' % errorsNum)
        self.output.outputData()

class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownLoader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been fetched
        while (self.manager.has_new_url() and
               self.manager.old_url_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print(self.manager.old_url_size())
                print(data)
            except Exception as e:
                print('crawl failed')
        self.output.output_question()
        self.output.output_answer()

class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawlOneTag(self, book_tag):
        page_num = 0
        book_list = []
        while page_num <= 2:
            try:
                new_url = self.manager.get_new_url(page_num, book_tag)
                html = self.downloader.download(new_url)
                book_list += self.parser.parser(html)
            except Exception as e:
                print("crawl failed")
            page_num += 1
        return book_list

    def crawlAllTags(self, book_tag_lists, topath):
        book_lists = []
        for book_tag in book_tag_lists:
            book_list = self.crawlOneTag(book_tag)
            book_list = sorted(book_list, key=lambda x: x[1], reverse=True)
            book_lists.append(book_list)
        self.output.output(book_lists, book_tag_lists, topath)

class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManger()
        self.downloader = HtmlDownload()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        # print(self.manager.new_url_size())
        # print(self.manager.old_urls_size())
        while (self.manager.has_new_url() and
               self.manager.old_urls_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                # print(html)
                # print('new url:', new_url)
                new_urls, data = self.parser.parser(new_url, html)
                # print("new_urls length:", len(new_urls))
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('Crawled %s links so far' % self.manager.old_urls_size())
            except Exception as e:
                print(e, 'Crawl failed')
        self.output.output_html()
        print("Saved to baike.html")

class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while (self.manager.has_new_url() and
               self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract page data with the HTML parser;
                # "meiyou" is this parser's sentinel for "nothing found"
                new_th, new_td, new_urls = self.parser.parser(new_url, html, "th", "时间", "td")
                # Add the extracted URLs to the URL manager
                if new_urls != "meiyou":
                    self.manager.add_new_urls(new_urls)
                    print("Crawled %s links so far" % self.manager.old_url_size())
                # Store the data with the data store
                if new_th != "meiyou" and new_td != "meiyou":
                    self.output.store_data(new_th, new_td)
                    self.output.output_html()
            except Exception as e:
                print("crawl failed!")

class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while there are new URLs and the crawled count is under 100
        while (self.manager.has_new_url() and
               self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                print('1-------->new_url', new_url)
                # Download the page
                html = self.downloader.download(new_url)
                print('2-------->html')
                # Parse the page and extract data
                new_urls, data = self.parser.parser(new_url, html)
                print('3-------->new_urls, data', new_urls, data)
                # Add the extracted URLs to the manager
                self.manager.add_new_urls(new_urls)
                print('4-------->new_urls', new_urls)
                # Store the data
                self.output.store_data(data)
                print('Crawled %d links so far' % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed %s' % e)
        # Write the stored data out in the target format
        self.output.output_html()

class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while (self.manager.has_new_url() and
               self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs one by one; passing the whole set to
                # add_new_url raised an unhashable-type error, since a set
                # cannot be hashed as if it were a single URL
                # self.manager.add_new_url(new_urls)
                for new_url in new_urls:
                    self.manager.add_new_url(new_url)
                # Store the data with the data store
                self.output.store_data(data)
                print('Crawled %s links so far' % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed with ' + str(e))

class SpiderMan(object):
    """Crawler scheduler."""

    def __init__(self):
        self.urlManager = UrlManager()
        self.htmlDownloader = HtmlDownloader()
        self.htmlParser = HtmlParser()
        self.htmlOutput = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.urlManager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while (self.urlManager.has_new_url() and
               self.urlManager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.urlManager.get_new_url()
                # Download the page with the HTML downloader
                html = self.htmlDownloader.download(new_url)
                # Extract page data with the HTML parser
                new_urls, data = self.htmlParser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.urlManager.add_new_urls(new_urls)
                # Store the data with the data store
                self.htmlOutput.store_data(data)
            except Exception as e:
                print(traceback.format_exc())
        # Write the stored data out in the target format
        self.htmlOutput.output_html()

class SpiderMan(object):
    def __init__(self):
        self.manger = UrlManger()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        print('crawl %s' % root_url)
        self.manger.add_new_url(root_url)
        # pdb.set_trace()
        while (self.manger.has_new_url() and
               self.manger.old_urls_size() < 100):
            try:
                new_url = self.manger.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manger.add_new_urls(new_urls)
                self.output.store_data(data)
                print('Has crawled %s links' % self.manger.old_urls_size())
            except Exception as e:
                print("crawl failed %s" % e)
                break
        self.output.output_html()

class Spider_Scheduler(object):
    def __init__(self):
        self.urlmanager = UrlQueue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Seed the entry URL
        self.urlmanager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while (self.urlmanager.has_new_url() and
               self.urlmanager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.urlmanager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.urlmanager.add_new_urls(new_urls)
                # Serialize the data in the data store
                self.output.data_to_list(data)
                print("Crawled %s links so far" % self.urlmanager.old_url_size())
            except Exception as e:
                print("crawl failed")
        # Write the stored data out in the target format
        self.output.output_html()

class SpiderMain(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while (self.manager.has_new_url() and
               self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the data with the data store
                self.output.store_data(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
                # print(new_url)
            except Exception as e:
                print("crawl failed")
        # Write the stored data out in the target format
        self.output.output_html()

class SpiderSchedule(object):
    '''
    Crawler scheduler: initializes each module, receives the entry URL
    through crawl(), and drives the modules through the crawl workflow.
    '''

    def __init__(self):
        self.manager = URLManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while there are new URLs and the crawled count is under 10
        while self.manager.has_new_url() and self.manager.old_urls_size() < 10:
            try:
                # 1. Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # 2. Hand the URL to HtmlDownloader to download
                html = self.downloader.download(new_url)
                # 3. Hand the downloaded page to HtmlParser to parse
                urls, data = self.parser.parser(new_url, html)
                # 4. Store the parsed data; feed the extracted URLs back to URLManager
                self.output.store_data(data)
                for url in urls:
                    self.manager.add_new_url(url)
                print('Crawled {0} links so far:'.format(self.manager.old_urls_size()), new_url)
            except Exception as e:
                print(e.args)
                print('crawl failed:', url)
        self.output.output_html()

class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 200 have been crawled
        while (self.manager.has_new_url() and
               self.manager.old_url_size() < 200):
            try:
                # Get a URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Put the extracted URLs into the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the extracted data
                self.output.store_data(data)
                print("Extracted %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed", e)
        self.output.output_html()

class SpiderWork(object):
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        server_addr = '127.0.0.1'
        print('Connect to server %s...' % server_addr)
        self.m = BaseManager(address=(server_addr, 8001), authkey='baike'.encode('utf-8'))
        self.m.connect()
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()
                    print(url)
                    if url == 'end':
                        print('Control node told this spider node to stop working...')
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print('Spider node is parsing: %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parser(url, content)
                    self.result.put({"new_urls": new_urls, "data": data})
            except EOFError as e:
                print("Failed to connect to the worker node")
                return
            except Exception as e:
                print(e)
                print('Crawl fail')

class SpiderWork:
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        server_addr = '127.0.0.1'
        print('Connect to server %s' % server_addr)
        self.m = BaseManager(address=(server_addr, 8001), authkey=b'baike')
        self.m.connect()
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        print('Control node told the spider to stop working')
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print('Spider is parsing %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parser(url, content)
                    self.result.put({'new_urls': new_urls, 'data': data})
            except EOFError:
                print('Failed to connect to the worker node')
                return
            except Exception as e:
                # fixed: print the caught exception, not the Exception class
                print(e)
                print('Crawl fail')

class SpiderWork(object):
    def __init__(self):
        # Initialize this worker node's connection in the distributed setup
        class QueueManager(BaseManager):
            pass

        # Step 1: register the names of the Queue getters with BaseManager
        QueueManager.register('get_task_queue')
        QueueManager.register('get_result_queue')
        # Step 2: connect to the server
        server_addr = ('192.168.10.128', 8004)
        print('Connect to server {}...'.format(server_addr))
        # Port and authkey must match the server process exactly
        self.m = QueueManager(address=server_addr, authkey='janson'.encode())
        # Connect over the network
        self.m.connect()
        # Step 3: get the Queue objects
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        # Initialize the downloader and parser
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        '''
        Scheduler loop for a distributed spider node.
        :return:
        '''
        while True:
            try:
                # When the task queue is not empty
                if not self.task.empty():
                    # Fetch a URL
                    url = self.task.get()
                    # An 'end' URL means the control node is shutting down
                    if url == 'end':
                        print('Control node told this spider node to stop working...')
                        # Relay the stop signal to the other nodes
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    else:
                        # Otherwise parse the page
                        print('Spider node is parsing: %s' % url)
                        # Download
                        content = self.downloader.download(url)
                        # Parse
                        new_urls, data = self.parser.parser(url, content)
                        # Push the result to the queue
                        self.result.put({"new_urls": new_urls, "data": data})
            except Exception as error:
                print('error-------->', error)
                print("Failed to connect to the worker node")
                return

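# A typical entry point for one of these worker classes; the control node
# must already be listening, or the constructor's connect() call will fail:
if __name__ == '__main__':
    spider = SpiderWork()
    spider.crawl()
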
class SpiderWork(object):
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        server_addr = '127.0.0.1'
        print('connect to server %s ...' % server_addr)
        self.m = BaseManager(address=(server_addr, 8001), authkey=b'baike')
        self.m.connect()
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.lock = 0  # count of in-flight worker threads
        self.start = time.time()
        print('init finish')

    def work_thread(self, url):
        # note: self.lock is a plain integer, so concurrent += / -= are racy
        self.lock += 1
        print('Spider node is parsing: %s' % url)
        # yield
        content = self.downloader.download(url)
        new_urls, data = self.parser.parser(url, content)
        self.result.put({'new_urls': new_urls, 'data': data})
        print('Spider node finished parsing: %s' % url)
        self.lock -= 1

    def gevent_print(self, s):
        print(s)

    def crawl(self):
        while True:
            try:
                if self.lock > 10:
                    # Too many in-flight threads; back off
                    time.sleep(1)
                elif not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        while self.lock:
                            time.sleep(0.1)
                        print('Control node told this spider node to stop working...')
                        print("using time:", time.time() - self.start)
                        self.task.put('end')
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    thread = threading.Thread(target=self.work_thread, args=(url,))
                    thread.start()
                    # time.sleep(0.1)
            except EOFError as e:
                print("Failed to connect to the worker node")
                return
            except Exception as e:
                print(e)
                print('Crawl fail')

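# The in-flight counter in the class above is updated from multiple threads
# without synchronization. A sketch of the usual fix, if one wanted it: a
# lock-guarded counter (SafeCounter is not part of the source, just an
# illustrative alternative to the bare int).
import threading

class SafeCounter(object):
    """Thread-safe in-flight counter."""

    def __init__(self):
        self._value = 0
        self._mu = threading.Lock()

    def inc(self):
        with self._mu:
            self._value += 1

    def dec(self):
        with self._mu:
            self._value -= 1

    @property
    def value(self):
        with self._mu:
            return self._value
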
class SpiderMan(object):
    def __init__(self):
        self.address = Get_Address()
        self.download = HtmlDownloader()
        self.parser = HtmlParser()

    def crawl(self):
        info = self.address.get_address()
        html_lists = self.download.download(info)
        tickets = self.parser.parser(html_lists)

class SpiderWork(object):
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        server_addr = '127.0.0.1'
        print('connect to server %s ...' % server_addr)
        self.m = BaseManager(address=(server_addr, 8001), authkey=b'baike')
        self.m.connect()
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.lock = 0  # count of in-flight greenlets
        print('init finish')

    def work_gevent(self, url):
        # note: monkey.patch_all() is usually called once at import time,
        # not per task as it is here
        monkey.patch_all()
        self.lock += 1
        print('Spider node is parsing: %s' % url)
        # yield
        content = self.downloader.download(url)
        new_urls, data = self.parser.parser(url, content)
        self.result.put({'new_urls': new_urls, 'data': data})
        print('Spider node finished parsing: %s' % url)
        self.lock -= 1

    def gevent_print(self, s):
        print(s)

    def crawl(self):
        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        while self.lock:
                            time.sleep(0.1)
                        print('Control node told this spider node to stop working...')
                        self.task.put('end')
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    g = gevent.spawn(self.work_gevent, url)
                    g1 = gevent.spawn(self.gevent_print, 1)
                    g2 = gevent.spawn(self.gevent_print, 2)
                    g.join()
                    time.sleep(0.1)
            except EOFError as e:
                print("Failed to connect to the worker node")
                return
            except Exception as e:
                print(e)
                print('Crawl fail')

class CrawlerMain(object):
    def __init__(self):
        self.G_STATE_OK = 200
        self.crawMaxNum = -1
        self.crawCountNum = 0
        self.urlManager = UrlManager()
        self.dispatch = Dispatch()
        self.htmlParser = HtmlParser("http://baike.baidu.com")
        self.applicationShow = ApplicationShow()

    def __crawl(self, url):
        """
        Set up a counter: if crawMaxNum > 0, counting is enabled and a count
        tracks how many fetches have run, stopping once the count exceeds
        crawMaxNum; otherwise no counting is done.
        """
        try:
            self.dispatch.launch_request(url)
            if self.dispatch.get_status() != self.G_STATE_OK:
                return
            context = self.dispatch.get_content()
            self.htmlParser.set_content(context)
            self.htmlParser.parser()
            summary = self.htmlParser.get_summary()
            title = self.htmlParser.get_title()  # fixed: the method was not being called
            urls = self.htmlParser.get_new_urls()
        except Exception as e:
            print("Error " + url + " " + str(e))
            return
        self.applicationShow.add(url, title, summary)
        self.urlManager.add_url(urls)

class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url():
            new_url = self.manager.get_new_url()
            html = self.downloader.download(new_url)
            new_urls, data = self.parser.parser(new_url, html)
            self.manager.add_new_urls(new_urls)

class Spider(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()

    def crawl(self, idi):
        rootloginurl = 'http://ygcp.njtech.edu.cn/User/LoginInSampleAction.aspx'
        pageurl = self.manager.url_login(idi)
        infourl = self.manager.url_userinfo(idi)
        htmlf, htmli = self.downloader.download(rootloginurl, idi, pageurl, infourl)
        xuehao, xingming, changpao, chenpao = self.parser.parser(
            infourl, pageurl, htmli, htmlf)
        print("Student ID: " + xuehao[0], "Name: " + xingming[0], changpao, chenpao)

class SpiderManager:
    def __init__(self):
        self.parser = HtmlParser()
        self.save = SaveData()

    def process(self):
        self.save.coneect()  # (sic) keeping the SaveData method's own spelling
        for page in range(10):
            url = "http://maoyan.com/board/4?offset=" + str(page * 10)
            res = requests.get(url)
            if res.status_code == 200:
                movies = self.parser.parser(res.text)
                for name, star, releasetime, nation in movies:
                    self.save.save(name, star, releasetime, nation)
        self.save.close()

class SpiderWork(object):
    def __init__(self):
        # Initialize this worker node's connection in the distributed setup
        # Step 1: register the names of the Queue getters with BaseManager
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        # Step 2: connect to the server
        server_addr = '127.0.0.1'
        print('Connect to server %s...' % server_addr)
        # Port and authkey must match the server process exactly
        self.m = BaseManager(address=(server_addr, 8001), authkey=b'baike')
        # Connect over the network
        self.m.connect()
        # Step 3: get the Queue objects
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        # Initialize the downloader and parser
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        print("Control node told this spider node to stop working...")
                        # Relay the stop signal to the other nodes
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print('Spider node is parsing: %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parser(url, content)
                    self.result.put({'new_urls': new_urls, 'data': data})
            except EOFError as e:
                # EOFError: the connection to the control node hit end-of-file
                print("Failed to connect to the worker node")
                return
            except Exception as e:
                print(e)
                print('Crawl fail')

class SpiderWork(object):
    def __init__(self):
        """
        initialization:
        1. initialize the distributed process and connect to the control node
        2. register BaseManager getters for the queues
        3. sync with the control node
        """
        # 1. register
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        # 2. connect to server
        server_addr = '192.168.65.176'
        print("connect to server %s..." % server_addr)
        # 3. port & authkey must match the control node
        self.m = BaseManager(address=(server_addr, 8001), authkey=b'baike')
        self.m.connect()
        # 4. get the Queue objects (url_q, result_q)
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        # 5. initialize downloader, parser
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        print('control node informed this spider node to stop working')
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print('spider node is parsing: %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parser(url, content)
                    self.result.put({"new_urls": new_urls, "data": data})
            except EOFError as e:
                print("worker node connection failed")
                return
            except Exception as e:
                print(e)
                print('Crawl fail')

class SpiderWorker2(object):
    def __init__(self):
        # Initialize this worker node's connection in the distributed setup
        # Step 1: register the names of the Queue getters with BaseManager
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        # Step 2: connect to the server
        server_addr = '127.0.0.1'
        print("[*]Connect to server %s..." % server_addr)
        # Port and authkey must match the server process exactly
        self.m = BaseManager(address=(server_addr, 8001), authkey=b'test')
        # Connect over the network
        self.m.connect()
        # Step 3: get the Queue objects
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        # Initialize the downloader and parser
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print("[*]Init finished.")

    def crawl(self):
        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        print("[*]Control Node informs all the Spider Nodes to stop working.")
                        # Relay the stop signal to the other nodes
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print("[*]The Spider Node is parsing: %s" % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parser(url, content)
                    self.result.put({'new_urls': new_urls, 'data': data})
            except EOFError as e:
                print("[-]Fail to connect to the Worker Node.")
                return
            except Exception as e:
                print(e)
                print("[-]Crawl failed.")

class SpiderMain(object):
    def __init__(self):
        self.manager = urlManager()
        self.parser = HtmlParser()
        self.downloader = HtmlDownloader()
        self.output = DataOutPut()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url() and
               self.manager.old_url_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('have got %s urls:' % self.manager.old_url_size())
            except:
                print('crawl failed')
        self.output.output_html()

class SpiderWork(object):
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        server_addr = '127.0.0.1'
        print('Connect to server %s...' % server_addr)
        self.m = BaseManager(address=(server_addr, 8001), authkey='qiye'.encode('utf-8'))
        print('connecting...')
        self.m.connect()
        print('connected')
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('spider init finish')

    def crawl(self):
        while True:
            try:
                # print(self.task)
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        print('stop...')
                        # Tell the other nodes to stop
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print('spider is working on %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parser(url, content)
                    self.result.put({'new_urls': new_urls, 'data': data})
            except EOFError as e:
                print('cannot connect to the other side')
                return
            except Exception as e:
                print(e)
                print('crawl fail')

class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url() and
               self.manager.old_url_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("already got %s urls" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed")
        self.output.output_html()
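
# A typical entry point for the single-process schedulers in this collection;
# the root URL here is just a placeholder, not taken from the source:
if __name__ == '__main__':
    spider = SpiderMan()
    spider.crawl('http://example.com/')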