class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Seed the URL manager with the entry URL.
        self.manager.add_new_url(root_url)
        # Keep crawling until there are no new URLs or 100 pages have been fetched.
        while self.manager.has_new_url() and self.manager.old_urls_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parse(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('Crawled %s links so far' % self.manager.old_urls_size())
            except Exception as e:
                print(e)
        self.output.output_html()
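# Minimal usage sketch for SpiderMan, assuming UrlManager, HtmlDownloader, HtmlParser
# and DataOutput are importable from the project's own modules; the seed URL is a
# hypothetical example entry page, not taken from the original code.
if __name__ == '__main__':
    spider = SpiderMan()
    spider.crawl('https://baike.baidu.com/item/Python')  # hypothetical seed URL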
from multiprocessing.managers import BaseManager


class SpiderWork(object):
    def __init__(self):
        # Initialize this worker node's connection for the distributed crawl.
        # Step 1: register the names of the methods that expose the shared queues.
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        # Step 2: connect to the control node. The port and authkey must match the
        # server's settings exactly.
        server_addr = '127.0.0.1'
        print('Connect to server %s...' % server_addr)
        self.m = BaseManager(address=(server_addr, 8001), authkey='baike'.encode('utf-8'))
        self.m.connect()
        # Step 3: obtain proxies for the task and result queues.
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        # Initialize the page downloader and parser.
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        print('The control node asked this spider node to stop...')
                        # Relay the stop signal so the other nodes shut down too.
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print('Spider node is parsing: %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parse(url, content)
                    self.result.put({'new_urls': new_urls, 'data': data})
            except EOFError:
                print('Lost the connection to the control node')
                return
            except Exception as e:
                print(e)
                print('Crawl failed')
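# A minimal sketch of the control node this worker connects to, using only the standard
# library's multiprocessing.managers.BaseManager. The queue names, port 8001 and the
# 'baike' authkey mirror the worker above; the seed URL is a hypothetical example.
from multiprocessing.managers import BaseManager
from queue import Queue

task_q = Queue()    # URLs waiting to be crawled
result_q = Queue()  # parsed results coming back from worker nodes


def get_task_queue():
    return task_q


def get_result_queue():
    return result_q


if __name__ == '__main__':
    # Expose the two queues under the names the worker registers and looks up.
    BaseManager.register('get_task_queue', callable=get_task_queue)
    BaseManager.register('get_result_queue', callable=get_result_queue)
    manager = BaseManager(address=('', 8001), authkey='baike'.encode('utf-8'))

    # Seed the task queue, then serve requests from worker nodes in this process.
    task_q.put('https://baike.baidu.com/item/Python')  # hypothetical seed URL
    print('Control node listening on port 8001 ...')
    manager.get_server().serve_forever()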
class SpideMan(object):
    def __init__(self):
        self.manager = URLManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parse(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("crawled %s links" % self.manager.old_url_size())
            except Exception as e:
                print(e)
        self.output.output_html()
import os
import time
import webbrowser
from shutil import move


def save_tokens(webpage, dir_name, dst_PATH, dwl_dir, sleep=5, verbose=True):
    os.makedirs(dst_PATH + dir_name, exist_ok=True)
    if verbose:
        print('============================')
        print(dir_name)
    # Image names sit inside <em> tags; image URLs sit in the "lightly" anchors.
    imgNameExtractor = HtmlExtractor(['<em>', '</em>'], ['<div class='],
                                     lambda x: x[x.index('>') + 1:x.index('</')])
    urlExtractor = HtmlExtractor(['<a class="lightly" href="h'], None, lambda x: x[25:-2])
    roll20Parser = HtmlParser([urlExtractor, imgNameExtractor])
    urlAndNames = roll20Parser.parse(webpage)
    links = urlAndNames[urlExtractor]
    names = urlAndNames[imgNameExtractor]
    assert len(links) == len(names)
    if verbose:
        print('============================')
        print('Found', len(links), 'extracted images')
        print('============================')
    for j in range(len(links)):
        url = links[j]
        # Open the image URL in the default browser and wait for the download to finish.
        webbrowser.get('windows-default').open_new_tab(url)
        time.sleep(sleep)
        _name = names[j]
        if verbose:
            print(j, '--', _name, '--', url)
        # Move the freshly downloaded file (name starting with 'max' and ending in 'g')
        # into the destination folder under the extracted image name.
        for _img in os.listdir(dwl_dir):
            if str(_img)[:3] == 'max' and str(_img)[-1] == 'g':
                move(dwl_dir + _img, dst_PATH + dir_name + '/' + _name + '.png')
    if len(os.listdir(dst_PATH + dir_name)) != len(links):
        print('WARNING: number of urls differs from final number of images')
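# A hedged usage sketch for save_tokens: it assumes parse() takes the raw HTML source as
# a string and that HtmlExtractor / HtmlParser come from the project's own module; every
# path and file name below is a hypothetical example.
with open('roll20_page.html', encoding='utf-8') as f:  # hypothetical saved page source
    page_source = f.read()

save_tokens(page_source, 'goblins',
            dst_PATH='C:/tokens/',             # hypothetical destination root
            dwl_dir='C:/Users/me/Downloads/',  # hypothetical browser download folder
            sleep=5, verbose=True)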
class Crawler(object):
    """Main crawler program."""

    def __init__(self, bind_domain):
        # URL manager that records which URLs have already been crawled.
        self.urlManager = UrlManager(enable_external_link=False, bind_domain=bind_domain)
        # Downloader that requests the pages.
        self.downloader = HtmlDownloader()
        # Parser that turns the HTML source into an lxml.html object and extracts new links.
        self.parser = HtmlParser()

    def craw(self, url):
        # Add the root page.
        self.urlManager.add_new_url(url)
        # Loop while the manager still holds unvisited URLs.
        while self.urlManager.has_new_url():
            # Fetch the next URL to crawl.
            request_url = self.urlManager.get_new_url()
            print("Requesting {0}".format(request_url))
            # Download the page.
            html_content = self.downloader.download(request_url)
            # Parse the HTML, select all <a> records and collect the new links.
            new_urls = self.parser.parse(request_url, html_content)
            # Hand the new links to the manager.
            self.urlManager.add_new_urls(new_urls)
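# Minimal usage sketch for Crawler, assuming UrlManager, HtmlDownloader and HtmlParser
# are importable from the project's own modules; the domain and seed URL are hypothetical
# examples of a crawl bound to a single site.
crawler = Crawler(bind_domain='example.com')
crawler.craw('https://example.com/')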