def crawl_items(self, data):
    """
    :param data: data passed in by the main routine, in the form
                 {'title': xxxx, 'url': [xxxx, xxxx, xxxx]}
    :return: None
    """
    manager = UrlManager()
    # Get the article title
    title = data.get('title')
    # Deduplicate by title
    if manager.remove_duplication(title):
        manager.add_new_urls(data.get('url'))
        # Download the image files
        while manager.has_new_url():
            print('Download started ==>', title)
            image_urls = manager.get_new_urls()
            # Number the file names with the sequence index
            for index, url in enumerate(image_urls):
                print('Downloading ==> image %s' % (index + 1))
                binary = self.downloader.download(url)
                self.output.save_2_binary(title, index + 1, binary)
            # Everything downloaded; record the dedup flag
            if not manager.has_new_url():
                manager.add_duplication(title)
                print('Download finished ==>')
    else:
        print('Duplicate | nothing to download ==>', title)
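# A minimal sketch of the dedup-flavored UrlManager that crawl_items above relies
# on. It is NOT the project's actual implementation: remove_duplication,
# add_duplication, and the plural get_new_urls are inferred purely from the call
# sites, so treat every detail here as an assumption.
class UrlManager:
    def __init__(self):
        self.new_urls = set()      # image URLs still to download
        self.done_titles = set()   # titles whose galleries finished downloading

    def remove_duplication(self, title):
        # True when this title has not been downloaded yet
        return title not in self.done_titles

    def add_duplication(self, title):
        # Mark the title as finished so a rerun skips it
        self.done_titles.add(title)

    def add_new_urls(self, urls):
        self.new_urls.update(urls or [])

    def has_new_url(self):
        return len(self.new_urls) > 0

    def get_new_urls(self):
        # Hand out the whole batch at once, as the for-loop above expects
        urls, self.new_urls = self.new_urls, set()
        return list(urls)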
def __init__(self):
    self.manager = UrlManager()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.dataoutput = DataOutput()
    self.mongoengine = Use_MongoEngine()
    self.urloutput = Url_info_Output()
def url_manager_proc(url_q, conn_q, root_url, num=6):
    """
    :param url_q: queue that receives individual URLs to crawl
    :param conn_q: queue that delivers sets of URLs parsed by worker nodes
    :param root_url: entry URL for the crawl
    :param num: number of crawled URLs after which the crawl stops
    :return: None
    """
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:
        while url_manager.has_new_url():
            print("# url_manager_proc puts the URL to crawl into url_q")
            new_url = url_manager.get_new_url()
            print(new_url)
            url_q.put(new_url)
            if url_manager.old_url_size() > num:
                # Tell the crawler nodes to finish
                url_q.put('end')
                print('Control node sent the finish notice!')
                # Shut down the manager node and persist the set state
                url_manager.save_progress()
                return
        try:
            if not conn_q.empty():
                print("# url_manager_proc takes urls from conn_q")
                urls = conn_q.get()
                print(urls)
                url_manager.add_new_urls(urls)
            else:
                # Back off briefly
                time.sleep(0.1)
        except Exception as e:
            print(e)
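# A minimal sketch of how the module-level url_manager_proc above can be wired
# up with multiprocessing queues. The fake_worker function and the queue topology
# are assumptions for illustration, not the project's real crawler-node code; the
# URLManager module path is borrowed from the demo script further below.
import time
from multiprocessing import Process, Queue
from URLManager import UrlManager  # assumed module path

def fake_worker(url_q, conn_q):
    # Stand-in for a crawler node: consume URLs until the 'end' sentinel arrives
    while True:
        url = url_q.get()
        if url == 'end':
            break
        # A real node would download and parse here, then push discovered URLs back
        conn_q.put({url + '#child'})

if __name__ == '__main__':
    url_q, conn_q = Queue(), Queue()
    worker = Process(target=fake_worker, args=(url_q, conn_q))
    worker.start()
    url_manager_proc(url_q, conn_q, 'http://example.com/', num=6)
    worker.join()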
def url_manager_proc(self, url_que, conn_que, root_url):
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:
        while url_manager.has_new_url():
            new_url = url_manager.get_new_url()
            # Send the new URL to a worker node
            url_que.put(new_url)
            print('old_url=', url_manager.old_urls_size())
            if url_manager.old_urls_size() > 2000:
                url_que.put('end')
                print('Control node sent the finish notice')
                # Shut down the manager node and persist the set state
                url_manager.save_progress('new_urls.txt', url_manager.new_urls)
                url_manager.save_progress('old_urls.txt', url_manager.old_urls)
                return
        # Add the urls received from result_solve_proc to the URL manager
        try:
            if not conn_que.empty():
                urls = conn_que.get()
                for url in urls:
                    url_manager.add_new_url(url)
            else:
                time.sleep(0.1)
        except Exception:
            pass
def __init__(self):
    self.manager = UrlManager()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()
def url_manager_proc(self, task_queue, url_queue, root_url):
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:
        if url_manager.has_new_url():
            new_url = url_manager.get_new_url()
            print('url: %s put into the task queue' % new_url)
            task_queue.put(new_url)
        if not url_queue.empty():
            next_url = url_queue.get()
            url_manager.add_new_url(next_url)
def crawl_image(self, start_url, total_page, __page=2):
    """
    Crawl the galleries of the Fengniao "Master" and "Technique" sections.
    :param start_url: URL of the article to download
    :param total_page: number of pages to download
    :param __page: internal cursor for the next page; users must not set it
    :return: None
    """
    manager = UrlManager()
    if 'image' in start_url or 'academy' in start_url:
        # Seed the entry URL
        manager.add_new_url(start_url)
        # Keep going while the URL manager holds new URLs
        while manager.has_new_url():
            try:
                # Fetch a new URL from the URL manager
                new_url = manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # The keyword tells second-level pages apart from first-level ones
                if 'slide' in new_url:
                    # Extract second-level page data with the HTML parser
                    data = self.parser.parse_data(html)
                    self.crawl_items(data)
                else:
                    # Extract first-level page URLs with the HTML parser
                    data = self.parser.parse_urls(html)
                    manager.add_new_urls(data)
            except Exception as e:
                print('Crawl failed ==>', e)
        # Crawl the remaining pages
        if __page <= total_page:
            if 'image' in start_url:
                next_url = ('%s/index.php?action=getList&class_id=192'
                            '&sub_classid=0&page=%s&not_in_id=' % (start_url, str(__page)))
            elif 'academy' in start_url:
                next_url = ('%s/index.php?action=getList&class_id=190'
                            '&sub_classid=0&page=%s&not_in_id=' % (start_url, str(__page)))
            print('Start crawling ==> page', str(__page))
            return self.crawl_image(next_url, total_page, __page + 1)
    else:
        print('The URL is wrong, please check it')
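# A hedged usage sketch for crawl_image above. Both the SpiderMan class name and
# the Fengniao entry URLs are assumptions, chosen only to satisfy the
# 'image'/'academy' keyword checks; substitute the real section URLs before running.
spider = SpiderMan()  # hypothetical name for the class that owns crawl_image
spider.crawl_image('http://image.fengniao.com', total_page=3)
spider.crawl_image('http://academy.fengniao.com', total_page=3)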
def url_manager_proc(self, url_q, conn_q, root_url):
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:
        while url_manager.has_new_url():
            new_url = url_manager.get_new_url()
            url_q.put(new_url)
            print('old_url=', url_manager.old_url_size())
            if url_manager.old_url_size() > 30:
                url_q.put('end')
                print('Control node sent the finish notice!')
                url_manager.save_progress('new_urls.txt', url_manager.new_urls)
                url_manager.save_progress('old_urls.txt', url_manager.old_urls)
                return
        try:
            urls = conn_q.get()
            url_manager.add_new_urls(urls)
        except Exception:
            time.sleep(0.1)
def url_manager_proc(self, url_q, conn_q, root_url):
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:
        while url_manager.has_new_url():
            new_url = url_manager.get_new_url()
            url_q.put(new_url)
            print('old_url=', url_manager.old_url_size())
            if url_manager.old_url_size() > 2000:
                url_q.put('end')
                print('Control node sent the notice')
                url_manager.save_progress('new_urls.txt', url_manager.new_urls)
                url_manager.save_progress('old_urls.txt', url_manager.old_urls)
                return
        try:
            if not conn_q.empty():
                urls = conn_q.get()
                url_manager.add_new_urls(urls)
        except Exception as e:
            print(e)
            time.sleep(0.1)
def url_manager_proc(self, url_q, conn_q, root_url, num=200):
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:
        while url_manager.has_new_url():
            new_url = url_manager.get_new_url()
            url_q.put(new_url)
            if url_manager.old_url_size() > num:
                # Tell the crawler nodes to finish
                url_q.put('end')
                print('Control node sent the finish notice!')
                # Shut down the manager node and persist the set state
                url_manager.save_progress()
                return
        # Out of new URLs, so pull more from conn_q
        try:
            if not conn_q.empty():
                urls = conn_q.get()
                url_manager.add_new_urls(urls)
            else:
                # Back off briefly
                time.sleep(0.1)
        except Exception as e:
            print(e)
def url_manager_proc(self, url_q, conn_q, root_url): """从conn_q队列获取新URL到URL管理器, 取URL放入url_q供爬虫节点获取""" url_manager = UrlManager() url_manager.add_new_url(root_url) while True: while (url_manager.has_new_url()): new_url = url_manager.get_new_url() url_q.put(new_url) logging.info("old_url_size = %s " % url_manager.old_url_size()) if url_manager.old_url_size() > 50: url_q.put("end") logging.info("控制节点发起结束通知") url_manager.save_process("new_urls.txt", url_manager.new_urls) url_manager.save_process("old_urls.txt", url_manager.old_urls) return try: if not conn_q.empty(): urls = conn_q.get() url_manager.add_new_urls(urls) except BaseException as e: time.sleep(0.1)
def url_manager_proc(self, url_q, conn_q, root_url): url_manager = UrlManager() url_manager.add_new_url(root_url) while True: while url_manager.has_new_url(): new_url = url_manager.get_new_url() print("url " + new_url) url_q.put(new_url) # print("old_url=",url_manager.old_url_size()) if url_manager.old_url_size() > 2000: url_q.put("end") print("控制节点发起结束通知!") url_manager.save_progress("new_urls.txt", url_manager.new_urls) url_manager.save_progress("old_urls.txt", url_manager.old_urls) return try: if not conn_q.empty(): urls = conn_q.get() # print(urls) url_manager.add_new_urls(urls) except BaseException: time.sleep(0.1)
def __init__(self):
    self.manager = UrlManager()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
def __init__(self):
    self.urlmanager = UrlManager()
from URLManager import UrlManager
import pickle
import hashlib

print("has_new_url", UrlManager.has_new_url.__doc__)
print("add_new_url", UrlManager.add_new_url.__doc__)
print("add_new_urls", UrlManager.add_new_urls.__doc__)
print("get_new_url", UrlManager.get_new_url.__doc__)
print("new_url_size", UrlManager.new_url_size.__doc__)
print("old_url_size", UrlManager.old_url_size.__doc__)
print("save_progress", UrlManager.save_progress.__doc__)
print("load_progress", UrlManager.load_progress.__doc__)

urls = set([
    "http://qq.ip138.com/tianqi/",
    "http://qq.ip138.com/shenfenzheng/",
    "http://qq.ip138.com/huoche/",
    "http://qq.ip138.com/daishoudian/mobile.htm",
    "http://www.miitbeian.gov.cn/"
])

urlmanager = UrlManager()
print(type(urls))
# Hand the new URL set to the urlmanager
urlmanager.add_new_urls(urls)
print(urlmanager.has_new_url())
# urlmanager yields one uncrawled URL, moving it into the crawled set at the
# same time; it returns None when no uncrawled URL is left
new_url = urlmanager.get_new_url()
print(new_url)
print(urlmanager.old_url_size())
# Save the progress
urlmanager.save_progress()
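# A minimal sketch of a UrlManager that satisfies the demo above. The pickle and
# hashlib imports in the demo suggest pickled progress files and hashed old-URL
# fingerprints; those details, and the progress file names, are assumptions, not
# the project's confirmed implementation.
import pickle
import hashlib

class UrlManager:
    def __init__(self):
        self.new_urls = self.load_progress('new_progress.p')  # uncrawled URLs
        self.old_urls = self.load_progress('old_progress.p')  # crawled fingerprints

    def has_new_url(self):
        """Whether any uncrawled URL remains."""
        return self.new_url_size() != 0

    def get_new_url(self):
        """Pop one uncrawled URL and record its fingerprint as crawled."""
        if not self.has_new_url():
            return None
        new_url = self.new_urls.pop()
        # Store a shortened md5 digest instead of the full URL to save memory
        m = hashlib.md5(new_url.encode('utf-8'))
        self.old_urls.add(m.hexdigest()[8:-8])
        return new_url

    def add_new_url(self, url):
        """Add a single URL if it is neither queued nor already crawled."""
        if url is None:
            return
        m = hashlib.md5(url.encode('utf-8'))
        if url not in self.new_urls and m.hexdigest()[8:-8] not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        """Add a batch of URLs."""
        for url in urls or []:
            self.add_new_url(url)

    def new_url_size(self):
        """Number of uncrawled URLs."""
        return len(self.new_urls)

    def old_url_size(self):
        """Number of crawled URLs."""
        return len(self.old_urls)

    def save_progress(self):
        """Pickle both sets so a later run can resume."""
        with open('new_progress.p', 'wb') as f:
            pickle.dump(self.new_urls, f)
        with open('old_progress.p', 'wb') as f:
            pickle.dump(self.old_urls, f)

    def load_progress(self, path):
        """Load a pickled set, or start empty when the file is missing."""
        try:
            with open(path, 'rb') as f:
                return pickle.load(f)
        except (FileNotFoundError, EOFError):
            return set()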
def __init__(self):
    self.manager = UrlManager()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()
    # Connect to the database at instantiation time
    self.s = Settings().setting
def __init__(self):
    # A spot I got wrong while practicing: leaving out the () raised an error
    self.manager = UrlManager()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParse()
    self.output = DataOutput()
def __init__(self):
    self.urlManager = UrlManager()
    self.downloader = HtmlDownloader()
    self.dataStore = SimpleHtmlDataStore()
    self.parser = BikeParser()