def crawler(self, _dep=None):
    '''Main crawler loop.'''
    while not self.stopped.isSet() and not self.crawler_stopped.isSet():
        try:
            self._maintain_spider()  # maintain the spider pool
            url_data = self.crawler_queue.get(block=False)
        except queue.Empty:
            if self.crawler_queue.unfinished_tasks == 0 and \
                    self.fetcher_queue.unfinished_tasks == 0:
                self.stop()
            elif self.crawler_mode == 1:
                gevent.sleep()
        else:
            pre_depth = url_data.depth
            curr_depth = pre_depth + 1
            link_generator = HtmlAnalyzer.extract_links(url_data.html, url_data.url,
                                                        self.crawl_tags)
            link_list = [url for url in link_generator]
            if self.dynamic_parse:
                link_generator = self.webkit.extract_links(url_data.url)
                link_list.extend([url for url in link_generator])
            link_list = list(set(link_list))
            for link in link_list:
                if not self.check_url_usable(link):
                    continue
                # URL similarity check; see urlfilter.py
                if not self.check_url_similar(link):
                    continue
                # URL duplicate check; see urlfilter.py
                if not self.check_url_repeat(link):
                    continue
                if curr_depth > self.depth:  # maximum crawl depth reached
                    if not self.crawler_stopped.isSet():
                        self.crawler_stopped.set()
                    break
                if len(self.fetcher_cache) == self.max_url_num:  # maximum number of collected URLs reached
                    if not self.crawler_stopped.isSet():
                        self.crawler_stopped.set()
                    break
                link = to_unicode(link)
                url = UrlData(link, depth=curr_depth)
                self.fetcher_cache.insert(url)
                self.fetcher_queue.put(url, block=True)
            for plugin_name in self.plugin_handler:  # invoke each plugin registered at init time
                try:
                    plugin_obj = eval(plugin_name)()
                    plugin_obj.start(url_data)
                except Exception:
                    import traceback
                    traceback.print_exc()
            self.crawler_queue.task_done()
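# The check_url_similar / check_url_repeat calls above are delegated to
# urlfilter.py, which is not included in this section. Below is a minimal
# sketch of the similarity idea only; every name in it is hypothetical and
# it is not the project's actual urlfilter.py. Two URLs count as "similar"
# when they share scheme, host, path and query-parameter names, so
# /item?id=1 and /item?id=2 collapse into one pattern and only the first
# one passes the check.
try:
    from urllib.parse import urlparse, parse_qsl  # Python 3
except ImportError:
    from urlparse import urlparse, parse_qsl      # Python 2

class SimilarUrlFilter(object):

    def __init__(self):
        self._seen_patterns = set()

    def check_url_similar(self, url):
        '''Return True the first time a URL "shape" is seen, else False.'''
        parts = urlparse(url)
        param_names = tuple(sorted(k for k, _ in parse_qsl(parts.query)))
        pattern = (parts.scheme, parts.netloc, parts.path, param_names)
        if pattern in self._seen_patterns:
            return False
        self._seen_patterns.add(pattern)
        return True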
def crawler(self, _dep=None):
    '''Main crawler loop.'''
    while not self.stopped.isSet() and not self.crawler_stopped.isSet():
        try:
            self._maintain_spider()  # maintain the spider pool
            url_data = self.crawler_queue.get(block=False)
        except queue.Empty:
            if self.crawler_queue.unfinished_tasks == 0 and \
                    self.fetcher_queue.unfinished_tasks == 0:
                self.stop()
            elif self.crawler_mode == 1:
                gevent.sleep()
        else:
            pre_depth = url_data.depth
            curr_depth = pre_depth + 1
            # link_generator = HtmlAnalyzer.extract_links(url_data.html, url_data.url, self.crawl_tags)
            link_generator = HtmlAnalyzer.extract_links_ithome(url_data.html)
            link_list = [url for url in link_generator]
            if self.dynamic_parse:
                link_generator = self.webkit.extract_links(url_data.url)
                link_list.extend([url for url in link_generator])
            link_list = list(set(link_list))
            for link in link_list:
                if not self.check_url_usable(link):
                    continue
                if curr_depth > self.depth:  # maximum crawl depth reached
                    if not self.crawler_stopped.isSet():
                        self.crawler_stopped.set()
                    break
                if len(self.fetcher_cache) == self.max_url_num:  # maximum number of collected URLs reached
                    if not self.crawler_stopped.isSet():
                        self.crawler_stopped.set()
                    break
                link = to_unicode(link)
                url = UrlData(link, depth=curr_depth)
                self.fetcher_cache.insert(url)
                self.fetcher_queue.put(url, block=True)
            for plugin_name in self.plugin_handler:  # invoke each plugin registered at init time
                try:
                    plugin_obj = eval(plugin_name)()
                    plugin_obj.start(url_data)
                except Exception:
                    import traceback
                    traceback.print_exc()
            self.crawler_queue.task_done()
def crawler(self, _dep=None):
    """Main crawler loop."""
    while not self.stopped.isSet() and not self.crawler_stopped.isSet():
        try:
            self._maintain_spider()  # maintain the spider pool
            url_data = self.crawler_queue.get(block=False)
        except queue.Empty:
            if self.crawler_queue.unfinished_tasks == 0 and \
                    self.fetcher_queue.unfinished_tasks == 0:
                self.stop()
            elif self.crawler_mode == 1:
                gevent.sleep()
        else:
            pre_depth = url_data.depth
            curr_depth = pre_depth + 1
            link_generator = HtmlAnalyzer.extract_links(url_data.html, url_data.url,
                                                        self.crawl_tags)
            link_list = [url for url in link_generator]
            if self.dynamic_parse:
                link_generator = self.webkit.extract_links(url_data.url)
                link_list.extend([url for url in link_generator])
            link_list = list(set(link_list))
            for link in link_list:
                if not self.check_url_usable(link):
                    continue
                if curr_depth > self.depth:  # maximum crawl depth reached
                    if not self.crawler_stopped.isSet():
                        self.crawler_stopped.set()
                    break
                if len(self.fetcher_cache) == self.max_url_num:  # maximum number of collected URLs reached
                    if not self.crawler_stopped.isSet():
                        self.crawler_stopped.set()
                    break
                link = to_unicode(link)
                url = UrlData(link, depth=curr_depth)
                self.fetcher_cache.insert(url)
                self.fetcher_queue.put(url, block=True)
            self.crawler_queue.task_done()
def crawler(self, _dep=None):
    '''Main crawler loop.'''
    while not self.stopped.isSet() and not self.crawler_stopped.isSet():
        try:
            self._maintain_spider()  # maintain the spider pool
            url_data = self.crawler_queue.get(block=False)
        except queue.Empty:
            if self.crawler_queue.unfinished_tasks == 0 and \
                    self.fetcher_queue.unfinished_tasks == 0:
                self.stop()
            elif self.crawler_mode == 1:
                gevent.sleep()
        else:
            # derive depth from the number of '/'-separated segments in the URL
            curr_depth = len(str(url_data).split('/')) - 2
            link_generator = HtmlAnalyzer.extract_links(url_data.html, url_data.url,
                                                        self.crawl_tags)
            link_list = list(link_generator)
            if self.dynamic_parse:
                link_generator = self.webkit.extract_links(url_data.url)
                link_list.extend([url for url in link_generator])
            link_list = list(set(link_list))
            for link in link_list:
                if not self.check_url_usable(link):
                    continue
                if curr_depth > self.depth:  # maximum crawl depth reached
                    if not self.crawler_stopped.isSet():
                        self.crawler_stopped.set()
                    break
                if self.fetched_url == self.max_url_num:  # maximum number of collected URLs reached
                    if not self.crawler_stopped.isSet():
                        self.crawler_stopped.set()
                    break
                url = UrlData(link, depth=curr_depth)
                self.fetcher_bf.insert_element(str(url))  # remember the URL in the Bloom filter
                self.fetched_url += 1
                self.fetcher_queue.put(url, block=True)
            self.crawler_queue.task_done()
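# Unlike the variants above, this one derives curr_depth from the URL string
# itself instead of carrying a depth counter on url_data: splitting on '/'
# yields ['http:', '', host, seg1, seg2, ...], so subtracting 2 counts the
# host plus the path segments. A quick check of what the expression yields
# (example URLs, not from the source):
for u in ('http://example.com',
          'http://example.com/a',
          'http://example.com/a/b'):
    print('%s -> %d' % (u, len(u.split('/')) - 2))
# http://example.com     -> 1
# http://example.com/a   -> 2
# http://example.com/a/b -> 3
# Note that a trailing slash adds one more segment ('http://example.com/'
# evaluates to 2), so depths computed this way are approximate.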
def feed(self, html):
    return HtmlAnalyzer.extractLinks(html, self.urlobj.url, self.charset)
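# The charset passed to extractLinks is sniffed by HtmlAnalyzer.detectCharSet
# inside the open() methods shown below; HtmlAnalyzer itself is not included
# in this section. The sketch below is an assumption about one plausible
# approach, reading the charset out of an HTML <meta> tag, and is not the
# project's actual code.
import re

_CHARSET_RE = re.compile(r'<meta[^>]+charset=["\']?([\w-]+)', re.IGNORECASE)

def detect_charset(html):
    '''Return the charset declared in an HTML <meta> tag, or None.'''
    match = _CHARSET_RE.search(html)
    return match.group(1) if match else None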
    # (fragment: tail of a worker method; its beginning is truncated in the source)
        return
    for link in self.feed(html):
        url = UrlObj(link, depth)
        self.spider.put(url)
    self.spider.remove_thread(self.thread_id)

def open(self, url):
    strategy = self.spider.strategy
    try:
        resp = requests.get(url, timeout=strategy.timeout)
    except requests.exceptions.RequestException:
        raise
    if resp.status_code != requests.codes.ok:
        resp.raise_for_status()
    charset = HtmlAnalyzer.detectCharSet(resp.text)
    if charset is not None:
        self.charset = charset
        resp.encoding = charset
    return resp.text

def feed(self, html):
    return HtmlAnalyzer.extractLinks(html, self.urlobj.url, self.charset)


class MySpider(object):

    def __init__(self, max_depth, max_count, root_url):
        self.spider = GeventSpider(max_depth=max_depth, max_count=max_count,
                                   root_url=root_url)

    def run(self):
        self.spider.run()
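# Hypothetical usage of the MySpider wrapper above (the argument values are
# illustrative, not from the source): crawl http://example.com at most three
# levels deep, collecting no more than 1000 URLs.
if __name__ == '__main__':
    spider = MySpider(max_depth=3, max_count=1000,
                      root_url='http://example.com')
    spider.run()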
    # (fragment: tail of a crawl method; its beginning is truncated in the source)
    self.logger.debug("successfully crawled '%s', <%d> urls", url, len(urltable))
    self.stop()

def open(self, url):
    strategy = self.spider.strategy
    try:
        resp = requests.get(url, headers=strategy.headers,
                            cookies=strategy.cookies,
                            timeout=strategy.timeout,
                            verify=strategy.ssl_verify)
    except requests.exceptions.RequestException:
        raise
    if resp.status_code != requests.codes.ok:
        resp.raise_for_status()
    charset = HtmlAnalyzer.detectCharSet(resp.text)
    if charset is not None:
        self.charset = charset
        resp.encoding = charset
    return resp.text

def feed(self, html):
    return HtmlAnalyzer.extractLinks(html, self.urlobj.url, self.charset)

def stop(self):
    self.spider.greenlet_finished.set()
    self.kill(block=False)


class TestSpider(unittest.TestCase):