class Spider(object):
    """
    concurrent_num   : number of crawler/fetcher workers to run in parallel
    crawl_tags       : tags whose URLs are collected while crawling
    custom_headers   : custom HTTP request headers
    plugin           : list of custom plugins (class names) to register
    depth            : crawl depth limit
    max_url_num      : maximum number of URLs to collect
    internal_timeout : timeout for internal calls
    spider_timeout   : overall spider timeout
    crawler_mode     : crawler model (0: multi-thread, 1: gevent)
    same_origin      : restrict crawling to the same origin
    dynamic_parse    : use WebKit to parse pages dynamically
    spider_type      : type of resource the spider collects (default 'img')
    """

    def __init__(self, concurrent_num=20, crawl_tags=[], custom_headers={}, plugin=[],
                 depth=3, max_url_num=300, internal_timeout=60, spider_timeout=6*3600,
                 crawler_mode=0, same_origin=True, dynamic_parse=False, spider_type='img'):
        self.stopped = event.Event()
        self.internal_timeout = internal_timeout
        self.internal_timer = Timeout(internal_timeout)

        self.crawler_mode = crawler_mode  # crawler model: 0 = multi-thread, 1 = gevent
        self.concurrent_num = concurrent_num
        self.fetcher_pool = pool.Pool(self.concurrent_num)  # pool of concurrently running fetcher workers
        if self.crawler_mode == 0:
            self.crawler_pool = threadpool.ThreadPool(min(50, self.concurrent_num))
        else:
            self.crawler_pool = pool.Pool(self.concurrent_num)

        self.fetcher_queue = threadpool.Queue(maxsize=self.concurrent_num * 10000)
        self.crawler_queue = threadpool.Queue(maxsize=self.concurrent_num * 10000)

        self.fetcher_cache = UrlCache()  # URL de-duplication caches
        self.crawler_cache = UrlCache()

        self.default_crawl_tags = ['a', 'base', 'iframe', 'frame', 'object']
        self.ignore_ext = ['js', 'css', 'png', 'jpg', 'gif', 'bmp', 'svg', 'exif',
                           'jpeg', 'exe', 'rar', 'zip']
        self.crawl_tags = list(set(self.default_crawl_tags) | set(crawl_tags))  # union of default and user-supplied tags
        self.same_origin = same_origin
        self.depth = depth  # crawl depth limit
        self.max_url_num = max_url_num
        self.dynamic_parse = dynamic_parse
        if self.dynamic_parse:
            self.webkit = WebKit()
        self.crawler_stopped = event.Event()

        self.plugin_handler = plugin  # plugins registered for use inside the crawler
        self.custom_headers = custom_headers
        self.unspider_url_list = []
        self.spider_type = spider_type

    def _start_fetcher(self):
        for i in xrange(self.concurrent_num):
            fetcher = Fetcher(self)
            self.fetcher_pool.start(fetcher)

    def _start_crawler(self):
        for _ in xrange(self.concurrent_num):
            self.crawler_pool.spawn(self.crawler)

    def start(self):
        logging.info("spider starting...")

        if self.crawler_mode == 0:
            logging.info("crawler run in multi-thread mode.")
        elif self.crawler_mode == 1:
            logging.info("crawler run in gevent mode.")

        self._start_fetcher()
        self._start_crawler()

        self.stopped.wait()  # wait for the stop event to be set

        try:
            self.internal_timer.start()
            self.fetcher_pool.join(timeout=self.internal_timer)
            if self.crawler_mode == 1:
                self.crawler_pool.join(timeout=self.internal_timer)
            else:
                self.crawler_pool.join()
        except Timeout:
            logging.error("internal timeout triggered")
        finally:
            self.internal_timer.cancel()

        self.stopped.clear()
        if self.dynamic_parse:
            self.webkit.close()

        # De-duplicate the unvisited URLs while preserving their original order.
        unspider_url_list = list(set(self.unspider_url_list))
        unspider_url_list.sort(key=self.unspider_url_list.index)

        redis_key = IMG_UNSPIDER_URL_KEY  # redis_key selects the data type this spider collects
        try:
            for url_link in unspider_url_list:
                with global_redis.pipeline() as pipe:
                    pipe.lpush(redis_key, url_link).ltrim(redis_key, 0, 100).expire(redis_key, 72000).execute()
        except:
            logging.info("store unspider url error!!")

        logging.info("crawler_cache:%s fetcher_cache:%s" % (len(self.crawler_cache), len(self.fetcher_cache)))
        logging.info("spider process quit.")

    def crawler(self, _dep=None):
        """Crawler worker: pull fetched pages, extract links, and enqueue them for the fetchers."""
        while not self.stopped.isSet() and not self.crawler_stopped.isSet():
            try:
                self._maintain_spider()  # maintain the spider pool
                url_data = self.crawler_queue.get(block=False)
            except queue.Empty, e:
                if self.crawler_queue.unfinished_tasks == 0 and self.fetcher_queue.unfinished_tasks == 0:
                    self.stop()
                else:
                    if self.crawler_mode == 1:
                        gevent.sleep()
            else:
                pre_depth = url_data.depth
                curr_depth = pre_depth + 1

                link_generator = HtmlAnalyzer.extract_links(url_data.html, url_data.url, self.crawl_tags)
                link_list = [url for url in link_generator]
                if self.dynamic_parse:
                    link_generator = self.webkit.extract_links(url_data.url)
                    link_list.extend([url for url in link_generator])
                link_list = list(set(link_list))

                for index, link in enumerate(link_list):
                    if not self.check_url_usable(link):
                        continue
                    if curr_depth > self.depth:  # maximum crawl depth check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break
                    if len(self.fetcher_cache) == self.max_url_num:  # maximum collected URL count check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break

                    link = to_unicode(link)
                    url = UrlData(link, depth=curr_depth)
                    self.fetcher_cache.insert(url)
                    self.fetcher_queue.put(url, block=True)

                for plugin_name in self.plugin_handler:  # dynamically invoke each plugin registered at init time
                    try:
                        plugin_obj = eval(plugin_name)()
                        plugin_obj.start(url_data)
                    except Exception, e:
                        import traceback
                        traceback.print_exc()

                self.crawler_queue.task_done()
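
# ---------------------------------------------------------------------------
# Illustrative plugin sketch (not part of the original module). The crawler
# resolves each registered plugin name with eval(plugin_name)() and then calls
# plugin_obj.start(url_data) for every crawled page, so a plugin only needs a
# no-argument constructor and a start() method. The class name and the fields
# read from url_data below are assumptions for demonstration purposes.
# ---------------------------------------------------------------------------
class DemoLoggingPlugin(object):
    """Hypothetical plugin: logs every crawled URL and its depth."""

    def start(self, url_data):
        # url_data carries at least .url, .depth and .html (as used by crawler()).
        logging.info("plugin saw %s at depth %s" % (url_data.url, url_data.depth))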
def url_read(url):
    """Grab HTML content from the url."""
    urlc = UrlCache(url)
    return urlc.read()
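
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): constructs a Spider with the parameters
# documented in the class docstring above. Only the constructor shown in this
# module is exercised; the seed-URL helper name below is an assumption, since
# the real entry point for feeding start URLs is defined elsewhere.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    # 10 concurrent fetchers/crawlers in gevent mode (crawler_mode=1),
    # crawl at most 2 levels deep, collect at most 100 URLs, stay on the
    # seed URL's origin, and register the DemoLoggingPlugin sketched above.
    spider = Spider(concurrent_num=10,
                    crawler_mode=1,
                    depth=2,
                    max_url_num=100,
                    same_origin=True,
                    custom_headers={'User-Agent': 'Mozilla/5.0'},
                    plugin=['DemoLoggingPlugin'])

    # Seeding and launching are left commented out because the seed helper is
    # an assumed name; the real project may expose a different method.
    # spider.feed_url('http://www.example.com/')
    # spider.start()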