def __init__(self, concurrent_num=20, crawl_tags=[], custom_headers={}, plugin=[],
             depth=10, max_url_num=3000, internal_timeout=60, spider_timeout=1800,
             dir_max_url=15, crawler_mode=0, same_origin=True, dynamic_parse=False,
             login_dict={}, scan_task_id=0):
    """
    concurrent_num   : number of crawlers and fetchers to run in parallel
    crawl_tags       : tags from which URLs are collected while crawling
    custom_headers   : custom HTTP request headers
    plugin           : list of custom plugins
    depth            : crawl depth limit
    max_url_num      : maximum number of URLs to collect
    internal_timeout : timeout for internal calls
    spider_timeout   : overall spider timeout
    dir_max_url      : maximum number of URLs collected per directory
    crawler_mode     : crawler model (0: thread pool, 1: gevent)
    same_origin      : whether to restrict crawling to the same origin
    dynamic_parse    : whether to use WebKit for dynamic parsing
    login_dict       : login credentials used by the WebKit auto-login
    scan_task_id     : identifier of the scan task
    """
    self.logger.setLevel(logging.DEBUG)
    hd = logging.StreamHandler()
    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    hd.setFormatter(formatter)
    self.logger.addHandler(hd)

    self.stopped = event.Event()
    self.internal_timeout = internal_timeout
    self.internal_timer = Timeout(internal_timeout)
    self.spider_stop_time = time() + spider_timeout

    self.crawler_mode = crawler_mode  # crawler model
    self.concurrent_num = concurrent_num
    self.fetcher_pool = pool.Pool(self.concurrent_num)
    if self.crawler_mode == 0:
        self.crawler_pool = threadpool.ThreadPool(
            min(50, self.concurrent_num))
    else:
        self.crawler_pool = pool.Pool(self.concurrent_num)

    # self.fetcher_queue = queue.JoinableQueue(maxsize=self.concurrent_num*100)
    self.fetcher_queue = threadpool.Queue(maxsize=self.concurrent_num * 10000)
    self.crawler_queue = threadpool.Queue(maxsize=self.concurrent_num * 10000)

    self.fetcher_cache = UrlCache()
    self.crawler_cache = UrlCache()

    self.default_crawl_tags = [
        'script', 'a', 'base', 'iframe', 'frame', 'object'
    ]
    self.ignore_ext = [
        'js', 'css', 'png', 'jpg', 'gif', 'bmp', 'svg', 'exif', 'jpeg',
        'exe', 'rar', 'zip', 'swf', 'ico'
    ]
    self.crawl_tags = list(set(self.default_crawl_tags) | set(crawl_tags))
    self.same_origin = same_origin
    self.depth = depth
    self.max_url_num = max_url_num
    self.dir_max_url = dir_max_url
    self.dynamic_parse = dynamic_parse
    if self.dynamic_parse:
        self.webkit = WebKit(login_dict)
        if login_dict:
            self.webkit.auto_login()
        # elif custom_headers.get('Cookie'):
        #     self.webkit.set_cookie(custom_headers)

    self.crawler_stopped = event.Event()
    self.plugin_handler = plugin  # plugins registered for use in the Crawler
    self.custom_headers = custom_headers
    self.scan_task_id = scan_task_id
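
# A usage sketch for the constructor above. The owning class is not shown in this
# listing, so the example only builds the keyword arguments; the class/module name
# ("Spider") is a hypothetical placeholder, while the keyword arguments themselves
# come from the signature shown above.
spider_kwargs = dict(
    concurrent_num=10,
    crawl_tags=['form', 'link'],             # merged with default_crawl_tags
    custom_headers={'User-Agent': 'my-scanner/1.0'},
    depth=5,
    max_url_num=1000,
    crawler_mode=1,                          # 1: gevent model, 0: thread pool
    dynamic_parse=True,                      # render pages with WebKit
    login_dict={'username': 'user', 'password': 'pass'},  # triggers webkit.auto_login()
    scan_task_id=42,
)
# spider = Spider(**spider_kwargs)           # "Spider" is a hypothetical class name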
def __init__(self, concurrent_num=20, crawl_tags=[], depth=3, max_url_num=300,
             internal_timeout=60, spider_timeout=6 * 3600, crawler_mode=0,
             same_origin=True, dynamic_parse=False):
    """
    concurrent_num   : number of crawlers and fetchers to run in parallel
    crawl_tags       : tags from which URLs are collected while crawling
    depth            : crawl depth limit
    max_url_num      : maximum number of URLs to collect
    internal_timeout : timeout for internal calls
    spider_timeout   : overall spider timeout
    crawler_mode     : crawler model (0: thread pool, 1: gevent)
    same_origin      : whether to restrict crawling to the same origin
    dynamic_parse    : whether to use WebKit for dynamic parsing
    """
    self.logger.setLevel(logging.DEBUG)
    hd = logging.StreamHandler()
    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    hd.setFormatter(formatter)
    self.logger.addHandler(hd)

    self.stopped = event.Event()
    self.internal_timer = Timeout(internal_timeout)

    self.crawler_mode = crawler_mode  # crawler model
    self.concurrent_num = concurrent_num
    self.fetcher_pool = pool.Pool(self.concurrent_num)
    if self.crawler_mode == 0:
        self.crawler_pool = threadpool.ThreadPool(
            min(50, self.concurrent_num))
    else:
        self.crawler_pool = pool.Pool(self.concurrent_num)

    # self.fetcher_queue = queue.JoinableQueue(maxsize=self.concurrent_num*100)
    self.fetcher_queue = threadpool.Queue(maxsize=self.concurrent_num * 100)
    self.crawler_queue = threadpool.Queue(maxsize=self.concurrent_num * 100)

    self.fetcher_cache = UrlCache()
    self.crawler_cache = UrlCache()

    self.default_crawl_tags = ['a', 'base', 'iframe', 'frame', 'object']
    self.ignore_ext = [
        'js', 'css', 'png', 'jpg', 'gif', 'bmp', 'svg', 'exif', 'jpeg',
        'exe', 'rar', 'zip'
    ]
    self.crawl_tags = list(set(self.default_crawl_tags) | set(crawl_tags))
    self.same_origin = same_origin
    self.depth = depth
    self.max_url_num = max_url_num
    self.dynamic_parse = dynamic_parse
    if self.dynamic_parse:
        self.webkit = WebKit()

    self.crawler_stopped = event.Event()
def __init__(self, concurrent_num=20, crawl_tags=[], custom_headers={}, plugin=[],
             depth=3, max_url_num=300, internal_timeout=60, spider_timeout=6 * 3600,
             crawler_mode=0, same_origin=True, dynamic_parse=False):
    """
    concurrent_num   : number of crawlers and fetchers to run in parallel
    crawl_tags       : tags from which URLs are collected while crawling
    custom_headers   : custom HTTP request headers
    plugin           : list of custom plugins
    depth            : crawl depth limit
    max_url_num      : maximum number of URLs to collect
    internal_timeout : timeout for internal calls
    spider_timeout   : overall spider timeout
    crawler_mode     : crawler model (0: thread pool, 1: gevent)
    same_origin      : whether to restrict crawling to the same origin
    dynamic_parse    : whether to use WebKit for dynamic parsing
    """
    # Logging setup
    self.logger.setLevel(logging.DEBUG)  # log level
    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s")  # log format
    hd = logging.StreamHandler()
    hd.setFormatter(formatter)
    self.logger.addHandler(hd)

    self.stopped = event.Event()
    self.internal_timeout = internal_timeout  # timeout for internal calls
    self.internal_timer = Timeout(internal_timeout)

    self.crawler_mode = crawler_mode      # crawler model
    self.concurrent_num = concurrent_num  # number of parallel crawlers and fetchers

    # The fetcher always uses the gevent model
    self.fetcher_pool = pool.Pool(self.concurrent_num)

    # Crawler model selection.
    # The crawler parses HTML and extracts URLs, which are fed to the fetcher;
    # the fetcher downloads HTML and feeds it back to the crawler.
    # (A simplified standalone sketch of this hand-off follows this constructor.)
    if self.crawler_mode == 0:
        # thread pool model
        self.crawler_pool = threadpool.ThreadPool(
            min(50, self.concurrent_num))
    else:
        # gevent model
        self.crawler_pool = pool.Pool(self.concurrent_num)

    # The fetcher and crawler work independently, linked only through queues
    # self.fetcher_queue = queue.JoinableQueue(maxsize=self.concurrent_num*100)
    self.fetcher_queue = threadpool.Queue(maxsize=self.concurrent_num * 10000)
    self.crawler_queue = threadpool.Queue(maxsize=self.concurrent_num * 10000)

    self.fetcher_cache = UrlCache()
    self.crawler_cache = UrlCache()

    # default tags from which URLs are collected while crawling
    self.default_crawl_tags = ['a', 'base', 'iframe', 'frame', 'object']
    # URL extensions ignored while crawling
    self.ignore_ext = [
        'js', 'css', 'png', 'jpg', 'gif', 'bmp', 'svg', 'exif', 'jpeg',
        'exe', 'rar', 'zip'
    ]
    # tags from which URLs are collected (defaults merged with user-supplied tags)
    self.crawl_tags = list(set(self.default_crawl_tags) | set(crawl_tags))
    self.same_origin = same_origin      # whether to restrict to the same origin
    self.depth = depth                  # crawl depth limit
    self.max_url_num = max_url_num      # maximum number of URLs to collect
    self.dynamic_parse = dynamic_parse  # whether to use WebKit for dynamic parsing
    # If dynamic parsing is enabled
    if self.dynamic_parse:
        self.webkit = WebKit()

    self.crawler_stopped = event.Event()
    self.plugin_handler = plugin  # plugins registered for use in the Crawler
    # custom HTTP headers
    self.custom_headers = custom_headers
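
# A minimal, self-contained sketch (an illustration, not the project's actual code) of the
# crawler/fetcher hand-off described in the comments above: two workers linked only by
# queues, where the fetcher downloads pages and the crawler extracts new URLs from them.
# The names and the fake "download"/"parse" steps here are placeholders.
import queue
import threading

fetcher_queue = queue.Queue()   # URLs waiting to be fetched
crawler_queue = queue.Queue()   # fetched HTML waiting to be parsed
seen = set()                    # simple de-duplication cache

def fetcher():
    while True:
        url = fetcher_queue.get()
        if url is None:                              # sentinel: shut down
            break
        html = "<a href='%s/next'></a>" % url        # stand-in for an HTTP GET
        crawler_queue.put((url, html))
        fetcher_queue.task_done()

def crawler(max_url_num=10):
    while len(seen) < max_url_num:
        url, html = crawler_queue.get()
        new_url = url + "/next"                      # stand-in for real HTML parsing
        if new_url not in seen:
            seen.add(new_url)
            fetcher_queue.put(new_url)               # feed new URLs back to the fetcher
        crawler_queue.task_done()

t = threading.Thread(target=fetcher, daemon=True)
t.start()
fetcher_queue.put("http://example.com")
crawler(max_url_num=5)
fetcher_queue.put(None)                              # stop the fetcher
t.join()
print(sorted(seen))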