Example #1
File: spider.py Project: zhupite233/scaner
    def __init__(self,
                 concurrent_num=20,
                 crawl_tags=[],
                 custom_headers={},
                 plugin=[],
                 depth=10,
                 max_url_num=3000,
                 internal_timeout=60,
                 spider_timeout=1800,
                 dir_max_url=15,
                 crawler_mode=0,
                 same_origin=True,
                 dynamic_parse=False,
                 login_dict={},
                 scan_task_id=0):
        """
        concurrent_num    : 并行crawler和fetcher数量
        crawl_tags        : 爬行时收集URL所属标签列表
        custom_headers    : 自定义HTTP请求头
        plugin            : 自定义插件列表
        depth             : 爬行深度限制
        max_url_num       : 最大收集URL数量
        internal_timeout  : 内部调用超时时间
        spider_timeout    : 爬虫超时时间
        crawler_mode      : 爬取器模型(0:多线程模型,1:gevent模型)
        same_origin       : 是否限制相同域下
        dynamic_parse     : 是否使用WebKit动态解析
        """

        self.logger.setLevel(logging.DEBUG)
        hd = logging.StreamHandler()
        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        hd.setFormatter(formatter)
        self.logger.addHandler(hd)

        self.stopped = event.Event()
        self.internal_timeout = internal_timeout
        self.internal_timer = Timeout(internal_timeout)
        self.spider_stop_time = time() + spider_timeout
        self.crawler_mode = crawler_mode  # crawler model
        self.concurrent_num = concurrent_num
        self.fetcher_pool = pool.Pool(self.concurrent_num)
        if self.crawler_mode == 0:
            self.crawler_pool = threadpool.ThreadPool(
                min(50, self.concurrent_num))
        else:
            self.crawler_pool = pool.Pool(self.concurrent_num)
        # self.fetcher_queue = queue.JoinableQueue(maxsize=self.concurrent_num*100)
        self.fetcher_queue = threadpool.Queue(maxsize=self.concurrent_num *
                                              10000)
        self.crawler_queue = threadpool.Queue(maxsize=self.concurrent_num *
                                              10000)

        self.fetcher_cache = UrlCache()
        self.crawler_cache = UrlCache()

        self.default_crawl_tags = [
            'script', 'a', 'base', 'iframe', 'frame', 'object'
        ]
        self.ignore_ext = [
            'js', 'css', 'png', 'jpg', 'gif', 'bmp', 'svg', 'exif', 'jpeg',
            'exe', 'rar', 'zip', 'swf', 'ico'
        ]
        self.crawl_tags = list(set(self.default_crawl_tags) | set(crawl_tags))
        self.same_origin = same_origin
        self.depth = depth
        self.max_url_num = max_url_num
        self.dir_max_url = dir_max_url
        self.dynamic_parse = dynamic_parse
        if self.dynamic_parse:
            self.webkit = WebKit(login_dict)
            if login_dict:
                self.webkit.auto_login()
            # elif custom_headers.get('Cookie'):
            #
            #     self.webkit.set_cookie(custom_headers)

        self.crawler_stopped = event.Event()

        self.plugin_handler = plugin  # register the plugins used by the Crawler
        self.custom_headers = custom_headers
        self.scan_task_id = scan_task_id
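
A note on the signature above: `crawl_tags=[]`, `custom_headers={}`, `plugin=[]`, and `login_dict={}` are mutable default arguments, which Python evaluates once at function definition time and then shares across every call that omits them. In these constructors the defaults appear to be only read or stored, so this is more of a style flag than an active bug, but the behavior is worth knowing. The snippet below is a minimal standalone demonstration of the pitfall and of the usual None-sentinel workaround; it is not taken from the project.

# Standalone demonstration of the mutable-default pitfall (not project code).
def bad(tags=[]):
    tags.append('a')
    return tags

print(bad())  # ['a']
print(bad())  # ['a', 'a'] -- the same list object is reused across calls

# The common workaround is a None sentinel:
def good(tags=None):
    tags = [] if tags is None else tags
    tags.append('a')
    return tags

print(good())  # ['a']
print(good())  # ['a']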
Example #2
    def __init__(self,
                 concurrent_num=20,
                 crawl_tags=[],
                 depth=3,
                 max_url_num=300,
                 internal_timeout=60,
                 spider_timeout=6 * 3600,
                 crawler_mode=0,
                 same_origin=True,
                 dynamic_parse=False):
        """
        concurrent_num    : 并行crawler和fetcher数量
        crawl_tags        : 爬行时收集URL所属标签列表
        depth             : 爬行深度限制
        max_url_num       : 最大收集URL数量
        internal_timeout  : 内部调用超时时间
        spider_timeout    : 爬虫超时时间
        crawler_mode      : 爬取器模型(0:多线程模型,1:gevent模型)
        same_origin       : 是否限制相同域下
        dynamic_parse     : 是否使用WebKit动态解析
        """

        self.logger.setLevel(logging.DEBUG)
        hd = logging.StreamHandler()
        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        hd.setFormatter(formatter)
        self.logger.addHandler(hd)

        self.stopped = event.Event()
        self.internal_timer = Timeout(internal_timeout)

        self.crawler_mode = crawler_mode  # crawler model
        self.concurrent_num = concurrent_num
        self.fetcher_pool = pool.Pool(self.concurrent_num)
        if self.crawler_mode == 0:
            self.crawler_pool = threadpool.ThreadPool(
                min(50, self.concurrent_num))
        else:
            self.crawler_pool = pool.Pool(self.concurrent_num)

        #self.fetcher_queue = queue.JoinableQueue(maxsize=self.concurrent_num*100)
        self.fetcher_queue = threadpool.Queue(maxsize=self.concurrent_num *
                                              100)
        self.crawler_queue = threadpool.Queue(maxsize=self.concurrent_num *
                                              100)

        self.fetcher_cache = UrlCache()
        self.crawler_cache = UrlCache()

        self.default_crawl_tags = ['a', 'base', 'iframe', 'frame', 'object']
        self.ignore_ext = [
            'js', 'css', 'png', 'jpg', 'gif', 'bmp', 'svg', 'exif', 'jpeg',
            'exe', 'rar', 'zip'
        ]
        self.crawl_tags = list(set(self.default_crawl_tags) | set(crawl_tags))
        self.same_origin = same_origin
        self.depth = depth
        self.max_url_num = max_url_num
        self.dynamic_parse = dynamic_parse
        if self.dynamic_parse:
            self.webkit = WebKit()
        self.crawler_stopped = event.Event()
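
Example #2 keeps the same structure but drops the plugin, header, and login parameters and uses smaller queue bounds. For orientation, a hypothetical way to instantiate this constructor is sketched below; the class name `Spider` and the `feed_url()`/`start()` calls are assumptions for illustration only, since the snippet shows nothing beyond `__init__`.

# Hypothetical usage sketch; Spider, feed_url() and start() are assumed names.
spider = Spider(
    concurrent_num=10,
    crawl_tags=['img', 'link'],  # merged with the default tag list via set union
    depth=3,
    max_url_num=300,
    crawler_mode=1,              # 1: run the crawler side on a gevent pool
    same_origin=True,
    dynamic_parse=False,         # True would construct the WebKit helper
)
spider.feed_url('http://example.com/')  # assumed entry point
spider.start()                          # assumed entry point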
Example #3
File: vulcan.py Project: Flygend/vulcan
    def __init__(self,
                 concurrent_num=20,
                 crawl_tags=[],
                 custom_headers={},
                 plugin=[],
                 depth=3,
                 max_url_num=300,
                 internal_timeout=60,
                 spider_timeout=6 * 3600,
                 crawler_mode=0,
                 same_origin=True,
                 dynamic_parse=False):
        """
        concurrent_num    : 并行crawler和fetcher数量
        crawl_tags        : 爬行时收集URL所属标签列表
        custom_headers    : 自定义HTTP请求头
        plugin            : 自定义插件列表
        depth             : 爬行深度限制
        max_url_num       : 最大收集URL数量
        internal_timeout  : 内部调用超时时间
        spider_timeout    : 爬虫超时时间
        crawler_mode      : 爬取器模型(0:多线程模型,1:gevent模型)
        same_origin       : 是否限制相同域下
        dynamic_parse     : 是否使用WebKit动态解析
        """

        #   logging setup
        self.logger.setLevel(logging.DEBUG)  #   log level
        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s")  #   log format
        hd = logging.StreamHandler()
        hd.setFormatter(formatter)
        self.logger.addHandler(hd)

        self.stopped = event.Event()
        self.internal_timeout = internal_timeout  #   timeout for internal calls
        self.internal_timer = Timeout(internal_timeout)

        self.crawler_mode = crawler_mode  #   crawler model
        self.concurrent_num = concurrent_num  #   number of parallel crawlers and fetchers

        #   the fetcher uses the gevent model
        self.fetcher_pool = pool.Pool(self.concurrent_num)

        #   crawler model selection
        #   the crawler parses URLs out of HTML and feeds them to the fetcher;
        #   the fetcher downloads HTML and feeds it back to the crawler
        if self.crawler_mode == 0:
            #   thread-pool model
            self.crawler_pool = threadpool.ThreadPool(
                min(50, self.concurrent_num))
        else:
            #   gevent model
            self.crawler_pool = pool.Pool(self.concurrent_num)

        #   the fetcher and crawler work independently, without interfering, and are linked through queues
        # self.fetcher_queue = queue.JoinableQueue(maxsize=self.concurrent_num*100)
        self.fetcher_queue = threadpool.Queue(maxsize=self.concurrent_num *
                                              10000)
        self.crawler_queue = threadpool.Queue(maxsize=self.concurrent_num *
                                              10000)

        self.fetcher_cache = UrlCache()
        self.crawler_cache = UrlCache()

        self.default_crawl_tags = ['a', 'base', 'iframe', 'frame',
                                   'object']  #   default tags whose URLs are collected while crawling
        self.ignore_ext = [
            'js', 'css', 'png', 'jpg', 'gif', 'bmp', 'svg', 'exif', 'jpeg',
            'exe', 'rar', 'zip'
        ]  #   URL extensions ignored while crawling
        self.crawl_tags = list(set(self.default_crawl_tags)
                               | set(crawl_tags))  #   tags whose URLs are collected while crawling
        self.same_origin = same_origin  #   whether to restrict to the same origin
        self.depth = depth  #   crawl depth limit
        self.max_url_num = max_url_num  #   maximum number of URLs to collect
        self.dynamic_parse = dynamic_parse  #   whether to use WebKit dynamic parsing

        #   if dynamic parsing is enabled
        if self.dynamic_parse:
            self.webkit = WebKit()
        self.crawler_stopped = event.Event()

        self.plugin_handler = plugin  # register the plugins used by the Crawler
        #   custom HTTP headers
        self.custom_headers = custom_headers
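
The comments in this version spell out the architecture: the crawler parses URLs out of fetched HTML and pushes them to the fetcher, the fetcher downloads pages and pushes them back to the crawler, and the two sides are decoupled through the two queues. The sketch below is a simplified, self-contained illustration of that producer/consumer wiring using plain gevent pools and queues; the names and the placeholder download step are illustrative, not the project's implementation.

# Simplified illustration of the fetcher/crawler pipeline described above
# (plain gevent, illustrative names, placeholder download step).
import gevent
from gevent import pool
from gevent.queue import Queue

fetcher_queue = Queue()   # URLs waiting to be downloaded
crawler_queue = Queue()   # downloaded pages waiting to be parsed

def fetcher():
    while True:
        url = fetcher_queue.get()              # blocks until a URL arrives
        html = "<html>...%s...</html>" % url   # placeholder for a real HTTP GET
        crawler_queue.put((url, html))         # hand the page to the crawler side

def crawler():
    while True:
        url, html = crawler_queue.get()
        # A real crawler would parse <a>, <iframe>, ... tags here and push
        # newly discovered URLs back with fetcher_queue.put(new_url).
        print("parsed", url)

concurrent_num = 5
fetcher_pool = pool.Pool(concurrent_num)
crawler_pool = pool.Pool(concurrent_num)
for _ in range(concurrent_num):
    fetcher_pool.spawn(fetcher)
    crawler_pool.spawn(crawler)

fetcher_queue.put("http://example.com/")
gevent.sleep(1)  # let the greenlets run briefly for this demo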