Code Example #1
    def __init__(self, settings=None, spidercls=None):
        if isinstance(settings, dict) or settings is None:
            settings = Setting(settings)
        self.settings = settings

        self.lfs = load_object(self.settings['LOG_FORMATTER_CLASS'])
        self.lfm = self.lfs.from_settings(self.settings)

        logger.info(*self.lfm.crawled("CrawlerRunner", '', 'initialized'))

        self.spider_loder = []
        # Set of Crawler instances
        self._crawlers = set()
        # Set of Deferreds for the active crawls
        self._active = set()
        # Maximum number of child spiders
        self.MAX_CHILD_NUM = 9
        if spidercls:
            # Child spider class passed in directly
            self.spidercls = spidercls
        else:
            # Otherwise load the child spider class from settings
            self.spidercls = load_object(self.settings['SPIDER_CHILD_CLASS'])

        # Task-finished flag
        self.task_finish = False
        self.slot = None

        self.running = False
        self._task_schedule = queue.Queue()
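Every example in this collection resolves classes from dotted paths with load_object. A minimal sketch of such a loader, assuming the project's version (in test.framework.objectimport.loadobject) follows the usual importlib pattern:

from importlib import import_module

def load_object(path):
    # 'pkg.mod.Cls' -> module 'pkg.mod', attribute 'Cls'
    module_path, _, name = path.rpartition('.')
    if not module_path:
        raise ValueError("Not a full dotted path: %r" % path)
    module = import_module(module_path)
    try:
        return getattr(module, name)
    except AttributeError:
        raise NameError("Module %r has no attribute %r" % (module_path, name))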
Code Example #2
    def __init__(self, crawler, spider_closed_callback):
        # Grab the log formatter from the crawler
        self.lfm = crawler.logformatter
        # logger.debug("Engine initialized")
        logger.debug(
            *self.lfm.crawled("Spider", crawler.spider.name, 'initialized', 'Engine'))
        self.crawler = crawler
        self.settings = crawler.settings

        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        self.engine_name = None
        self.start_time = None
        self._closewait = None

        # Resolve the Scheduler class from settings
        self.scheduler_cls = load_object(self.settings["SCHEDULER"])
        # Likewise, resolve the Downloader class
        downloader_cls = load_object(self.settings["DOWNLOADER"])
        self.downloader = downloader_cls(crawler)
        self.scraper = Scraper(crawler)
        self.crawlling = []
        self._spider_closed_callback = spider_closed_callback

        self.flag = False
Code Example #3
    def __init__(self, tasks, settings=None, spidercls=None, name=None, logformat=None, middlewares=None):
        if isinstance(settings, dict) or settings is None:
            settings = Setting(settings)
        self.settings = settings
        self.middlewares = middlewares
        if logformat:
            self.lfm = logformat
        else:
            self.lfs = load_object(self.settings['LOG_FORMATTER_CLASS'])
            self.lfm = self.lfs.from_settings(self.settings)
        self.name = name if name else ''

        logger.info(*self.lfm.crawled(
            "CrawlerRunner", self.name, 'initialized'))
        if isinstance(tasks, dict):
            self.tasks = iter_dict(tasks)
        else:
            self.tasks = tasks
        self.spider_loder = []
        # Set of Crawler instances
        self._crawlers = set()
        # Set of Deferreds for the active crawls
        self._active = set()
        self._active_finish = False
        # Maximum number of child spiders (currently 4 whether or not a name was given)
        self.MAX_CHILD_NUM = 4
        # Whether child spider names are chosen from settings
        # self.SPIDER_NAME_CHOICE = self.settings['SPIDER_NAME_CHOICE']
        self.SPIDER_NAME_CHOICE = False

        # Maximum number of buffered addresses
        # self.MAX_SCHEDULE_NUM = 10
        if spidercls:
            # Child spider class passed in directly
            self.spidercls = spidercls
        else:
            # Otherwise load the child spider class from settings
            self.spidercls = load_object(self.settings['SPIDER_CHILD_CLASS'])

        # Task-finished flags
        # self.filter_task = FilterTask(self.SPIDER_NAME_CHOICE)
        # Maximum number of tasks loaded into the queue
        self.filter_task = 10
        self._push_task_finish = False
        self._pull_task_finish = False
        self._next_task = None
        self.fifer = FilterTask(settings)

        self.slot = None
        self._closewait = None

        self.running = False
        self._pause = False
        self._task_schedule = queue.Queue()
Code Example #4
    def __init__(self,
                 spidercls=None,
                 settings=None,
                 logformat=None,
                 father_name=None):
        self.crawling = False
        self.spider = None
        self.engine = None

        # The spider class itself is passed in, not its name
        self.spidercls = spidercls

        self.father_name = father_name
        self.settings = settings.copy()

        # Build the log formatter from settings, or reuse the one passed in
        if not logformat:
            lf_cls = load_object(self.settings['LOG_FORMATTER_CLASS'])
            self.logformatter = lf_cls.from_crawler(self)
        else:
            self.logformatter = logformat
        logger.debug(
            *self.logformatter.crawled("Spider", 'None', 'initialized', "Crawler"))

        self.spidercls.update_settings(self.settings)
        d = dict(overridden_or_new_settings(self.settings))
        if d:
            logger.info(*self.logformatter.crawled(
                "Spider", 'None', "Overridden or new settings:\n {settings}".format(
                    settings=d), "Crawler"))
Code Example #5
    def __init__(self, crawler):
        self.settings = crawler.settings
        self.lfm = crawler.logformatter
        #  If load_object returns a bare class whose __init__ takes arguments,
        #  calling it without them raises: XXX missing X required positional argument
        # logger.debug("Downloader initialized...")
        logger.debug(*self.lfm.crawled("Spider", crawler.spider.name,
                                       'initialized', 'Downloader'))
        self.handler = load_object(self.settings["DOWNLOAD_HANDLER"])(self.lfm, self.settings)
        self.spider = None
        self.slots = {}
        # active records the set of requests currently being downloaded.
        self.active = set()
        # Overall concurrency limit from settings
        self.total_concurrency = self.settings.getint('CONCURRENT_REQUESTS')
        # Per-domain concurrency limit
        self.domain_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
        # Per-IP concurrency limit
        self.ip_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_IP')
        # Randomize the download delay (defaults to True)
        self.randomize_delay = self.settings.getbool('RANDOMIZE_DOWNLOAD_DELAY')

        # Initialize the downloader middleware, reusing a shared instance if one exists
        if crawler.middlewares.get('DownloaderMiddlewareManager'):
            self.middleware = crawler.middlewares['DownloaderMiddlewareManager']
        else:
            self.middleware = DownloaderMiddlewareManager.from_crawler(crawler)
            crawler.middlewares['DownloaderMiddlewareManager'] = self.middleware

        # task.LoopingCall installs a 60-second heartbeat, _slot_gc, which periodically reclaims idle objects in slots.
        self._slot_gc_loop = task.LoopingCall(self._slot_gc)
        self._slot_gc_loop.start(60)
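The heartbeat comment above maps to Twisted's real task.LoopingCall API. A self-contained sketch of the same pattern (the _slot_gc body and the idle flag are hypothetical stand-ins):

from twisted.internet import task, reactor

slots = {}

def _slot_gc():
    # Reclaim slots that report themselves idle (hypothetical predicate).
    for key in [k for k, s in slots.items() if getattr(s, 'idle', False)]:
        del slots[key]

gc_loop = task.LoopingCall(_slot_gc)
gc_loop.start(60)  # run _slot_gc every 60 seconds; the first call fires immediately
reactor.callLater(180, reactor.stop)  # stop the demo after three minutes
reactor.run()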
Code Example #6
File: test_engine_01.py Project: lalacat/crawler
    def __init__(self, crawler, spider_closed_callback):
        logger.debug("引擎初始化")
        self.crawler = crawler
        self.settings = crawler.settings
        # 获取log的格式
        self.logformatter = crawler.logformatter

        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False

        # Resolve the Scheduler class from settings
        self.scheduler_cls = load_object(self.settings["SCHEDULER"])
        # Likewise, resolve the Downloader class
        downloader_cls = load_object(self.settings["DOWNLOADER"])
        self.downloader = downloader_cls(crawler)
        self.crawlling = []
        self._spider_closed_callback = spider_closed_callback

        self.flag = False
Code Example #7
def _get_spider_loader(settings):
    cls_path = settings["SPIDER_MANAGER_CLASS"]
    loader_cls = load_object(cls_path)
    try:
        verifyClass(ISpiderLoader, loader_cls)

    except AttributeError as e:
        logger.warning("Interface methods not fully implemented: %s", e)

    except DoesNotImplement:
        logger.warning("Failed to load the spider loader; check that the "
                       "loader class path is configured correctly in the settings")

    return loader_cls.from_settings(settings)
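For verifyClass to pass, the loaded class must declare that it implements ISpiderLoader. A minimal sketch with zope.interface; the two interface methods shown are assumptions about what ISpiderLoader actually requires:

from zope.interface import Interface, implementer
from zope.interface.verify import verifyClass

class ISpiderLoader(Interface):
    def from_settings(settings):
        """Build a loader from the settings object."""

    def load(spider_name):
        """Return the spider class registered under spider_name."""

@implementer(ISpiderLoader)
class DictSpiderLoader(object):
    def __init__(self, spiders):
        self._spiders = spiders  # mapping: name -> spider class

    @classmethod
    def from_settings(cls, settings):
        return cls(settings.get('SPIDERS', {}))

    def load(self, spider_name):
        return self._spiders[spider_name]

verifyClass(ISpiderLoader, DictSpiderLoader)  # passes; raises on a mismatch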
Code Example #8
 def from_settings(cls, settings, crawler=None):
     """
     从default settings中加载默认的中间件
     :param settings:
     :param crawler:
     :return:
     """
     cls.lfm = crawler.logformatter
     try:
         mwlist = cls._get_mwlist_from_settings(settings)
         middlewares = []
         clsnames = []
         enabled = []
         for clspath in mwlist:
             try:
                 clsname = clspath.split('.')[-1]
                 mwcls = load_object(clspath)
                 #  The two hasattr checks decide whether mwcls (when it is a
                 #  class) is built from the crawler or from the settings
                 # Instantiate every middleware
                 if crawler and hasattr(mwcls, 'from_crawler'):
                     mw = mwcls.from_crawler(crawler)
                 elif hasattr(mwcls, 'from_settings'):
                     mw = mwcls.from_settings(settings)
                 else:
                     mw = mwcls
                 middlewares.append(mw)
                 enabled.append(clspath)
                 clsnames.append(clsname)
             except Exception as e:
                 if e.args:
                     args = {'clsname': clsname, 'eargs': e.args[0]}
                     logger.warning(*cls.lfm.crawled(
                         "Middleware", cls.component_name,
                         'disabled: {clsname}: {eargs}'.format(**args)))
         if len(middlewares) != len(clsnames):
             raise ImportError("中间件载入不完整")
         if middlewares and clsnames:
             # logger.info("生效%(componentname)ss的Middleware :\n%(enabledlist)s",
             #             {'componentname': cls.component_name,
             #              'enabledlist': pprint.pformat(enabled)},
             #             extra={'crawler': crawler})
             for mw in enabled:
                 logger.info(*cls.lfm.crawled(
                     "Middleware", cls.component_name, 'Enabled middleware: %s' % mw))
         return cls(clsnames, middlewares)
     except Exception as e:
         logger.error(*cls.lfm.error("Middleware",
                                     cls.component_name,
                                     function=None,
                                     msg=e),
                      exc_info=True)
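The from_crawler/from_settings fallback inside the loop above is the heart of the pattern. Isolated, it looks like this (build_component is a hypothetical helper name, not part of the project):

def build_component(mwcls, settings, crawler=None):
    # Prefer the crawler-aware constructor, then the settings-based one,
    # and fall back to using the loaded object as-is.
    if crawler is not None and hasattr(mwcls, 'from_crawler'):
        return mwcls.from_crawler(crawler)
    if hasattr(mwcls, 'from_settings'):
        return mwcls.from_settings(settings)
    return mwcls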
Code Example #9
File: crawler.py Project: lalacat/crawler
    def __init__(self, spidercls=None, settings=None, logformat=None, crawlerRunner=None, middlewares=None):
        self.crawling = False
        self.spider = None
        self.engine = None

        # The spider class itself is passed in, not its name
        self.spidercls = spidercls
        self.settings = settings.copy()
        self.crawlerRunner = crawlerRunner
        if crawlerRunner:
            self.father_name = crawlerRunner.name
        else:
            self.father_name = None

        # Build the log formatter from settings, or reuse the one passed in
        if not logformat:
            lf_cls = load_object(self.settings['LOG_FORMATTER_CLASS'])
            self.logformatter = lf_cls.from_crawler(self)
        else:
            self.logformatter = logformat
        logger.debug(*self.logformatter.crawled(
            "Crawler", 'None', 'initialized'))

        self.build_time = time.perf_counter()  # time.clock() was removed in Python 3.8
        self.middlewares = middlewares if middlewares is not None else dict()
        if self.middlewares:
            if not isinstance(middlewares, dict):
                logger.warning(*self.logformatter.crawled(
                    'Crawler', '',
                    'Inherited middlewares are not in dict form'))
            else:
                logger.info(*self.logformatter.crawled(
                    'Crawler', '',
                    'Inheriting middlewares'))
        self.spidercls.update_settings(self.settings)
        d = dict(overridden_or_new_settings(self.settings))
        if d:
            for k, v in d.items():
                logger.info(*self.logformatter.crawled(
                    "Spider", 'None',
                    "Overridden or new setting: %s: %s" % (k, v),
                    "Crawler"))
Code Example #10
import os, logging
from test.framework.objectimport.loadobject import load_object
from test.framework.setting import Setting


class A(object):
    def __init__(self):
        self.flags = "Test"
        self.name = "name"
        self.status = "500"


rq = A()  # stands in for the request
rp = A()  # stands in for the response
sp = A()  # stands in for the spider
s = Setting()
lf = load_object(s['LOG_FORMATTER'])
logfor = lf.from_crawler(rq)
strs = logfor.crawled(rq, rp, sp)
print(strs)
Code Example #11
File: http10.py Project: lalacat/crawler
 def __init__(self, settings):
     self.HTTPClientFactory = load_object(
         settings['DOWNLOADER_HTTPCLIENTFACTORY'])
     self.ClientContextFactory = load_object(
         settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
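Both factory classes are resolved from dotted paths at construction time. A hedged illustration of plausible setting values (the paths follow Scrapy's layout and are assumptions, not the project's verified defaults, and HTTP10DownloadHandler is a hypothetical name for the class this __init__ belongs to):

settings = {
    'DOWNLOADER_HTTPCLIENTFACTORY':
        'scrapy.core.downloader.webclient.ScrapyHTTPClientFactory',  # assumed path
    'DOWNLOADER_CLIENTCONTEXTFACTORY':
        'scrapy.core.downloader.contextfactory.ScrapyClientContextFactory',  # assumed path
}
handler = HTTP10DownloadHandler(settings)  # hypothetical class name from http10.py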