def __init__(self, settings=None, spidercls=None):
    if isinstance(settings, dict) or settings is None:
        settings = Setting(settings)
    self.settings = settings
    self.lfs = load_object(self.settings['LOG_FORMATTER_CLASS'])
    self.lfm = self.lfs.from_settings(self.settings)
    logger.info(*self.lfm.crawled("CrawlerRunner", '', 'initialized'))
    self.spider_loder = []
    # set of Crawler objects
    self._crawlers = set()
    # set of Deferreds
    self._active = set()
    # maximum number of child spiders
    self.MAX_CHILD_NUM = 9
    if spidercls:
        # child-spider class passed in directly
        self.spidercls = spidercls
    else:
        # otherwise import the child-spider class from the settings
        self.spidercls = load_object(self.settings['SPIDER_CHILD_CLASS'])
    # flag marking that all tasks are finished
    self.task_finish = False
    self.slot = None
    self.running = False
    self._task_schedule = queue.Queue()
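
# A minimal standalone sketch (not from this repo) of the queue.Queue
# hand-off that _task_schedule is used for: one side pushes tasks, the other
# pulls them; queue.Queue is thread-safe out of the box.
import queue

schedule = queue.Queue()
schedule.put({'url': 'http://example.com'})  # producer side
item = schedule.get()                        # consumer side; blocks while empty
schedule.task_done()                         # mark the pulled task as handled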
def __init__(self, crawler, spider_closed_callback):
    # log formatter taken from the crawler
    self.lfm = crawler.logformatter
    logger.debug(*self.lfm.crawled(
        "Spider", crawler.spider.name, 'initialized', 'Engine'))
    self.crawler = crawler
    self.settings = crawler.settings
    self.slot = None
    self.spider = None
    self.running = False
    self.paused = False
    self.engine_name = None
    self.start_time = None
    self._closewait = None
    # look up the Scheduler class configured in the settings
    self.scheduler_cls = load_object(self.settings["SCHEDULER"])
    # likewise, look up the Downloader class
    downloader_cls = load_object(self.settings["DOWNLOADER"])
    self.downloader = downloader_cls(crawler)
    self.scraper = Scraper(crawler)
    self.crawlling = []
    self._spider_closed_callback = spider_closed_callback
    self.flag = False
def __init__(self, tasks, settings=None, spidercls=None, name=None,
             logformat=None, middlewares=None):
    if isinstance(settings, dict) or settings is None:
        settings = Setting(settings)
    self.settings = settings
    self.middlewares = middlewares
    if logformat:
        self.lfm = logformat
    else:
        self.lfs = load_object(self.settings['LOG_FORMATTER_CLASS'])
        self.lfm = self.lfs.from_settings(self.settings)
    self.name = name if name else ''
    logger.info(*self.lfm.crawled("CrawlerRunner", self.name, 'initialized'))
    if isinstance(tasks, dict):
        self.tasks = iter_dict(tasks)
    else:
        self.tasks = tasks
    self.spider_loder = []
    # set of Crawler objects
    self._crawlers = set()
    # set of Deferreds
    self._active = set()
    self._active_finish = False
    # maximum number of child spiders
    self.MAX_CHILD_NUM = 4
    # whether spider names are picked from the settings
    # self.SPIDER_NAME_CHOICE = self.settings['SPIDER_NAME_CHOICE']
    self.SPIDER_NAME_CHOICE = False
    # maximum number of buffered addresses
    # self.MAX_SCHEDULE_NUM = 10
    if spidercls:
        # child-spider class passed in directly
        self.spidercls = spidercls
    else:
        # otherwise import the child-spider class from the settings
        self.spidercls = load_object(self.settings['SPIDER_CHILD_CLASS'])
    # self.filter_task = FilterTask(self.SPIDER_NAME_CHOICE)
    # maximum number of tasks held in the schedule queue
    self.filter_task = 10
    # task-completion flags
    self._push_task_finish = False
    self._pull_task_finish = False
    self._next_task = None
    self.fifer = FilterTask(settings)
    self.slot = None
    self._closewait = None
    self.running = False
    self._pause = False
    self._task_schedule = queue.Queue()
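
# iter_dict is not shown in these snippets; the following is a hypothetical
# sketch only, assuming it flattens a {name: payload(s)} task dict into an
# iterator of (name, payload) pairs, as the isinstance(tasks, dict) branch
# above suggests.
def iter_dict(tasks):
    for name, payloads in tasks.items():
        if isinstance(payloads, (list, tuple, set)):
            for payload in payloads:
                yield name, payload
        else:
            yield name, payloads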
def __init__(self, spidercls=None, settings=None, logformat=None, father_name=None):
    self.crawling = False
    self.spider = None
    self.engine = None
    # the spider module itself is passed in, not its name
    self.spidercls = spidercls
    self.father_name = father_name
    self.settings = settings.copy()
    # resolve the log formatter
    if not logformat:
        lf_cls = load_object(self.settings['LOG_FORMATTER_CLASS'])
        self.logformatter = lf_cls.from_crawler(self)
    else:
        self.logformatter = logformat
    logger.debug(*self.logformatter.crawled(
        "Spider", 'None', 'initialized', "Crawler"))
    self.spidercls.update_settings(self.settings)
    d = dict(overridden_or_new_settings(self.settings))
    if d:
        logger.info(*self.logformatter.crawled(
            "Spider", 'None',
            "settings added or overridden:\n {settings}".format(settings=d),
            "Crawler"))
def __init__(self, crawler):
    self.settings = crawler.settings
    self.lfm = crawler.logformatter
    # Loading a bare class without arguments fails later if its __init__
    # requires parameters:
    # TypeError: __init__() missing X required positional argument
    # so the handler is instantiated right away with its dependencies.
    logger.debug(*self.lfm.crawled(
        "Spider", crawler.spider.name, 'initialized', 'Downloader'))
    self.handler = load_object(self.settings["DOWNLOAD_HANDLER"])(self.lfm, self.settings)
    self.spider = None
    self.slots = {}
    # active records the requests currently being downloaded
    self.active = set()
    # total concurrency limit from the settings
    self.total_concurrency = self.settings.getint('CONCURRENT_REQUESTS')
    # per-domain concurrency limit
    self.domain_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
    # per-IP concurrency limit
    self.ip_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_IP')
    # randomize the download delay (defaults to True)
    self.randomize_delay = self.settings.getbool('RANDOMIZE_DOWNLOAD_DELAY')
    # initialize the downloader middleware, reusing an inherited instance if present
    if crawler.middlewares.get('DownloaderMiddlewareManager'):
        self.middleware = crawler.middlewares['DownloaderMiddlewareManager']
    else:
        self.middleware = DownloaderMiddlewareManager.from_crawler(crawler)
        crawler.middlewares['DownloaderMiddlewareManager'] = self.middleware
    # task.LoopingCall installs a 60s heartbeat, _slot_gc, which periodically
    # garbage-collects idle entries in self.slots
    self._slot_gc_loop = task.LoopingCall(self._slot_gc)
    self._slot_gc_loop.start(60)
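
# For reference, task.LoopingCall is Twisted's recurring-timer helper; a
# minimal standalone sketch of the 60-second heartbeat pattern used above
# (the heartbeat body is a stand-in, not the real _slot_gc):
from twisted.internet import reactor, task

def heartbeat():
    print("gc tick")  # stands in for Downloader._slot_gc

loop = task.LoopingCall(heartbeat)
loop.start(60, now=False)             # fire every 60s; skip the immediate call
reactor.callLater(180, reactor.stop)  # demo only: stop after three ticks
reactor.run()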
def __init__(self, crawler, spider_closed_callback):
    logger.debug("Engine initialized")
    self.crawler = crawler
    self.settings = crawler.settings
    # log formatter taken from the crawler
    self.logformatter = crawler.logformatter
    self.slot = None
    self.spider = None
    self.running = False
    self.paused = False
    # look up the Scheduler class configured in the settings
    self.scheduler_cls = load_object(self.settings["SCHEDULER"])
    # likewise, look up the Downloader class
    downloader_cls = load_object(self.settings["DOWNLOADER"])
    self.downloader = downloader_cls(crawler)
    self.crawlling = []
    self._spider_closed_callback = spider_closed_callback
    self.flag = False
def _get_spider_loader(settings):
    cls_path = settings["SPIDER_MANAGER_CLASS"]
    loader_cls = load_object(cls_path)
    try:
        verifyClass(ISpiderLoader, loader_cls)
    except AttributeError as e:
        logger.warning("interface method not fully implemented: %s", e)
    except DoesNotImplement:
        logger.warning("failed to load the spider loader; check that "
                       "SPIDER_MANAGER_CLASS in the settings points to a "
                       "valid loader class")
    return loader_cls.from_settings()
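
# A minimal standalone sketch of the zope.interface check used above:
# verifyClass raises (e.g. DoesNotImplement) rather than returning False,
# which is why the call sits in a try/except. Class names are illustrative.
from zope.interface import Interface, implementer
from zope.interface.exceptions import DoesNotImplement
from zope.interface.verify import verifyClass

class ISpiderLoader(Interface):
    def from_settings(settings):
        """Build the loader from a settings object."""

@implementer(ISpiderLoader)
class GoodLoader(object):
    @classmethod
    def from_settings(cls, settings):
        return cls()

verifyClass(ISpiderLoader, GoodLoader)  # passes silently

class BadLoader(object):
    pass

try:
    verifyClass(ISpiderLoader, BadLoader)
except DoesNotImplement:
    print("BadLoader does not declare ISpiderLoader")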
def from_settings(cls, settings, crawler=None):
    """
    Load the default middlewares from the settings.
    :param settings:
    :param crawler:
    :return:
    """
    cls.lfm = crawler.logformatter
    try:
        mwlist = cls._get_mwlist_from_settings(settings)
        middlewares = []
        clsnames = []
        enabled = []
        for clspath in mwlist:
            try:
                clsname = clspath.split('.')[-1]
                mwcls = load_object(clspath)
                # decide whether the middleware class is built from the
                # crawler or from the settings, then instantiate it
                if crawler and hasattr(mwcls, 'from_crawler'):
                    mw = mwcls.from_crawler(crawler)
                elif hasattr(mwcls, 'from_settings'):
                    mw = mwcls.from_settings(settings)
                else:
                    mw = mwcls()
                middlewares.append(mw)
                enabled.append(clspath)
                clsnames.append(clsname)
            except Exception as e:
                if e.args:
                    args = {'clsname': clsname, 'eargs': e.args[0]}
                    logger.warning(*cls.lfm.crawled(
                        "Middleware", cls.component_name,
                        'not enabled: {clsname}: {eargs}'.format(**args)))
        if len(middlewares) != len(clsnames):
            raise ImportError("middleware loading incomplete")
        if middlewares and clsnames:
            for mw in enabled:
                logger.info(*cls.lfm.crawled(
                    "Middleware", cls.component_name,
                    'enabled middleware: %s' % mw))
        return cls(clsnames, middlewares)
    except Exception as e:
        logger.error(*cls.lfm.error("Middleware", cls.component_name,
                                    function=None, msg=e),
                     exc_info=True)
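
# The instantiation convention above mirrors Scrapy's: prefer from_crawler,
# fall back to from_settings, else call the class with no arguments. A hedged
# standalone sketch (the helper name is ours, not this repo's):
def build_component(mwcls, settings, crawler=None):
    if crawler is not None and hasattr(mwcls, 'from_crawler'):
        return mwcls.from_crawler(crawler)
    if hasattr(mwcls, 'from_settings'):
        return mwcls.from_settings(settings)
    return mwcls()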
def __init__(self, spidercls=None, settings=None, logformat=None,
             crawlerRunner=None, middlewares=None):
    self.crawling = False
    self.spider = None
    self.engine = None
    # the spider module itself is passed in, not its name
    self.spidercls = spidercls
    self.settings = settings.copy()
    self.crawlerRunner = crawlerRunner
    if crawlerRunner:
        self.father_name = crawlerRunner.name
    else:
        self.father_name = None
    # resolve the log formatter
    if not logformat:
        lf_cls = load_object(self.settings['LOG_FORMATTER_CLASS'])
        self.logformatter = lf_cls.from_crawler(self)
    else:
        self.logformatter = logformat
    logger.debug(*self.logformatter.crawled("Crawler", 'None', 'initialized'))
    self.build_time = time.clock()  # note: time.clock() was removed in Python 3.8; time.perf_counter() is the replacement
    self.middlewares = middlewares if middlewares is not None else dict()
    if self.middlewares:
        if not isinstance(middlewares, dict):
            logger.warning(*self.logformatter.crawled(
                'Crawler', '', 'inherited middlewares are not a dict'))
        else:
            logger.info(*self.logformatter.crawled(
                'Crawler', '', 'inheriting middlewares'))
    self.spidercls.update_settings(self.settings)
    d = dict(overridden_or_new_settings(self.settings))
    if d:
        for k, v in d.items():
            logger.info(*self.logformatter.crawled(
                "Spider", 'None',
                "settings added or overridden: %s: %s" % (k, v),
                "Crawler"))
from test.framework.objectimport.loadobject import load_object
from test.framework.setting import Setting


# Minimal stand-in object carrying the attributes the log formatter reads.
class A(object):
    def __init__(self):
        self.flags = "Test"
        self.name = "name"
        self.status = "500"


rq = A()  # stand-in request
rp = A()  # stand-in response
sp = A()  # stand-in spider

s = Setting()
lf = load_object(s['LOG_FORMATTER'])
logfor = lf.from_crawler(rq)
strs = logfor.crawled(rq, rp, sp)
print(strs)
def __init__(self, settings):
    self.HTTPClientFactory = load_object(
        settings['DOWNLOADER_HTTPCLIENTFACTORY'])
    self.ClientContextFactory = load_object(
        settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
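
# load_object is used throughout these snippets; a minimal sketch of what it
# presumably does (resolve a dotted path to an attribute, like Scrapy's
# scrapy.utils.misc.load_object):
import importlib

def load_object(path):
    module_path, _, name = path.rpartition('.')
    module = importlib.import_module(module_path)
    return getattr(module, name)

# e.g. load_object('collections.OrderedDict') returns the OrderedDict class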