def __init__(self, settings=None, spidercls=None):
    """Initialize the crawler runner.

    :param settings: a Setting instance, a plain dict, or None; dicts and
        None are wrapped in ``Setting``.
    :param spidercls: optional child-spider class; when omitted, the class
        is loaded from ``settings['SPIDER_CHILD_CLASS']``.
    """
    if isinstance(settings, dict) or settings is None:
        settings = Setting(settings)
    self.settings = settings
    # Build the log formatter declared in settings.
    self.lfs = load_object(self.settings['LOG_FORMATTER_CLASS'])
    self.lfm = self.lfs.from_settings(self.settings)
    logger.info(*self.lfm.crawled("CrawlerRunner", '', '已初始化'))
    self.spider_loder = []
    # set of Crawler objects
    self._crawlers = set()
    # set of Deferreds for active crawls
    self._active = set()
    # maximum number of child spiders
    self.MAX_CHILD_NUM = 9
    # BUG FIX: original read `if not spidercls: self.spidercls = spidercls`,
    # which stored None when no class was passed and loaded from settings
    # when one WAS passed — the opposite of the intent. Condition inverted
    # to match the sibling constructor (see the tasks-based __init__).
    if spidercls:
        # child-spider class supplied by the caller
        self.spidercls = spidercls
    else:
        # load the child-spider class from settings
        self.spidercls = load_object(self.settings['SPIDER_CHILD_CLASS'])
    # task-finished flag
    self.task_finish = False
    self.slot = None
    self.running = False
    self._task_schedule = queue.Queue()
def __init__(self, tasks, settings=None, spidercls=None, name=None, logformat=None, middlewares=None):
    """Initialize the task-driven crawler runner.

    :param tasks: an iterable of tasks, or a dict (converted via ``iter_dict``).
    :param settings: a Setting instance, a plain dict, or None; dicts and
        None are wrapped in ``Setting``.
    :param spidercls: optional child-spider class; when omitted, loaded from
        ``settings['SPIDER_CHILD_CLASS']``.
    :param name: optional runner name used in log output (defaults to '').
    :param logformat: optional pre-built log formatter; when omitted, one is
        built from ``settings['LOG_FORMATTER_CLASS']``.
    :param middlewares: optional middleware collection, stored as-is.
    """
    if isinstance(settings, dict) or settings is None:
        settings = Setting(settings)
    self.settings = settings
    self.middlewares = middlewares
    if logformat:
        self.lfm = logformat
    else:
        self.lfs = load_object(self.settings['LOG_FORMATTER_CLASS'])
        self.lfm = self.lfs.from_settings(self.settings)
    self.name = name if name else ''
    logger.info(*self.lfm.crawled(
        "CrawlerRunner",
        self.name,
        '已初始化')
    )
    if isinstance(tasks, dict):
        self.tasks = iter_dict(tasks)
    else:
        self.tasks = tasks
    self.spider_loder = []
    # set of Crawler objects
    self._crawlers = set()
    # set of Deferreds for active crawls
    self._active = set()
    self._active_finish = False
    # Maximum number of child spiders.
    # CLEANUP: the original `if self.name:` conditional assigned 4 in BOTH
    # branches; the dead branch was removed without changing behavior.
    self.MAX_CHILD_NUM = 4
    # Whether spider names are chosen explicitly.
    # NOTE(review): originally read from settings['SPIDER_NAME_CHOICE']
    # (now commented out) — confirm the hard-coded False is intentional.
    self.SPIDER_NAME_CHOICE = False
    if spidercls:
        # child-spider class supplied by the caller
        self.spidercls = spidercls
    else:
        # load the child-spider class from settings
        self.spidercls = load_object(self.settings['SPIDER_CHILD_CLASS'])
    # Maximum number of tasks to keep queued.
    # NOTE(review): the name suggests a filter object (an earlier revision
    # assigned FilterTask(...) here) but it now holds an int — confirm.
    self.filter_task = 10
    self._push_task_finish = False
    self._pull_task_finish = False
    self._next_task = None
    self.fifer = FilterTask(settings)
    self.slot = None
    self._closewait = None
    self.running = False
    self._pause = False
    self._task_schedule = queue.Queue()
def __init__(self, settings=None):
    """Set up the runner: normalize settings and create empty tracking sets.

    :param settings: a Setting instance, a plain dict, or None; dicts and
        None are wrapped in ``Setting``.
    """
    normalized = settings
    if normalized is None or isinstance(normalized, dict):
        normalized = Setting(normalized)
    self.settings = normalized
    logger.debug(type(self.settings))
    # Spider loader built from the (possibly un-normalized) settings argument.
    self.spider_loder = _get_spider_loader(settings)
    # set of Crawler objects
    self._crawlers = set()
    # set of Deferreds for active crawls
    self._active = set()
def __init__(self, tasks, settings=None): if isinstance(settings, dict) or settings is None: settings = Setting(settings) self.settings = settings self.spider_loder = [] # 装载的是Crawler的集合 self._crawlers = set() # 装载的是defer的集合 self._active = set() # 子爬虫的数量 self._childNum = 3 self.task_schedule = tasks self.task_finish = False self.slot = None self.running = False self.searchRes = tasks self.child_task = queue.Queue()
import logging
import pprint
from test.framework.log.log import LogFormat
from test.framework.setting import Setting
from test.framework.log.logfilter import ErrorFilter

# Manual smoke test for LogFormat: builds a formatter from Setting and emits
# a few log records with structured args plus an `extra` dict.
s = Setting()
a = LogFormat(s)
# Create a logger instance.
logger = logging.getLogger("lala")
logger.debug(*a.crawled("Spider", "lala", '出现出现错误',
                        {'function': 'Scraper', 'request': "baba", 'time': 333.333}),
             extra={'extra_info': 'error'})
logger.info(*a.crawled("Spider", 'works', 'info',
                       {'time': 6.7777777,
                        'function': "engine"}),
            extra={"extra_info": " inprogress中还剩下{:d}个任务".format(3)})
# BUG FIX: this final call was truncated mid-argument in the original
# (unclosed dict and parentheses -> SyntaxError). Closed following the
# pattern of the debug call above. TODO confirm any trailing arguments
# (e.g. an `extra=` kwarg) that were lost in the truncation.
logger.info(*a.crawled("Spider", "lala", '出现出现错误',
                       {'function': 'Scraper',
                        'request': "baba"}))
def request_errback(content):
    # Errback for the download Deferred: print the failure and pass it
    # through unchanged so later callbacks still see it.
    print("request_and_response errback")
    print(content)
    return content

def agent_print(content):
    # Success callback: print the response object and its type for
    # manual inspection. Returns None, so the chain continues with None.
    print("agent_print")
    print(type(content))
    print(content)

# Manual smoke test: download one request through HTTPDownloadHandler and
# run the twisted reactor until the Deferred fires.
# NOTE(review): `url`, `headers`, and `request_callback` are not defined in
# this view — they must come from earlier in the file.
request = Request(url=url, callback=request_callback, method='get', headers=headers, errback=request_errback, meta={"download_timeout": 2})
settings = Setting()
# NOTE(review): update_settings is used here as if it returns a spider
# instance; many such APIs mutate settings and return None — confirm.
spider = Spider1.update_settings(settings)
httphandler = HTTPDownloadHandler(settings)
agent = httphandler.download_request(request, spider)
agent.addCallback(agent_print)
agent.addErrback(request_errback)
# Stop the reactor regardless of success or failure.
agent.addBoth(lambda _: reactor.stop())
reactor.run()
from test.framework.core.crawler import _get_spider_loader
from test.framework.setting import Setting, overridden_or_new_settings
import logging

# Manual smoke test: copy the base settings, inject two project-level
# overrides, and log which settings differ from the defaults.
logging.basicConfig(level=logging.DEBUG)
s = Setting()
_active = set()
new_s = s.copy()
cls = _get_spider_loader(new_s)
# "NAME = value" strings; split on the first '=' only so values may
# themselves contain '='.
new_and_over_setting = ["TEST2 = t2", "TEST3 = t3"]
temp = [x.split('=', 1) for x in new_and_over_setting]
tm = dict(temp)
for k, v in tm.items():
    # NOTE(review): only the key is stripped; the value keeps its leading
    # space (" t2") — confirm that is acceptable downstream.
    new_s.set(k.strip(), v, "project")
d = dict(overridden_or_new_settings(new_s))
logging.info("Overridden settings:\n %(settings)r", {'settings': d})
# NOTE(review): the triple-quote below opens a string literal that is never
# closed in this view — the tail looks like commented-out code and the file
# is a SyntaxError as seen here; the closing quotes may lie past this chunk.
''' for name, module in cls._spiders.items(): module.update_settings(new_s) overridden_or_new_settings(new_s)