def from_crawler(cls, crawler):
    settings = crawler.settings
    # ------------------------dupefilter-------------------------
    dupefilter_cls = load_object(
        settings.get("DUPEFILTER_CLASS",
                     default_settings.SCHEDULER_DUPEFILTER_CLASS))
    dupefilter = create_instance(dupefilter_cls, None, crawler)
    server = client_from_settings(settings)
    # ------------------------scheduler-------------------------
    scheduler_queue_class = load_object(
        default_settings.SCHEDULER_PRIORITY_QUEUE)
    scheduler_queue_key = settings.get(
        "SCHEDULER_QUEUE_KEY",
        default_settings.SCHEDULER_QUEUE_KEY)
    scheduler_serializer = load_object(
        settings.get("SCHEDULER_QUEUE_SERIALIZER",
                     default_settings.SCHEDULER_QUEUE_SERIALIZER))
    # TODO: Make scheduler_queue_key more diverse
    queue = create_instance(
        scheduler_queue_class,
        None,
        crawler,
        server,
        str(scheduler_queue_key % {'spider': crawler.spider.name}),
        scheduler_serializer)
    scheduler_queue_pop_timeout = settings.get(
        "SCHEDULER_QUEUE_POP_TIMEOUT",
        default_settings.SCHEDULER_QUEUE_POP_TIMEOUT)
    scheduler_clear_queue_at_open = settings.get(
        "SCHEDULER_CLEAR_QUEUE_AT_OPEN",
        default_settings.SCHEDULER_CLEAR_QUEUE_AT_OPEN)
    return cls(dupefilter,
               queue=queue,
               stats=crawler.stats,
               crawler=crawler,
               scheduler_queue_pop_timeout=scheduler_queue_pop_timeout,
               scheduler_clear_queue_at_open=scheduler_clear_queue_at_open)
def load_context_factory_from_settings(settings, crawler):
    ssl_method = openssl_methods[settings.get('DOWNLOADER_CLIENT_TLS_METHOD')]
    context_factory_cls = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
    # try method-aware context factory
    try:
        context_factory = create_instance(
            objcls=context_factory_cls,
            settings=settings,
            crawler=crawler,
            method=ssl_method,
        )
    except TypeError:
        # use context factory defaults
        context_factory = create_instance(
            objcls=context_factory_cls,
            settings=settings,
            crawler=crawler,
        )
        msg = (
            f"{settings['DOWNLOADER_CLIENTCONTEXTFACTORY']} does not accept "
            "a `method` argument (type OpenSSL.SSL method, e.g. "
            "OpenSSL.SSL.SSLv23_METHOD) and/or a `tls_verbose_logging` "
            "argument and/or a `tls_ciphers` argument. Please, upgrade your "
            "context factory class to handle them or ignore them."
        )
        warnings.warn(msg)

    return context_factory
def __init__(self, settings):
    self._pool = HTTPConnectionPool(reactor, persistent=True)
    self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
    self._pool._factory.noisy = False

    self._sslMethod = openssl_methods[settings.get('DOWNLOADER_CLIENT_TLS_METHOD')]
    self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
    # try method-aware context factory
    try:
        self._contextFactory = create_instance(
            self._contextFactoryClass,
            settings=settings,
            crawler=None,
            method=self._sslMethod,
        )
    except TypeError:
        # use context factory defaults
        self._contextFactory = create_instance(
            self._contextFactoryClass,
            settings=settings,
            crawler=None,
        )
        msg = (
            "'%s' does not accept `method` argument (type OpenSSL.SSL method, "
            "e.g. OpenSSL.SSL.SSLv23_METHOD) and/or `tls_verbose_logging` "
            "argument and/or `tls_ciphers` argument. Please upgrade your "
            "context factory class to handle them or ignore them."
            % (settings['DOWNLOADER_CLIENTCONTEXTFACTORY'],)
        )
        warnings.warn(msg)

    self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
    self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
    self._fail_on_dataloss = settings.getbool('DOWNLOAD_FAIL_ON_DATALOSS')
    self._disconnect_timeout = 1
def _test_with_settings(mock, settings):
    create_instance(mock, settings, None, *args, **kwargs)
    if hasattr(mock, 'from_crawler'):
        self.assertEqual(mock.from_crawler.call_count, 0)
    if hasattr(mock, 'from_settings'):
        mock.from_settings.assert_called_once_with(settings, *args, **kwargs)
        self.assertEqual(mock.call_count, 0)
    else:
        mock.assert_called_once_with(*args, **kwargs)
def test_create_instance(self):
    settings = mock.MagicMock()
    crawler = mock.MagicMock(spec_set=['settings'])
    args = (True, 100.)
    kwargs = {'key': 'val'}

    def _test_with_settings(mock, settings):
        create_instance(mock, settings, None, *args, **kwargs)
        if hasattr(mock, 'from_crawler'):
            self.assertEqual(mock.from_crawler.call_count, 0)
        if hasattr(mock, 'from_settings'):
            mock.from_settings.assert_called_once_with(settings, *args, **kwargs)
            self.assertEqual(mock.call_count, 0)
        else:
            mock.assert_called_once_with(*args, **kwargs)

    def _test_with_crawler(mock, settings, crawler):
        create_instance(mock, settings, crawler, *args, **kwargs)
        if hasattr(mock, 'from_crawler'):
            mock.from_crawler.assert_called_once_with(crawler, *args, **kwargs)
            if hasattr(mock, 'from_settings'):
                self.assertEqual(mock.from_settings.call_count, 0)
            self.assertEqual(mock.call_count, 0)
        elif hasattr(mock, 'from_settings'):
            mock.from_settings.assert_called_once_with(settings, *args, **kwargs)
            self.assertEqual(mock.call_count, 0)
        else:
            mock.assert_called_once_with(*args, **kwargs)

    # Check usage of correct constructor using four mocks:
    #   1. with no alternative constructors
    #   2. with from_settings() constructor
    #   3. with from_crawler() constructor
    #   4. with from_settings() and from_crawler() constructor
    spec_sets = ([], ['from_settings'], ['from_crawler'],
                 ['from_settings', 'from_crawler'])
    for specs in spec_sets:
        m = mock.MagicMock(spec_set=specs)
        _test_with_settings(m, settings)
        m.reset_mock()
        _test_with_crawler(m, settings, crawler)

    # Check adoption of crawler settings
    m = mock.MagicMock(spec_set=['from_settings'])
    create_instance(m, None, crawler, *args, **kwargs)
    m.from_settings.assert_called_once_with(crawler.settings, *args, **kwargs)

    with self.assertRaises(ValueError):
        create_instance(m, None, None)
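The test above encodes the constructor-resolution order that create_instance is expected to follow. The sketch below is a simplified approximation consistent with those assertions, not the exact scrapy.utils.misc implementation; the name create_instance_sketch is chosen here for illustration only.

# Hedged sketch: an approximation of the behaviour asserted by the test above,
# not a verbatim copy of scrapy.utils.misc.create_instance.
def create_instance_sketch(objcls, settings, crawler, *args, **kwargs):
    if settings is None:
        if crawler is None:
            raise ValueError("Specify at least one of settings and crawler.")
        settings = crawler.settings  # adopt the crawler's settings
    if crawler is not None and hasattr(objcls, 'from_crawler'):
        # prefer the crawler-aware constructor when a crawler is available
        return objcls.from_crawler(crawler, *args, **kwargs)
    if hasattr(objcls, 'from_settings'):
        # otherwise fall back to the settings-based constructor
        return objcls.from_settings(settings, *args, **kwargs)
    # plain constructor as the last resort
    return objcls(*args, **kwargs)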
def test_extra_kw(self):
    try:
        crawler = get_crawler()
        create_instance(
            objcls=S3DownloadHandler,
            settings=None,
            crawler=crawler,
            extra_kw=True,
        )
    except Exception as e:
        self.assertIsInstance(e, (TypeError, NotConfigured))
    else:
        assert False
def from_crawler(cls, crawler):
    settings = crawler.settings
    connection_url = settings.get("RABBITMQ_CONNECTION_PARAMETERS")
    queue_class = load_object(settings.get("SCHEDULER_QUEUE_CLASS"))
    dupefilter_cls = load_object(settings["DUPEFILTER_CLASS"])
    dupefilter = create_instance(dupefilter_cls, settings, crawler)
    pqclass = load_object(settings["SCHEDULER_PRIORITY_QUEUE"])
    if pqclass is PriorityQueue:
        warnings.warn(
            "SCHEDULER_PRIORITY_QUEUE='queuelib.PriorityQueue'"
            " is no longer supported because of API changes; "
            "please use 'scrapy.pqueues.ScrapyPriorityQueue'",
            ScrapyDeprecationWarning,
        )
        from scrapy.pqueues import ScrapyPriorityQueue
        pqclass = ScrapyPriorityQueue

    dqclass = load_object(settings["SCHEDULER_DISK_QUEUE"])
    mqclass = load_object(settings["SCHEDULER_MEMORY_QUEUE"])
    logunser = settings.getbool("SCHEDULER_DEBUG")
    return cls(
        dupefilter,
        connection_url,
        jobdir=job_dir(settings),
        logunser=logunser,
        stats=crawler.stats,
        pqclass=pqclass,
        dqclass=dqclass,
        mqclass=mqclass,
        crawler=crawler,
        queue_class=queue_class,
    )
def _mq(self):
    """Create a new priority queue instance, with in-memory storage"""
    return create_instance(self.pqclass,
                           settings=None,
                           crawler=self.crawler,
                           downstream_queue_cls=self.mqclass,
                           key='')
def from_spider(cls, spider):
    settings = spider.settings
    # log
    dupefilter_debug = settings.get("DUPEFILTER_DEBUG",
                                    default_settings.DUPEFILTER_DEBUG)
    dupefilter_log = settings.get("DUPEFILTER_LOG",
                                  default_settings.DUPEFILTER_LOG)
    # filter key in server
    dupefilter_key = settings.get(
        "SCHEDULER_DUPEFILTER_KEY",
        default_settings.SCHEDULER_DUPEFILTER_KEY)
    fingerprint_by_kafka = settings.getbool(
        "FINGERPRINT_BY_KAFKA_MESSAGE",
        default_settings.FINGERPRINT_BY_KAFKA_MESSAGE)
    key = dupefilter_key % {'spider': spider.name}
    scheduler_filter_class = load_object(
        settings.get("SCHEDULER_FILTER_CLASS",
                     default_settings.SCHEDULER_FILTER_CLASS))
    filter_queue = create_instance(scheduler_filter_class, settings, None, key)
    return cls(filter_queue, dupefilter_debug, dupefilter_log,
               fingerprint_by_kafka)
def testPayload(self):
    s = "0123456789" * 10
    settings = Settings({'DOWNLOADER_CLIENT_TLS_CIPHERS': self.custom_ciphers})
    client_context_factory = create_instance(ScrapyClientContextFactory,
                                             settings=settings, crawler=None)
    return getPage(
        self.getURL("payload"), body=s,
        contextFactory=client_context_factory,
    ).addCallback(self.assertEqual, to_bytes(s))
def _load_handler(self, scheme, skip_lazy=False):
    path = self._schemes[scheme]
    try:
        # import the class that the path points to
        dhcls = load_object(path)
        if skip_lazy and getattr(dhcls, 'lazy', True):
            # skip when lazy loading is requested and the class is lazy
            # (either explicitly or by the default attribute value)
            return None
        # instantiate the handler
        dh = create_instance(
            objcls=dhcls,
            settings=self._crawler.settings,
            crawler=self._crawler,
        )
    except NotConfigured as ex:
        # NotConfigured: record this scheme as not configured
        self._notconfigured[scheme] = str(ex)
        return None
    except Exception as ex:
        logger.error('Loading "%(clspath)s" for scheme "%(scheme)s"',
                     {"clspath": path, "scheme": scheme},
                     exc_info=True, extra={'crawler': self._crawler})
        self._notconfigured[scheme] = str(ex)
        return None
    else:
        # no error: cache the handler for this scheme and return the instance
        self._handlers[scheme] = dh
        return dh
def from_settings(cls, settings, crawler=None):
    ## Create middleware instances from the crawler object and the settings.
    ## Call the subclass's _get_mwlist_from_settings() to get the list of
    ## middleware class paths from the settings.
    mwlist = cls._get_mwlist_from_settings(settings)
    ## Instances of the enabled middleware classes
    middlewares = []
    ## Paths of the enabled middleware classes
    enabled = []
    ## Instantiate them one by one
    for clspath in mwlist:
        try:
            ## Load the middleware class from its path
            mwcls = load_object(clspath)
            ## Create an instance of the middleware class
            mw = create_instance(mwcls, settings, crawler)
            middlewares.append(mw)
            enabled.append(clspath)
        except NotConfigured as e:
            if e.args:
                clsname = clspath.split('.')[-1]
                logger.warning("Disabled %(clsname)s: %(eargs)s",
                               {'clsname': clsname, 'eargs': e.args[0]},
                               extra={'crawler': crawler})
    logger.info("Enabled %(componentname)ss:\n%(enabledlist)s",
                {'componentname': cls.component_name,
                 'enabledlist': pprint.pformat(enabled)},
                extra={'crawler': crawler})
    ## Call the constructor with the enabled middlewares
    return cls(*middlewares)
def _mq(self):
    """Create a new priority queue instance, with in-memory storage"""
    return create_instance(self.pqclass, None, self.crawler, self._newmq,
                           serialize=False)
def qfactory(self, key):
    return create_instance(
        self.downstream_queue_cls,
        None,
        self.crawler,
        self.key + '/' + str(key),
    )
def setUp(self):
    site = server.Site(UriResource(), timeout=None)
    wrapper = WrappingFactory(site)
    self.port = reactor.listenTCP(0, wrapper, interface='127.0.0.1')
    self.portno = self.port.getHost().port
    self.download_handler = create_instance(self.download_handler_cls, None,
                                            get_crawler())
    self.download_request = self.download_handler.download_request
def from_crawler(cls, crawler):
    settings = crawler.settings
    persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
    table = settings.get('SCHEDULER_QUEUE_TABLE', QUEUE_TABLE)
    table = table % {'spider': crawler.spider.name}
    queue_cls = load_object(settings.get('SCHEDULER_QUEUE_CLASS', QUEUE_CLASS))
    idle_before_close = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE',
                                     IDLE_BEFORE_CLOSE)
    conn = connection.from_crawler(crawler)
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = create_instance(dupefilter_cls, settings, crawler)
    instance = cls(
        crawler,
        conn,
        persist,
        table,
        queue_cls,
        idle_before_close,
        crawler.stats,
        dupefilter,
    )
    return instance
def open_spider(self, spider: Spider, start_requests: Iterable = (), close_if_idle: bool = True):
    if self.slot is not None:
        raise RuntimeError(f"No free spider slot when opening {spider.name!r}")
    logger.info("Spider opened", extra={'spider': spider})
    # p.15 Create a CallLaterOnce instance to prepare the next pass of the event loop
    nextcall = CallLaterOnce(self._next_request)
    # p.16 Create the scheduler instance
    scheduler = create_instance(self.scheduler_cls, settings=None, crawler=self.crawler)
    # p.17 Run the spider middleware chain over the start requests
    start_requests = yield self.scraper.spidermw.process_start_requests(
        start_requests, spider)
    # p.18 Bundle the start requests, the deferred call and the scheduler into a slot
    self.slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
    self.spider = spider
    # p.19 Open the scheduler if it supports it
    if hasattr(scheduler, "open"):
        yield scheduler.open(spider)
    # p.20 Open the scraper
    yield self.scraper.open_spider(spider)
    # p.21 Start stats collection
    self.crawler.stats.open_spider(spider)
    # p.22 Fire the spider_opened signal, then schedule the first pass and the heartbeat
    yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)
    self.slot.nextcall.schedule()
    self.slot.heartbeat.start(5)
def test_passing_objects_as_values(self):
    from scrapy.core.downloader.handlers.file import FileDownloadHandler
    from scrapy.utils.misc import create_instance
    from scrapy.utils.test import get_crawler

    class TestPipeline():
        def process_item(self, i, s):
            return i

    settings = Settings({
        'ITEM_PIPELINES': {
            TestPipeline: 800,
        },
        'DOWNLOAD_HANDLERS': {
            'ftp': FileDownloadHandler,
        },
    })

    self.assertIn('ITEM_PIPELINES', settings.attributes)

    mypipeline, priority = settings.getdict('ITEM_PIPELINES').popitem()
    self.assertEqual(priority, 800)
    self.assertEqual(mypipeline, TestPipeline)
    self.assertIsInstance(mypipeline(), TestPipeline)
    self.assertEqual(mypipeline().process_item('item', None), 'item')

    myhandler = settings.getdict('DOWNLOAD_HANDLERS').pop('ftp')
    self.assertEqual(myhandler, FileDownloadHandler)
    myhandler_instance = create_instance(myhandler, None, get_crawler())
    self.assertIsInstance(myhandler_instance, FileDownloadHandler)
    self.assertTrue(hasattr(myhandler_instance, 'download_request'))
def from_crawler(cls, crawler):
    settings = crawler.settings
    # Get the fingerprint dupefilter class from the settings; see scrapy/dupefilters.py
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = create_instance(dupefilter_cls, settings, crawler)  # instantiate the dupefilter; it ends up as self.df
    # Request queues, see scrapy/squeues.py. The disk queue persists queued requests
    # to disk; the memory queue is lost on restart.
    # If the user configures JOBDIR, it affects both the dupefilter and the disk queue.
    pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
    if pqclass is PriorityQueue:
        warnings.warn(
            "SCHEDULER_PRIORITY_QUEUE='queuelib.PriorityQueue'"
            " is no longer supported because of API changes; "
            "please use 'scrapy.pqueues.ScrapyPriorityQueue'",
            ScrapyDeprecationWarning)
        from scrapy.pqueues import ScrapyPriorityQueue
        pqclass = ScrapyPriorityQueue

    dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
    mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
    # Switch for logging unserializable requests
    logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
                                settings.getbool('SCHEDULER_DEBUG'))
    # Instantiate the scheduler
    return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
               stats=crawler.stats, pqclass=pqclass, dqclass=dqclass,
               mqclass=mqclass, crawler=crawler)
def from_settings(cls, settings: Settings, crawler=None):
    mwlist = cls._get_mwlist_from_settings(settings)
    middlewares = []
    enabled = []
    for clspath in mwlist:
        # Turn the class path string into an instance, e.g.
        # 'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware'
        try:
            mwcls = load_object(clspath)
            mw = create_instance(mwcls, settings, crawler)
            # Successfully created instances go here
            middlewares.append(mw)
            # Record the enabled class path
            enabled.append(clspath)
        except NotConfigured as e:
            # Creation failed: log a warning and skip this middleware
            if e.args:
                clsname = clspath.split('.')[-1]
                logger.warning("Disabled %(clsname)s: %(eargs)s", {
                    'clsname': clsname,
                    'eargs': e.args[0]
                }, extra={'crawler': crawler})
    logger.info("Enabled %(componentname)ss:\n%(enabledlist)s", {
        'componentname': cls.component_name,
        'enabledlist': pprint.pformat(enabled)
    }, extra={'crawler': crawler})
    return cls(*middlewares)
def from_crawler(cls, crawler):
    settings = crawler.settings
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = create_instance(dupefilter_cls, settings, crawler)
    pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
    if pqclass is PriorityQueue:
        warnings.warn(
            "SCHEDULER_PRIORITY_QUEUE='queuelib.PriorityQueue'"
            " is no longer supported because of API changes; "
            "please use 'scrapy.pqueues.ScrapyPriorityQueue'",
            ScrapyDeprecationWarning)
        from scrapy.pqueues import ScrapyPriorityQueue
        pqclass = ScrapyPriorityQueue

    dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
    mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
    logunser = settings.getbool('SCHEDULER_DEBUG')
    return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
               stats=crawler.stats, pqclass=pqclass, dqclass=dqclass,
               mqclass=mqclass, crawler=crawler)
def setUp(self):
    from twisted.protocols.ftp import FTPRealm, FTPFactory
    from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler

    # setup dirs and test file
    self.directory = self.mktemp()
    os.mkdir(self.directory)
    userdir = os.path.join(self.directory, self.username)
    os.mkdir(userdir)
    fp = FilePath(userdir)
    fp.child('file.txt').setContent(b"I have the power!")
    fp.child('file with spaces.txt').setContent(b"Moooooooooo power!")

    # setup server
    realm = FTPRealm(anonymousRoot=self.directory, userHome=self.directory)
    p = portal.Portal(realm)
    users_checker = checkers.InMemoryUsernamePasswordDatabaseDontUse()
    users_checker.addUser(self.username, self.password)
    p.registerChecker(users_checker, credentials.IUsernamePassword)
    self.factory = FTPFactory(portal=p)
    self.port = reactor.listenTCP(0, self.factory, interface="127.0.0.1")
    self.portNum = self.port.getHost().port
    crawler = get_crawler()
    self.download_handler = create_instance(FTPDownloadHandler,
                                            crawler.settings, crawler)
    self.addCleanup(self.port.stopListening)
def setUp(self):
    self.tmpname = self.mktemp()
    os.mkdir(self.tmpname)
    FilePath(self.tmpname).child("file").setContent(b"0123456789")
    r = static.File(self.tmpname)
    r.putChild(b"redirect", util.Redirect(b"/file"))
    r.putChild(b"wait", ForeverTakingResource())
    r.putChild(b"hang-after-headers", ForeverTakingResource(write=True))
    r.putChild(b"nolength", NoLengthResource())
    r.putChild(b"host", HostHeaderResource())
    r.putChild(b"payload", PayloadResource())
    r.putChild(b"broken", BrokenDownloadResource())
    r.putChild(b"chunked", ChunkedResource())
    r.putChild(b"broken-chunked", BrokenChunkedResource())
    r.putChild(b"contentlength", ContentLengthHeaderResource())
    r.putChild(b"nocontenttype", EmptyContentTypeHeaderResource())
    r.putChild(b"largechunkedfile", LargeChunkedFileResource())
    r.putChild(b"echo", Echo())
    self.site = server.Site(r, timeout=None)
    self.wrapper = WrappingFactory(self.site)
    self.host = 'localhost'
    if self.scheme == 'https':
        self.port = reactor.listenSSL(
            0, self.wrapper,
            ssl_context_factory(self.keyfile, self.certfile),
            interface=self.host)
    else:
        self.port = reactor.listenTCP(0, self.wrapper, interface=self.host)
    self.portno = self.port.getHost().port
    self.download_handler = create_instance(self.download_handler_cls, None,
                                            get_crawler())
    self.download_request = self.download_handler.download_request
def start(self, stop_after_crawl=True):
    """
    This method starts a :mod:`~twisted.internet.reactor`, adjusts its pool
    size to :setting:`REACTOR_THREADPOOL_MAXSIZE`, and installs a DNS cache
    based on :setting:`DNSCACHE_ENABLED` and :setting:`DNSCACHE_SIZE`.

    If ``stop_after_crawl`` is True, the reactor will be stopped after all
    crawlers have finished, using :meth:`join`.

    :param boolean stop_after_crawl: whether to stop the reactor once all
        crawlers have finished
    """
    from twisted.internet import reactor
    if stop_after_crawl:
        d = self.join()
        # Don't start the reactor if the deferreds are already fired
        if d.called:
            return
        d.addBoth(self._stop_reactor)

    resolver_class = load_object(self.settings["DNS_RESOLVER"])
    resolver = create_instance(resolver_class, self.settings, self, reactor=reactor)
    resolver.install_on_reactor()
    tp = reactor.getThreadPool()
    tp.adjustPoolsize(maxthreads=self.settings.getint('REACTOR_THREADPOOL_MAXSIZE'))
    reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
    reactor.run(installSignalHandlers=False)  # blocking call
def setUp(self):
    from twisted.protocols.ftp import FTPRealm, FTPFactory
    from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler

    # setup dir and test file
    self.directory = self.mktemp()
    os.mkdir(self.directory)
    fp = FilePath(self.directory)
    fp.child('file.txt').setContent(b"I have the power!")
    fp.child('file with spaces.txt').setContent(b"Moooooooooo power!")

    # setup server for anonymous access
    realm = FTPRealm(anonymousRoot=self.directory)
    p = portal.Portal(realm)
    p.registerChecker(checkers.AllowAnonymousAccess(), credentials.IAnonymous)
    self.factory = FTPFactory(portal=p, userAnonymous=self.username)
    self.port = reactor.listenTCP(0, self.factory, interface="127.0.0.1")
    self.portNum = self.port.getHost().port
    crawler = get_crawler()
    self.download_handler = create_instance(FTPDownloadHandler,
                                            crawler.settings, crawler)
    self.addCleanup(self.port.stopListening)
def from_settings(cls, settings: Settings, crawler=None):
    mwlist = cls._get_mwlist_from_settings(settings)
    middlewares = []
    enabled = []
    for clspath in mwlist:
        try:
            mwcls = load_object(clspath)
            mw = create_instance(mwcls, settings, crawler)
            middlewares.append(mw)
            enabled.append(clspath)
        except NotConfigured as e:
            if e.args:
                clsname = clspath.split('.')[-1]
                logger.warning("Disabled %(clsname)s: %(eargs)s", {
                    'clsname': clsname,
                    'eargs': e.args[0]
                }, extra={'crawler': crawler})
    logger.info("Enabled %(componentname)ss:\n%(enabledlist)s", {
        'componentname': cls.component_name,
        'enabledlist': pprint.pformat(enabled)
    }, extra={'crawler': crawler})
    return cls(*middlewares)
def _load_handler(self, scheme, skip_lazy=False):
    path = self._schemes[scheme]
    try:
        dhcls = load_object(path)
        if skip_lazy and getattr(dhcls, "lazy", True):
            return None
        dh = create_instance(
            objcls=dhcls,
            settings=self._crawler.settings,
            crawler=self._crawler,
        )
    except NotConfigured as ex:
        self._notconfigured[scheme] = str(ex)
        return None
    except Exception as ex:
        logger.error(
            'Loading "%(clspath)s" for scheme "%(scheme)s"',
            {"clspath": path, "scheme": scheme},
            exc_info=True,
            extra={"crawler": self._crawler},
        )
        self._notconfigured[scheme] = str(ex)
        return None
    else:
        self._handlers[scheme] = dh
        return dh
def from_settings(cls, settings, crawler=None):
    # Instantiate the middlewares; this method is shared by all middleware managers.
    # Get the middleware list from the settings. Each component overrides
    # _get_mwlist_from_settings(), so different managers load different lists.
    mwlist = cls._get_mwlist_from_settings(settings)
    middlewares = []
    enabled = []
    for clspath in mwlist:
        try:
            mwcls = load_object(clspath)
            # create_instance() calls from_crawler() or from_settings(),
            # so at this point the middleware instance is already created
            mw = create_instance(mwcls, settings, crawler)
            middlewares.append(mw)  # list of middleware instances
            enabled.append(clspath)
        except NotConfigured as e:
            if e.args:
                clsname = clspath.split('.')[-1]
                logger.warning("Disabled %(clsname)s: %(eargs)s", {
                    'clsname': clsname,
                    'eargs': e.args[0]
                }, extra={'crawler': crawler})
    logger.info("Enabled %(componentname)ss:\n%(enabledlist)s", {
        'componentname': cls.component_name,
        'enabledlist': pprint.pformat(enabled)
    }, extra={'crawler': crawler})
    return cls(*middlewares)
def __init__(self, spidercls, settings=None, init_reactor: bool = False):
    if isinstance(spidercls, Spider):
        raise ValueError('The spidercls argument must be a class, not an object')

    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)

    self.spidercls = spidercls
    self.settings = settings.copy()
    self.spidercls.update_settings(self.settings)

    self.signals = SignalManager(self)

    self.stats = load_object(self.settings['STATS_CLASS'])(self)

    handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
    logging.root.addHandler(handler)

    d = dict(overridden_settings(self.settings))
    logger.info("Overridden settings:\n%(settings)s",
                {'settings': pprint.pformat(d)})

    if get_scrapy_root_handler() is not None:
        # scrapy root handler already installed: update it with new settings
        install_scrapy_root_handler(self.settings)
    # lambda is assigned to Crawler attribute because this way it is not
    # garbage collected after leaving __init__ scope
    self.__remove_handler = lambda: logging.root.removeHandler(handler)
    self.signals.connect(self.__remove_handler, signals.engine_stopped)

    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)

    self.request_fingerprinter = create_instance(
        load_object(self.settings['REQUEST_FINGERPRINTER_CLASS']),
        settings=self.settings,
        crawler=self,
    )

    reactor_class = self.settings.get("TWISTED_REACTOR")
    if init_reactor:
        # this needs to be done after the spider settings are merged,
        # but before something imports twisted.internet.reactor
        if reactor_class:
            install_reactor(reactor_class, self.settings["ASYNCIO_EVENT_LOOP"])
        else:
            from twisted.internet import reactor  # noqa: F401
        log_reactor_info()
    if reactor_class:
        verify_installed_reactor(reactor_class)

    self.extensions = ExtensionManager.from_crawler(self)

    self.settings.freeze()
    self.crawling = False
    self.spider = None
    self.engine = None
def test_download_broken_content_allow_data_loss_via_setting(self, url='broken'):
    crawler = get_crawler(settings_dict={'DOWNLOAD_FAIL_ON_DATALOSS': False})
    download_handler = create_instance(self.download_handler_cls, None, crawler)
    request = Request(self.getURL(url))
    d = download_handler.download_request(request, Spider('foo'))
    d.addCallback(lambda r: r.flags)
    d.addCallback(self.assertEqual, ['dataloss'])
    return d
def from_crawler(cls, crawler):
    settings = crawler.settings
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = create_instance(dupefilter_cls, settings, crawler)
    pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
    dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
    mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
    logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
                                settings.getbool('SCHEDULER_DEBUG'))
    return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
               stats=crawler.stats, pqclass=pqclass, dqclass=dqclass,
               mqclass=mqclass)
def _dq(self):
    """Create a new priority queue instance, with disk storage"""
    state = self._read_dqs_state(self.dqdir)
    q = create_instance(self.pqclass,
                        None,
                        self.crawler,
                        self._newdq,
                        state,
                        serialize=True)
    if q:
        logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
                    {'queuesize': len(q)}, extra={'spider': self.spider})
    return q
def from_crawler(cls, crawler): settings = crawler.settings dupefilter_cls = load_object(settings['DUPEFILTER_CLASS']) dupefilter = create_instance(dupefilter_cls, settings, crawler) pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE']) if pqclass is PriorityQueue: warnings.warn("SCHEDULER_PRIORITY_QUEUE='queuelib.PriorityQueue'" " is no longer supported because of API changes; " "please use 'scrapy.pqueues.ScrapyPriorityQueue'", ScrapyDeprecationWarning) from scrapy.pqueues import ScrapyPriorityQueue pqclass = ScrapyPriorityQueue dqclass = load_object(settings['SCHEDULER_DISK_QUEUE']) mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE']) logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS', settings.getbool('SCHEDULER_DEBUG')) return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser, stats=crawler.stats, pqclass=pqclass, dqclass=dqclass, mqclass=mqclass, crawler=crawler)
def from_settings(cls, settings, crawler=None):
    mwlist = cls._get_mwlist_from_settings(settings)
    middlewares = []
    enabled = []
    for clspath in mwlist:
        try:
            mwcls = load_object(clspath)
            mw = create_instance(mwcls, settings, crawler)
            middlewares.append(mw)
            enabled.append(clspath)
        except NotConfigured as e:
            if e.args:
                clsname = clspath.split('.')[-1]
                logger.warning("Disabled %(clsname)s: %(eargs)s",
                               {'clsname': clsname, 'eargs': e.args[0]},
                               extra={'crawler': crawler})
    logger.info("Enabled %(componentname)ss:\n%(enabledlist)s",
                {'componentname': cls.component_name,
                 'enabledlist': pprint.pformat(enabled)},
                extra={'crawler': crawler})
    return cls(*middlewares)
def _get_instance(self, objcls, *args, **kwargs):
    return create_instance(
        objcls, self.settings, getattr(self, 'crawler', None),
        *args, **kwargs)