Example 1
    def from_crawler(cls, crawler):
        settings = crawler.settings

        # ------------------------dupefilter-------------------------
        dupefilter_cls = load_object(
            settings.get("DUPEFILTER_CLASS",
                         default_settings.SCHEDULER_DUPEFILTER_CLASS))
        dupefilter = create_instance(dupefilter_cls, None, crawler)
        server = client_from_settings(settings)
        # ------------------------scheduler-------------------------
        scheduler_queue_class = load_object(
            default_settings.SCHEDULER_PRIORITY_QUEUE)
        scheduler_queue_key = settings.get(
            "SCHEDULER_QUEUE_KEY", default_settings.SCHEDULER_QUEUE_KEY)
        scheduler_serializer = load_object(
            settings.get("SCHEDULER_QUEUE_SERIALIZER",
                         default_settings.SCHEDULER_QUEUE_SERIALIZER))
        # TODO: Make scheduler_queue_key more diverse
        queue = create_instance(
            scheduler_queue_class, None, crawler, server,
            str(scheduler_queue_key % {'spider': crawler.spider.name}),
            scheduler_serializer)
        scheduler_queue_pop_timeout = settings.get(
            "SCHEDULER_QUEUE_POP_TIMEOUT",
            default_settings.SCHEDULER_QUEUE_POP_TIMEOUT)
        scheduler_clear_queue_at_open = settings.get(
            "SCHEDULER_CLEAR_QUEUE_AT_OPEN",
            default_settings.SCHEDULER_CLEAR_QUEUE_AT_OPEN)
        return cls(dupefilter,
                   queue=queue,
                   stats=crawler.stats,
                   crawler=crawler,
                   scheduler_queue_pop_timeout=scheduler_queue_pop_timeout,
                   scheduler_clear_queue_at_open=scheduler_clear_queue_at_open)
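Example 1 derives a per-spider queue key from a `%(spider)s`-style template before passing it to the queue class. A quick illustration of that substitution; the template value below is an assumption mirroring the usual scrapy-redis default, not something taken from this code:

# Assumed template, in the style of a typical SCHEDULER_QUEUE_KEY default.
SCHEDULER_QUEUE_KEY = "%(spider)s:requests"

# The scheduler fills in the running spider's name, giving one queue per spider.
key = SCHEDULER_QUEUE_KEY % {"spider": "example_spider"}
print(key)  # -> example_spider:requests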
Example 2
def load_context_factory_from_settings(settings, crawler):
    ssl_method = openssl_methods[settings.get('DOWNLOADER_CLIENT_TLS_METHOD')]
    context_factory_cls = load_object(
        settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
    # try method-aware context factory
    try:
        context_factory = create_instance(
            objcls=context_factory_cls,
            settings=settings,
            crawler=crawler,
            method=ssl_method,
        )
    except TypeError:
        # use context factory defaults
        context_factory = create_instance(
            objcls=context_factory_cls,
            settings=settings,
            crawler=crawler,
        )
        msg = (
            f"{settings['DOWNLOADER_CLIENTCONTEXTFACTORY']} does not accept "
            "a `method` argument (type OpenSSL.SSL method, e.g. "
            "OpenSSL.SSL.SSLv23_METHOD) and/or a `tls_verbose_logging` "
            "argument and/or a `tls_ciphers` argument. Please, upgrade your "
            "context factory class to handle them or ignore them.")
        warnings.warn(msg)

    return context_factory
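Examples 2 and 3 fall back to building the context factory without the extra keyword arguments when the configured class raises TypeError. A minimal sketch of a factory whose constructor accepts those keywords, so the fallback (and the warning) is never hit; the class name is illustrative, and a real factory would also need to implement the client TLS interface that ScrapyClientContextFactory provides:

from OpenSSL import SSL

class MethodAwareContextFactory:
    # Hypothetical sketch: accepting method / tls_verbose_logging / tls_ciphers
    # keeps the method-aware create_instance call above from raising TypeError.
    def __init__(self, method=SSL.SSLv23_METHOD,
                 tls_verbose_logging=False, tls_ciphers=None, **kwargs):
        self._ssl_method = method
        self.tls_verbose_logging = tls_verbose_logging
        self.tls_ciphers = tls_ciphers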
Example 3
    def __init__(self, settings):
        self._pool = HTTPConnectionPool(reactor, persistent=True)
        self._pool.maxPersistentPerHost = settings.getint(
            'CONCURRENT_REQUESTS_PER_DOMAIN')
        self._pool._factory.noisy = False

        self._sslMethod = openssl_methods[settings.get(
            'DOWNLOADER_CLIENT_TLS_METHOD')]
        self._contextFactoryClass = load_object(
            settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        # try method-aware context factory
        try:
            self._contextFactory = create_instance(
                self._contextFactoryClass,
                settings=settings,
                crawler=None,
                method=self._sslMethod,
            )
        except TypeError:
            # use context factory defaults
            self._contextFactory = create_instance(
                self._contextFactoryClass,
                settings=settings,
                crawler=None,
            )
            msg = """
 '%s' does not accept `method` argument (type OpenSSL.SSL method,\
 e.g. OpenSSL.SSL.SSLv23_METHOD) and/or `tls_verbose_logging` argument and/or `tls_ciphers` argument.\
 Please upgrade your context factory class to handle them or ignore them.""" % (
                settings['DOWNLOADER_CLIENTCONTEXTFACTORY'], )
            warnings.warn(msg)
        self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
        self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
        self._fail_on_dataloss = settings.getbool('DOWNLOAD_FAIL_ON_DATALOSS')
        self._disconnect_timeout = 1
Example 4
 def _test_with_settings(mock, settings):
     create_instance(mock, settings, None, *args, **kwargs)
     if hasattr(mock, 'from_crawler'):
         self.assertEqual(mock.from_crawler.call_count, 0)
     if hasattr(mock, 'from_settings'):
         mock.from_settings.assert_called_once_with(
             settings, *args, **kwargs)
         self.assertEqual(mock.call_count, 0)
     else:
         mock.assert_called_once_with(*args, **kwargs)
Example 5
 def _test_with_settings(mock, settings):
     create_instance(mock, settings, None, *args, **kwargs)
     if hasattr(mock, 'from_crawler'):
         self.assertEqual(mock.from_crawler.call_count, 0)
     if hasattr(mock, 'from_settings'):
         mock.from_settings.assert_called_once_with(settings, *args,
                                                    **kwargs)
         self.assertEqual(mock.call_count, 0)
     else:
         mock.assert_called_once_with(*args, **kwargs)
Example 6
    def test_create_instance(self):
        settings = mock.MagicMock()
        crawler = mock.MagicMock(spec_set=['settings'])
        args = (True, 100.)
        kwargs = {'key': 'val'}

        def _test_with_settings(mock, settings):
            create_instance(mock, settings, None, *args, **kwargs)
            if hasattr(mock, 'from_crawler'):
                self.assertEqual(mock.from_crawler.call_count, 0)
            if hasattr(mock, 'from_settings'):
                mock.from_settings.assert_called_once_with(
                    settings, *args, **kwargs)
                self.assertEqual(mock.call_count, 0)
            else:
                mock.assert_called_once_with(*args, **kwargs)

        def _test_with_crawler(mock, settings, crawler):
            create_instance(mock, settings, crawler, *args, **kwargs)
            if hasattr(mock, 'from_crawler'):
                mock.from_crawler.assert_called_once_with(
                    crawler, *args, **kwargs)
                if hasattr(mock, 'from_settings'):
                    self.assertEqual(mock.from_settings.call_count, 0)
                self.assertEqual(mock.call_count, 0)
            elif hasattr(mock, 'from_settings'):
                mock.from_settings.assert_called_once_with(
                    settings, *args, **kwargs)
                self.assertEqual(mock.call_count, 0)
            else:
                mock.assert_called_once_with(*args, **kwargs)

        # Check usage of correct constructor using four mocks:
        #   1. with no alternative constructors
        #   2. with from_settings() constructor
        #   3. with from_crawler() constructor
        #   4. with from_settings() and from_crawler() constructor
        spec_sets = ([], ['from_settings'], ['from_crawler'],
                     ['from_settings', 'from_crawler'])
        for specs in spec_sets:
            m = mock.MagicMock(spec_set=specs)
            _test_with_settings(m, settings)
            m.reset_mock()
            _test_with_crawler(m, settings, crawler)

        # Check adoption of crawler settings
        m = mock.MagicMock(spec_set=['from_settings'])
        create_instance(m, None, crawler, *args, **kwargs)
        m.from_settings.assert_called_once_with(crawler.settings, *args,
                                                **kwargs)

        with self.assertRaises(ValueError):
            create_instance(m, None, None)
Example 7
    def test_create_instance(self):
        settings = mock.MagicMock()
        crawler = mock.MagicMock(spec_set=['settings'])
        args = (True, 100.)
        kwargs = {'key': 'val'}

        def _test_with_settings(mock, settings):
            create_instance(mock, settings, None, *args, **kwargs)
            if hasattr(mock, 'from_crawler'):
                self.assertEqual(mock.from_crawler.call_count, 0)
            if hasattr(mock, 'from_settings'):
                mock.from_settings.assert_called_once_with(settings, *args,
                                                           **kwargs)
                self.assertEqual(mock.call_count, 0)
            else:
                mock.assert_called_once_with(*args, **kwargs)

        def _test_with_crawler(mock, settings, crawler):
            create_instance(mock, settings, crawler, *args, **kwargs)
            if hasattr(mock, 'from_crawler'):
                mock.from_crawler.assert_called_once_with(crawler, *args,
                                                          **kwargs)
                if hasattr(mock, 'from_settings'):
                    self.assertEqual(mock.from_settings.call_count, 0)
                self.assertEqual(mock.call_count, 0)
            elif hasattr(mock, 'from_settings'):
                mock.from_settings.assert_called_once_with(settings, *args,
                                                           **kwargs)
                self.assertEqual(mock.call_count, 0)
            else:
                mock.assert_called_once_with(*args, **kwargs)

        # Check usage of correct constructor using four mocks:
        #   1. with no alternative constructors
        #   2. with from_settings() constructor
        #   3. with from_crawler() constructor
        #   4. with from_settings() and from_crawler() constructor
        spec_sets = ([], ['from_settings'], ['from_crawler'],
                     ['from_settings', 'from_crawler'])
        for specs in spec_sets:
            m = mock.MagicMock(spec_set=specs)
            _test_with_settings(m, settings)
            m.reset_mock()
            _test_with_crawler(m, settings, crawler)

        # Check adoption of crawler settings
        m = mock.MagicMock(spec_set=['from_settings'])
        create_instance(m, None, crawler, *args, **kwargs)
        m.from_settings.assert_called_once_with(crawler.settings, *args,
                                                **kwargs)

        with self.assertRaises(ValueError):
            create_instance(m, None, None)
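The tests in Examples 5 through 8 pin down the constructor-selection order that every call site in this collection relies on. A rough sketch of that dispatch, assuming only the behaviour the tests above assert rather than quoting Scrapy's exact implementation:

def create_instance_sketch(objcls, settings, crawler, *args, **kwargs):
    # Preference order verified by test_create_instance:
    # from_crawler > from_settings > plain constructor.
    if settings is None:
        if crawler is None:
            raise ValueError("Specify at least one of settings and crawler.")
        settings = crawler.settings  # adopt the crawler's settings
    if crawler is not None and hasattr(objcls, "from_crawler"):
        return objcls.from_crawler(crawler, *args, **kwargs)
    if hasattr(objcls, "from_settings"):
        return objcls.from_settings(settings, *args, **kwargs)
    return objcls(*args, **kwargs)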
Example 8
 def test_extra_kw(self):
     try:
         crawler = get_crawler()
         create_instance(
             objcls=S3DownloadHandler,
             settings=None,
             crawler=crawler,
             extra_kw=True,
         )
     except Exception as e:
         self.assertIsInstance(e, (TypeError, NotConfigured))
     else:
         assert False
Example 9
    def from_crawler(cls, crawler):
        settings = crawler.settings
        connection_url = settings.get("RABBITMQ_CONNECTION_PARAMETERS")
        queue_class = load_object(settings.get("SCHEDULER_QUEUE_CLASS"))
        dupefilter_cls = load_object(settings["DUPEFILTER_CLASS"])
        dupefilter = create_instance(dupefilter_cls, settings, crawler)
        pqclass = load_object(settings["SCHEDULER_PRIORITY_QUEUE"])
        if pqclass is PriorityQueue:
            warnings.warn(
                "SCHEDULER_PRIORITY_QUEUE='queuelib.PriorityQueue'"
                " is no longer supported because of API changes; "
                "please use 'scrapy.pqueues.ScrapyPriorityQueue'",
                ScrapyDeprecationWarning,
            )
            from scrapy.pqueues import ScrapyPriorityQueue

            pqclass = ScrapyPriorityQueue

        dqclass = load_object(settings["SCHEDULER_DISK_QUEUE"])
        mqclass = load_object(settings["SCHEDULER_MEMORY_QUEUE"])
        logunser = settings.getbool("SCHEDULER_DEBUG")
        return cls(
            dupefilter,
            connection_url,
            jobdir=job_dir(settings),
            logunser=logunser,
            stats=crawler.stats,
            pqclass=pqclass,
            dqclass=dqclass,
            mqclass=mqclass,
            crawler=crawler,
            queue_class=queue_class,
        )
Example 10
 def _mq(self):
     """ Create a new priority queue instance, with in-memory storage """
     return create_instance(self.pqclass,
                            settings=None,
                            crawler=self.crawler,
                            downstream_queue_cls=self.mqclass,
                            key='')
Example 11
    def from_spider(cls, spider):
        settings = spider.settings
        # log
        dupefilter_debug = settings.get("DUPEFILTER_LOG",
                                        default_settings.DUPEFILTER_DEBUG)
        dupefilter_log = settings.get("DUPEFILTER_LOG",
                                      default_settings.DUPEFILTER_LOG)

        # filter key in server
        dupefilter_key = settings.get(
            "SCHEDULER_DUPEFILTER_KEY",
            default_settings.SCHEDULER_DUPEFILTER_KEY)
        fingerprint_by_kafka = settings.getbool(
            "FINGERPRINT_BY_KAFKA_MESSAGE",
            default_settings.FINGERPRINT_BY_KAFKA_MESSAGE)

        key = dupefilter_key % {'spider': spider.name}

        scheduler_filter_class = load_object(
            settings.get("SCHEDULER_FILTER_CLASS",
                         default_settings.SCHEDULER_FILTER_CLASS))

        filter_queue = create_instance(scheduler_filter_class, settings, None,
                                       key)
        return cls(filter_queue, dupefilter_debug, dupefilter_log,
                   fingerprint_by_kafka)
Example 12
 def testPayload(self):
     s = "0123456789" * 10
     settings = Settings({'DOWNLOADER_CLIENT_TLS_CIPHERS': self.custom_ciphers})
     client_context_factory = create_instance(ScrapyClientContextFactory, settings=settings, crawler=None)
     return getPage(
         self.getURL("payload"), body=s, contextFactory=client_context_factory
     ).addCallback(self.assertEqual, to_bytes(s))
Example 13
 def _load_handler(self, scheme, skip_lazy=False):
     path = self._schemes[scheme]
     try:
         # Import the class the path refers to
         dhcls = load_object(path)
         if skip_lazy and getattr(dhcls, 'lazy', True):
             # Skip if lazy loading is requested and the class is lazy (the default)
             return None
         # Instantiate the handler
         dh = create_instance(
             objcls=dhcls,
             settings=self._crawler.settings,
             crawler=self._crawler,
         )
     except NotConfigured as ex:
         # On NotConfigured, record the scheme as not configured
         self._notconfigured[scheme] = str(ex)
         return None
     except Exception as ex:
         logger.error('Loading "%(clspath)s" for scheme "%(scheme)s"', {
             "clspath": path,
             "scheme": scheme
         },
                      exc_info=True,
                      extra={'crawler': self._crawler})
         self._notconfigured[scheme] = str(ex)
         return None
     else:
         # No error: cache the handler and return the instance
         self._handlers[scheme] = dh
         return dh
Example 14
    def from_settings(cls, settings, crawler=None):
        ## Build middleware instances from the crawler object and the settings

        ## Call the subclass's _get_mwlist_from_settings to get every middleware class path from the settings
        mwlist = cls._get_mwlist_from_settings(settings)
        ## Instances of the enabled middleware classes
        middlewares = []
        ## Paths of the enabled middleware classes
        enabled = []
        ## Instantiate them one by one
        for clspath in mwlist:
            try:
                ## Load the middleware class from its path
                mwcls = load_object(clspath)
                ## Create an instance of the middleware class
                mw = create_instance(mwcls, settings, crawler)
                middlewares.append(mw)
                enabled.append(clspath)
            except NotConfigured as e:
                if e.args:
                    clsname = clspath.split('.')[-1]
                    logger.warning("Disabled %(clsname)s: %(eargs)s",
                                   {'clsname': clsname, 'eargs': e.args[0]},
                                   extra={'crawler': crawler})

        logger.info("Enabled %(componentname)ss:\n%(enabledlist)s",
                    {'componentname': cls.component_name,
                     'enabledlist': pprint.pformat(enabled)},
                    extra={'crawler': crawler})
        ## Call the constructor with the middleware instances
        return cls(*middlewares)
Example 15
 def _mq(self):
     """ Create a new priority queue instance, with in-memory storage """
     return create_instance(self.pqclass,
                            None,
                            self.crawler,
                            self._newmq,
                            serialize=False)
Example 16
 def qfactory(self, key):
     return create_instance(
         self.downstream_queue_cls,
         None,
         self.crawler,
         self.key + '/' + str(key),
     )
Example 17
 def setUp(self):
     site = server.Site(UriResource(), timeout=None)
     wrapper = WrappingFactory(site)
     self.port = reactor.listenTCP(0, wrapper, interface='127.0.0.1')
     self.portno = self.port.getHost().port
     self.download_handler = create_instance(self.download_handler_cls, None, get_crawler())
     self.download_request = self.download_handler.download_request
Example 18
    def from_crawler(cls, crawler):

        settings = crawler.settings
        persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
        table = settings.get('SCHEDULER_QUEUE_TABLE', QUEUE_TABLE)
        table = table % {'spider': crawler.spider.name}

        queue_cls = load_object(
            settings.get('SCHEDULER_QUEUE_CLASS', QUEUE_CLASS))
        idle_before_close = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE',
                                         IDLE_BEFORE_CLOSE)
        conn = connection.from_crawler(crawler)
        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])

        dupefilter = create_instance(dupefilter_cls, settings, crawler)

        instance = cls(
            crawler,
            conn,
            persist,
            table,
            queue_cls,
            idle_before_close,
            crawler.stats,
            dupefilter,
        )

        return instance
Example 19
 def open_spider(self,
                 spider: Spider,
                 start_requests: Iterable = (),
                 close_if_idle: bool = True):
     if self.slot is not None:
         raise RuntimeError(
             f"No free spider slot when opening {spider.name!r}")
     logger.info("Spider opened", extra={'spider': spider})
     # p.15 Create the deferred call that drives the next scheduling round
     nextcall = CallLaterOnce(self._next_request)
     # p.16 Create the scheduler instance
     scheduler = create_instance(self.scheduler_cls,
                                 settings=None,
                                 crawler=self.crawler)
     # p.17 Hook in the spider middlewares and process the start requests
     start_requests = yield self.scraper.spidermw.process_start_requests(
         start_requests, spider)
     # p.18 Bundle the start requests, the deferred call and the scheduler into a Slot
     self.slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
     self.spider = spider
     # p.19
     if hasattr(scheduler, "open"):
         yield scheduler.open(spider)
     # p.20
     yield self.scraper.open_spider(spider)
     # p.21 Start stats collection
     self.crawler.stats.open_spider(spider)
     # p.22
     yield self.signals.send_catch_log_deferred(signals.spider_opened,
                                                spider=spider)
     self.slot.nextcall.schedule()
     self.slot.heartbeat.start(5)
Example 20
    def test_passing_objects_as_values(self):
        from scrapy.core.downloader.handlers.file import FileDownloadHandler
        from scrapy.utils.misc import create_instance
        from scrapy.utils.test import get_crawler

        class TestPipeline():
            def process_item(self, i, s):
                return i

        settings = Settings({
            'ITEM_PIPELINES': {
                TestPipeline: 800,
            },
            'DOWNLOAD_HANDLERS': {
                'ftp': FileDownloadHandler,
            },
        })

        self.assertIn('ITEM_PIPELINES', settings.attributes)

        mypipeline, priority = settings.getdict('ITEM_PIPELINES').popitem()
        self.assertEqual(priority, 800)
        self.assertEqual(mypipeline, TestPipeline)
        self.assertIsInstance(mypipeline(), TestPipeline)
        self.assertEqual(mypipeline().process_item('item', None), 'item')

        myhandler = settings.getdict('DOWNLOAD_HANDLERS').pop('ftp')
        self.assertEqual(myhandler, FileDownloadHandler)
        myhandler_instance = create_instance(myhandler, None, get_crawler())
        self.assertIsInstance(myhandler_instance, FileDownloadHandler)
        self.assertTrue(hasattr(myhandler_instance, 'download_request'))
Example 21
    def from_crawler(cls, crawler):
        settings = crawler.settings
        # Get the dupefilter class from the settings; see scrapy/dupefilters.py
        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
        dupefilter = create_instance(dupefilter_cls, settings,
                                     crawler)  # Instantiate the dupefilter; it ends up assigned to self.df
        # Request queues, see scrapy/squeues.py: the disk queue persists pending requests to disk, while the memory queue is lost on restart
        # If JOBDIR is configured, it affects both the dupefilter and the disk queue settings.
        pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
        if pqclass is PriorityQueue:
            warnings.warn(
                "SCHEDULER_PRIORITY_QUEUE='queuelib.PriorityQueue'"
                " is no longer supported because of API changes; "
                "please use 'scrapy.pqueues.ScrapyPriorityQueue'",
                ScrapyDeprecationWarning)
            from scrapy.pqueues import ScrapyPriorityQueue
            pqclass = ScrapyPriorityQueue

        dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
        mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
        # Whether to log requests that cannot be serialized
        logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
                                    settings.getbool('SCHEDULER_DEBUG'))
        return cls(dupefilter,
                   jobdir=job_dir(settings),
                   logunser=logunser,
                   stats=crawler.stats,
                   pqclass=pqclass,
                   dqclass=dqclass,
                   mqclass=mqclass,
                   crawler=crawler)  # Instantiate the Scheduler
Example 22
    def from_settings(cls, settings: Settings, crawler=None):
        mwlist = cls._get_mwlist_from_settings(settings)
        middlewares = []
        enabled = []
        for clspath in mwlist:
            # Turn each class path string into an instance, e.g. 'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware'
            try:
                mwcls = load_object(clspath)
                mw = create_instance(mwcls, settings, crawler)
                # Successfully created instances are collected here
                middlewares.append(mw)
                # Record the enabled class path
                enabled.append(clspath)
            except NotConfigured as e:
                # On failure (NotConfigured), log a warning and skip this middleware
                if e.args:
                    clsname = clspath.split('.')[-1]
                    logger.warning("Disabled %(clsname)s: %(eargs)s", {
                        'clsname': clsname,
                        'eargs': e.args[0]
                    },
                                   extra={'crawler': crawler})

        logger.info("Enabled %(componentname)ss:\n%(enabledlist)s", {
            'componentname': cls.component_name,
            'enabledlist': pprint.pformat(enabled)
        },
                    extra={'crawler': crawler})
        return cls(*middlewares)
Example 23
    def from_crawler(cls, crawler):
        settings = crawler.settings
        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
        dupefilter = create_instance(dupefilter_cls, settings, crawler)
        pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
        if pqclass is PriorityQueue:
            warnings.warn(
                "SCHEDULER_PRIORITY_QUEUE='queuelib.PriorityQueue'"
                " is no longer supported because of API changes; "
                "please use 'scrapy.pqueues.ScrapyPriorityQueue'",
                ScrapyDeprecationWarning)
            from scrapy.pqueues import ScrapyPriorityQueue
            pqclass = ScrapyPriorityQueue

        dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
        mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
        logunser = settings.getbool('SCHEDULER_DEBUG')
        return cls(dupefilter,
                   jobdir=job_dir(settings),
                   logunser=logunser,
                   stats=crawler.stats,
                   pqclass=pqclass,
                   dqclass=dqclass,
                   mqclass=mqclass,
                   crawler=crawler)
Example 24
    def setUp(self):
        from twisted.protocols.ftp import FTPRealm, FTPFactory
        from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler

        # setup dirs and test file
        self.directory = self.mktemp()
        os.mkdir(self.directory)
        userdir = os.path.join(self.directory, self.username)
        os.mkdir(userdir)
        fp = FilePath(userdir)
        fp.child('file.txt').setContent(b"I have the power!")
        fp.child('file with spaces.txt').setContent(b"Moooooooooo power!")

        # setup server
        realm = FTPRealm(anonymousRoot=self.directory, userHome=self.directory)
        p = portal.Portal(realm)
        users_checker = checkers.InMemoryUsernamePasswordDatabaseDontUse()
        users_checker.addUser(self.username, self.password)
        p.registerChecker(users_checker, credentials.IUsernamePassword)
        self.factory = FTPFactory(portal=p)
        self.port = reactor.listenTCP(0, self.factory, interface="127.0.0.1")
        self.portNum = self.port.getHost().port
        crawler = get_crawler()
        self.download_handler = create_instance(FTPDownloadHandler,
                                                crawler.settings, crawler)
        self.addCleanup(self.port.stopListening)
Example 25
 def setUp(self):
     self.tmpname = self.mktemp()
     os.mkdir(self.tmpname)
     FilePath(self.tmpname).child("file").setContent(b"0123456789")
     r = static.File(self.tmpname)
     r.putChild(b"redirect", util.Redirect(b"/file"))
     r.putChild(b"wait", ForeverTakingResource())
     r.putChild(b"hang-after-headers", ForeverTakingResource(write=True))
     r.putChild(b"nolength", NoLengthResource())
     r.putChild(b"host", HostHeaderResource())
     r.putChild(b"payload", PayloadResource())
     r.putChild(b"broken", BrokenDownloadResource())
     r.putChild(b"chunked", ChunkedResource())
     r.putChild(b"broken-chunked", BrokenChunkedResource())
     r.putChild(b"contentlength", ContentLengthHeaderResource())
     r.putChild(b"nocontenttype", EmptyContentTypeHeaderResource())
     r.putChild(b"largechunkedfile", LargeChunkedFileResource())
     r.putChild(b"echo", Echo())
     self.site = server.Site(r, timeout=None)
     self.wrapper = WrappingFactory(self.site)
     self.host = 'localhost'
     if self.scheme == 'https':
         self.port = reactor.listenSSL(0,
                                       self.wrapper,
                                       ssl_context_factory(
                                           self.keyfile, self.certfile),
                                       interface=self.host)
     else:
         self.port = reactor.listenTCP(0, self.wrapper, interface=self.host)
     self.portno = self.port.getHost().port
     self.download_handler = create_instance(self.download_handler_cls,
                                             None, get_crawler())
     self.download_request = self.download_handler.download_request
Example 26
    def start(self, stop_after_crawl=True):
        """
        This method starts a :mod:`~twisted.internet.reactor`, adjusts its pool
        size to :setting:`REACTOR_THREADPOOL_MAXSIZE`, and installs a DNS cache
        based on :setting:`DNSCACHE_ENABLED` and :setting:`DNSCACHE_SIZE`.

        If ``stop_after_crawl`` is True, the reactor will be stopped after all
        crawlers have finished, using :meth:`join`.

        :param boolean stop_after_crawl: stop or not the reactor when all
            crawlers have finished
        """
        from twisted.internet import reactor
        if stop_after_crawl:
            d = self.join()
            # Don't start the reactor if the deferreds are already fired
            if d.called:
                return
            d.addBoth(self._stop_reactor)

        resolver_class = load_object(self.settings["DNS_RESOLVER"])
        resolver = create_instance(resolver_class,
                                   self.settings,
                                   self,
                                   reactor=reactor)
        resolver.install_on_reactor()
        tp = reactor.getThreadPool()
        tp.adjustPoolsize(
            maxthreads=self.settings.getint('REACTOR_THREADPOOL_MAXSIZE'))
        reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
        reactor.run(installSignalHandlers=False)  # blocking call
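The docstring above describes what start() does once crawls have been scheduled; a minimal driver script showing the usual call sequence (the spider and the settings dict below are placeholders for illustration, not part of the example):

import scrapy
from scrapy.crawler import CrawlerProcess

class TitleSpider(scrapy.Spider):
    name = "title"
    start_urls = ["https://example.com"]

    def parse(self, response):
        yield {"title": response.css("title::text").get()}

process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
process.crawl(TitleSpider)
process.start()  # blocks until crawling finishes, then stops the reactor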
Example 27
    def setUp(self):
        from twisted.protocols.ftp import FTPRealm, FTPFactory
        from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler

        # setup dir and test file
        self.directory = self.mktemp()
        os.mkdir(self.directory)

        fp = FilePath(self.directory)
        fp.child('file.txt').setContent(b"I have the power!")
        fp.child('file with spaces.txt').setContent(b"Moooooooooo power!")

        # setup server for anonymous access
        realm = FTPRealm(anonymousRoot=self.directory)
        p = portal.Portal(realm)
        p.registerChecker(checkers.AllowAnonymousAccess(),
                          credentials.IAnonymous)

        self.factory = FTPFactory(portal=p, userAnonymous=self.username)
        self.port = reactor.listenTCP(0, self.factory, interface="127.0.0.1")
        self.portNum = self.port.getHost().port
        crawler = get_crawler()
        self.download_handler = create_instance(FTPDownloadHandler,
                                                crawler.settings, crawler)
        self.addCleanup(self.port.stopListening)
Example 28
    def from_settings(cls, settings: Settings, crawler=None):
        mwlist = cls._get_mwlist_from_settings(settings)
        middlewares = []
        enabled = []
        for clspath in mwlist:
            try:
                mwcls = load_object(clspath)
                mw = create_instance(mwcls, settings, crawler)
                middlewares.append(mw)
                enabled.append(clspath)
            except NotConfigured as e:
                if e.args:
                    clsname = clspath.split('.')[-1]
                    logger.warning("Disabled %(clsname)s: %(eargs)s", {
                        'clsname': clsname,
                        'eargs': e.args[0]
                    },
                                   extra={'crawler': crawler})

        logger.info("Enabled %(componentname)ss:\n%(enabledlist)s", {
            'componentname': cls.component_name,
            'enabledlist': pprint.pformat(enabled)
        },
                    extra={'crawler': crawler})
        return cls(*middlewares)
Example 29
 def _load_handler(self, scheme, skip_lazy=False):
     path = self._schemes[scheme]
     try:
         dhcls = load_object(path)
         if skip_lazy and getattr(dhcls, "lazy", True):
             return None
         dh = create_instance(
             objcls=dhcls,
             settings=self._crawler.settings,
             crawler=self._crawler,
         )
     except NotConfigured as ex:
         self._notconfigured[scheme] = str(ex)
         return None
     except Exception as ex:
         logger.error(
             'Loading "%(clspath)s" for scheme "%(scheme)s"',
             {
                 "clspath": path,
                 "scheme": scheme
             },
             exc_info=True,
             extra={"crawler": self._crawler},
         )
         self._notconfigured[scheme] = str(ex)
         return None
     else:
         self._handlers[scheme] = dh
         return dh
Example 30
    def from_settings(cls, settings, crawler=None):  # Instantiate middlewares; a helper shared by all middleware managers
        mwlist = cls._get_mwlist_from_settings(
            settings)  # Get the middleware list from the settings; each manager subclass overrides this, so each loads a different list
        middlewares = []
        enabled = []
        for clspath in mwlist:
            try:
                mwcls = load_object(clspath)
                mw = create_instance(
                    mwcls, settings,
                    crawler)  # Runs from_crawler or from_settings; the instance is created here
                middlewares.append(mw)  # Collect the middleware instance
                enabled.append(clspath)
            except NotConfigured as e:
                if e.args:
                    clsname = clspath.split('.')[-1]
                    logger.warning("Disabled %(clsname)s: %(eargs)s", {
                        'clsname': clsname,
                        'eargs': e.args[0]
                    },
                                   extra={'crawler': crawler})

        logger.info("Enabled %(componentname)ss:\n%(enabledlist)s", {
            'componentname': cls.component_name,
            'enabledlist': pprint.pformat(enabled)
        },
                    extra={'crawler': crawler})
        return cls(*middlewares)
Example 31
    def __init__(self, spidercls, settings=None, init_reactor: bool = False):
        if isinstance(spidercls, Spider):
            raise ValueError(
                'The spidercls argument must be a class, not an object')

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        self.spidercls = spidercls
        self.settings = settings.copy()
        self.spidercls.update_settings(self.settings)

        self.signals = SignalManager(self)

        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)

        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings:\n%(settings)s",
                    {'settings': pprint.pformat(d)})

        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)

        self.request_fingerprinter = create_instance(
            load_object(self.settings['REQUEST_FINGERPRINTER_CLASS']),
            settings=self.settings,
            crawler=self,
        )

        reactor_class = self.settings.get("TWISTED_REACTOR")
        if init_reactor:
            # this needs to be done after the spider settings are merged,
            # but before something imports twisted.internet.reactor
            if reactor_class:
                install_reactor(reactor_class,
                                self.settings["ASYNCIO_EVENT_LOOP"])
            else:
                from twisted.internet import reactor  # noqa: F401
            log_reactor_info()
        if reactor_class:
            verify_installed_reactor(reactor_class)

        self.extensions = ExtensionManager.from_crawler(self)

        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None
Example 32
 def test_download_broken_content_allow_data_loss_via_setting(self, url='broken'):
     crawler = get_crawler(settings_dict={'DOWNLOAD_FAIL_ON_DATALOSS': False})
     download_handler = create_instance(self.download_handler_cls, None, crawler)
     request = Request(self.getURL(url))
     d = download_handler.download_request(request, Spider('foo'))
     d.addCallback(lambda r: r.flags)
     d.addCallback(self.assertEqual, ['dataloss'])
     return d
Example 33
 def from_crawler(cls, crawler):
     settings = crawler.settings
     dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
     dupefilter = create_instance(dupefilter_cls, settings, crawler)
     pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
     dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
     mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
     logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS', settings.getbool('SCHEDULER_DEBUG'))
     return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
                stats=crawler.stats, pqclass=pqclass, dqclass=dqclass, mqclass=mqclass)
Example 34
 def _dq(self):
     """ Create a new priority queue instance, with disk storage """
     state = self._read_dqs_state(self.dqdir)
     q = create_instance(self.pqclass,
                         None,
                         self.crawler,
                         self._newdq,
                         state,
                         serialize=True)
     if q:
         logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
                     {'queuesize': len(q)}, extra={'spider': self.spider})
     return q
Example 35
    def from_crawler(cls, crawler):
        settings = crawler.settings
        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
        dupefilter = create_instance(dupefilter_cls, settings, crawler)
        pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
        if pqclass is PriorityQueue:
            warnings.warn("SCHEDULER_PRIORITY_QUEUE='queuelib.PriorityQueue'"
                          " is no longer supported because of API changes; "
                          "please use 'scrapy.pqueues.ScrapyPriorityQueue'",
                          ScrapyDeprecationWarning)
            from scrapy.pqueues import ScrapyPriorityQueue
            pqclass = ScrapyPriorityQueue

        dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
        mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
        logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
                                    settings.getbool('SCHEDULER_DEBUG'))
        return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
                   stats=crawler.stats, pqclass=pqclass, dqclass=dqclass,
                   mqclass=mqclass, crawler=crawler)
Example 36
    def from_settings(cls, settings, crawler=None):
        mwlist = cls._get_mwlist_from_settings(settings)
        middlewares = []
        enabled = []
        for clspath in mwlist:
            try:
                mwcls = load_object(clspath)
                mw = create_instance(mwcls, settings, crawler)
                middlewares.append(mw)
                enabled.append(clspath)
            except NotConfigured as e:
                if e.args:
                    clsname = clspath.split('.')[-1]
                    logger.warning("Disabled %(clsname)s: %(eargs)s",
                                   {'clsname': clsname, 'eargs': e.args[0]},
                                   extra={'crawler': crawler})

        logger.info("Enabled %(componentname)ss:\n%(enabledlist)s",
                    {'componentname': cls.component_name,
                     'enabledlist': pprint.pformat(enabled)},
                    extra={'crawler': crawler})
        return cls(*middlewares)
Example 37
 def _mq(self):
     """ Create a new priority queue instance, with in-memory storage """
     return create_instance(self.pqclass, None, self.crawler, self._newmq,
                            serialize=False)
Example 38
 def _get_instance(self, objcls, *args, **kwargs):
     return create_instance(
         objcls, self.settings, getattr(self, 'crawler', None),
         *args, **kwargs)
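Most of the components instantiated above opt into one of the alternative constructors that create_instance looks for. A minimal, hypothetical component illustrating the from_crawler hook these call sites expect (the class and attribute names are made up for the example):

class StatsAwareComponent:
    # Because from_crawler is defined, create_instance(StatsAwareComponent, ...)
    # called with a crawler will use it instead of from_settings or __init__.
    def __init__(self, stats, prefix="custom/"):
        self.stats = stats
        self.prefix = prefix

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        return cls(crawler.stats, *args, **kwargs)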