class SingleInheritor(TaskToSingleResultSpider):
    """Spider that consumes tasks from a RabbitMQ queue and publishes one
    result per task.

    Task messages are JSON bodies with a "url" key; the scraped
    ``<meta name="description">`` content is emitted as an item for the
    result queue.
    """

    name = "single_inheritor_example"

    custom_settings = {
        "ITEM_PIPELINES": {
            get_import_full_name(ItemProducerPipeline): 310,
        }
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Queue names derive from the spider name so several spiders can
        # run side by side without clashing.
        self.task_queue_name = f"{self.name}_task_queue"
        self.result_queue_name = f"{self.name}_result_queue"

    def next_request(self, _delivery_tag, msg_body):
        """Build a scrapy.Request from a raw task-queue message body."""
        data = json.loads(msg_body)
        return scrapy.Request(data["url"], callback=self.parse)

    @rmq_callback
    def parse(self, response):
        """Extract the page's meta description (None when absent)."""
        meta_description = response.xpath(
            '//meta[@name="description"]/@content').get(default=None)
        yield MetaDescriptionItem({"description": meta_description})

    @rmq_errback
    def _errback(self, failure):
        """Retry tunnel errors with a fresh copy of the request; log
        anything else."""
        if failure.check(TunnelError):
            self.logger.info("TunnelError. Copy request")
            yield failure.request.copy()
        else:
            self.logger.warning(f"IN ERRBACK: {repr(failure)}")

    @classmethod
    def update_settings(cls, settings):
        """Install the RMQ middlewares/extension, then layer the spider's
        custom_settings on top.

        BUG FIX: Scrapy calls ``cls.update_settings(settings)`` as a
        classmethod; the original override lacked ``@classmethod``, so the
        call would raise a TypeError (missing positional argument) at
        crawl start.
        """
        spider_middlewares = settings.getdict("SPIDER_MIDDLEWARES")
        spider_middlewares[get_import_full_name(
            TaskTossSpiderMiddleware)] = 140
        spider_middlewares[get_import_full_name(
            DeliveryTagSpiderMiddleware)] = 150

        spider_extensions = settings.getdict("EXTENSIONS")
        spider_extensions[get_import_full_name(RPCTaskConsumer)] = 20

        # custom_settings may override or extend the defaults above;
        # middleware/extension dicts are merged, everything else is set
        # verbatim.
        for custom_setting, value in (cls.custom_settings or {}).items():
            if custom_setting == "SPIDER_MIDDLEWARES":
                spider_middlewares = {**spider_middlewares, **value}
            elif custom_setting == "EXTENSIONS":
                spider_extensions = {**spider_extensions, **value}
            else:
                settings.set(custom_setting, value)
        settings.set("SPIDER_MIDDLEWARES", spider_middlewares)
        settings.set("EXTENSIONS", spider_extensions)
# --- Example no. 3 (scraped-sample separator; original vote count: 0) ---
    def update_settings(cls, settings):
        """Register the RMQ reader spider middleware (priority 1) in this
        spider's custom_settings, then delegate to the parent
        implementation to apply everything."""
        # Ensure custom_settings is a mutable dict we can extend.
        cls.custom_settings = cls.custom_settings or {}

        middleware_map: dict = cls.custom_settings.get(
            'SPIDER_MIDDLEWARES', {})
        middleware_map[get_import_full_name(
            rmq_reader_middleware.RmqReaderMiddleware)] = 1
        cls.custom_settings['SPIDER_MIDDLEWARES'] = middleware_map

        super().update_settings(settings)
def crawler():
    """Yield a CrawlerProcess built from the project settings plus
    test-friendly overrides (301-handling middleware, single concurrent
    request, debug logging to stderr)."""
    settings = get_project_settings()

    overrides = {
        "DOWNLOADER_MIDDLEWARES": {
            get_import_full_name(Response301DownloaderMiddleware): 1,
        },
        'CONCURRENT_REQUESTS': 1,
        'LOG_FILE': None,
        'LOG_LEVEL': 'DEBUG',
    }
    # Spider-level priority lets per-spider custom_settings still win.
    settings.setdict(overrides, priority='spider')

    yield CrawlerProcess(settings=settings)
# --- Example no. 5 (scraped-sample separator; original vote count: 0) ---
class MySpider(RmqSpider):
    """Minimal RMQ-driven spider: consumes BaseRmqMessage tasks from
    QUEUE_NAME and issues the same fixed test request for each one."""

    name = 'myspider'
    message_type: Type[BaseRmqMessage] = BaseRmqMessage
    task_queue_name: str = QUEUE_NAME

    custom_settings = {
        "DOWNLOADER_MIDDLEWARES": {
            get_import_full_name(CustomDownloaderMiddleware): 1,
        }
    }

    def parse(self, response, **kwargs):
        """Log that the callback fired; this spider yields no items."""
        self.logger.info("PARSE METHOD")
        # Keep parse a generator that produces nothing.
        yield from ()

    def next_request(self, message: BaseRmqMessage) -> Request:
        """Map every queue message to a fixed status-200 test URL."""
        return Request('https://httpstat.us/200', dont_filter=True)