def test_spider_with_error_middleware():
    """Start a spider with middleware hooks that do not behave normally.

    The request hook is a plain (non-async) function declaring three
    parameters, and the response hook raises ``TypeError``. Presumably both
    are deliberate, to exercise the error paths of middleware handling.
    The test makes no assertions of its own; it only runs
    ``SpiderDemo.start`` with that middleware attached.
    """
    error_middleware = Middleware()

    # Registered as a request hook although it is not a coroutine function
    # and declares an extra ``response`` parameter.
    @error_middleware.request
    def error_request(spider_ins, request, response):
        pass

    # Response hook that always fails.
    @error_middleware.response
    async def error_response(spider_ins, request, response):
        raise TypeError("error")

    class SpiderDemo(Spider):
        start_urls = ["https://httpbin.org/get?p=0"]

        async def parse(self, response):
            pass

    SpiderDemo.start(middleware=error_middleware)
#!/usr/bin/env python

import asyncio
import os

from ruia import Item, Middleware, Response, Request, Spider, TextField

# Absolute path of the HTML fixture kept in the ``data`` directory that sits
# next to this file.
html_path = os.path.join(
    os.path.dirname(os.path.realpath(__file__)),
    "data",
    "for_spider_testing.html",
)

# The fixture is read once, at import time.
with open(html_path, mode="r", encoding="utf-8") as file:
    HTML = file.read()

middleware = Middleware()


async def retry_func(request):
    """Set the ``TIMEOUT`` entry of the request's config to 10."""
    request.request_config["TIMEOUT"] = 10


@middleware.request
async def print_on_request(spider_ins, request):
    """Replace the request headers with a fixed ``User-Agent``."""
    request.headers = {"User-Agent": "ruia ua"}


@middleware.response
async def print_on_response(spider_ins, request, response):
    """Check the response body type and the headers set by the request hook."""
    assert isinstance(response.html, str)
    assert request.headers == {"User-Agent": "ruia ua"}
# NOTE(review): the next two coroutines take ``self`` and use
# ``self.parse_item`` / ``self.request_config`` - presumably they are methods
# of a spider class whose header is not visible here; confirm against the
# enclosing class.
async def parse(self, res):
    """Yield one ``Request`` per page, each routed to ``parse_item``.

    The position of each page in the list is passed along as
    ``metadata={'index': ...}`` and the spider's own ``request_config`` is
    forwarded unchanged.
    """
    pages = ["http://www.httpbin.org/get", "http://www.httpbin.org/get"]
    for index, page in enumerate(pages):
        yield Request(
            page,
            callback=self.parse_item,
            metadata={"index": index},
            request_config=self.request_config,
        )


async def parse_item(self, res):
    """Return the ``html`` attribute of the response unchanged."""
    item_data = res.html
    return item_data


middleware = Middleware()
res_type_middleware = Middleware()


@middleware.request
async def print_on_request(request):
    """Replace the request headers with a fixed ``User-Agent``."""
    request.headers = {"User-Agent": "ruia ua"}


@middleware.response
async def print_on_response(request, response):
    """Assert that the response body is a ``dict``."""
    # ``isinstance`` instead of an exact ``type(...) ==`` comparison, so a
    # dict subclass is accepted as well.
    assert isinstance(response.html, dict)
#!/usr/bin/env python

from ruia import Middleware

middleware01 = Middleware()
middleware02 = Middleware()


@middleware01.request
async def print_on_request01(spider_ins, request):
    """Replace the request headers with a fixed ``User-Agent``."""
    request.headers = {"User-Agent": "ruia ua"}


@middleware01.response
async def print_on_response01(spider_ins, request, response):
    """Assert that the awaited response text is a ``str``."""
    body_text = await response.text()
    assert isinstance(body_text, str)


@middleware02.request
async def print_on_request02(spider_ins, request):
    """Request hook that does nothing."""


@middleware02.response
async def print_on_response02(spider_ins, request, response):
    """Response hook that does nothing."""


# Combine both middleware objects with ``Middleware``'s ``+`` operator.
all_middleware = middleware01 + middleware02
#!/usr/bin/env python
"""
 Created by howie.hu at 2018/10/17.
"""

from ruia import Middleware

from owllook.spiders.spider_tools import get_proxy_ip

owl_middleware = Middleware()


@owl_middleware.request
async def add_random_proxy(request):
    """Attach a proxy URL and the retry hook to the outgoing request.

    The proxy comes from ``update_proxy`` and is stored under
    ``request.kwargs['proxy']``; ``retry_func`` is registered under the
    ``RETRY_FUNC`` key of ``request.request_config``.
    """
    proxy = await update_proxy()
    request.kwargs.update({"proxy": proxy})
    request.request_config.update({"RETRY_FUNC": retry_func})


async def update_proxy():
    """Return the value from ``get_proxy_ip`` prefixed with ``http://``.

    Returns ``None`` when ``get_proxy_ip`` yields a falsy value.
    """
    raw_proxy = await get_proxy_ip()
    return "http://" + raw_proxy if raw_proxy else None


async def retry_func(request):
    """Store a freshly obtained proxy on the request and return the request."""
    request.kwargs.update({"proxy": await update_proxy()})
    return request
DEFAULT_OPTIONS = { 'payloads': BAIDU_ACCURATEBASIC_PAYLOAD, 'fail_images': [], 'image_hook_kwargs': {}, 'region': '' } service_type_dic = { BAIDU_ACCURATEBASIC_TYPE: BAIDU_ACCURATEBASIC_PAYLOAD, BAIDU_ACCURATE_TYPE: BAIDU_ACCURATE_PAYLOAD, BAIDU_GENERALBASIC_TYPE: BAIDU_GENERALBASIC_PAYLOAD, BAIDU_GENERAL_TYPE: BAIDU_GENERAL_PAYLOAD } ocr_middle = Middleware() logger = get_logger('Spider') logger_ocr = get_logger('Ocr') @classmethod def extension_start(cls, middleware: typing.Union[typing.Iterable, Middleware] = None, loop=None, after_start=None, before_stop=None, close_event_loop=True, **kwargs) -> Spider: try: