def __init__(self,
             url: str,
             method: str = 'GET',
             *,
             callback=None,
             encoding: Optional[str] = None,
             headers: dict = None,
             metadata: dict = None,
             request_config: dict = None,
             request_session=None,
             **kwargs):
    """
    Build a request from its target, HTTP verb and optional settings.

    :param url: stored unchanged as ``self.url``
    :param method: HTTP verb; upper-cased and checked against ``self.METHOD``
    :param callback: stored unchanged as ``self.callback``
    :param encoding: stored unchanged as ``self.encoding``
    :param headers: stored as ``self.headers``; a new empty dict is used
        when the argument is falsy
    :param metadata: stored as ``self.metadata``; a new empty dict is used
        when the argument is falsy
    :param request_config: stored as ``self.request_config``;
        ``self.REQUEST_CONFIG`` is used only when the argument is ``None``
    :param request_session: stored unchanged as ``self.request_session``
    :param kwargs: ``ssl`` (default ``False``) is popped into ``self.ssl``;
        whatever remains is kept as ``self.kwargs``
    :raises InvalidRequestMethod: if the upper-cased method is not in
        ``self.METHOD``
    """
    verb = method.upper()
    self.url, self.method = url, verb
    if verb not in self.METHOD:
        raise InvalidRequestMethod(
            f'{self.method} method is not supported')

    # Plain pass-through attributes.
    self.callback, self.encoding = callback, encoding
    self.request_session = request_session
    self.close_request_session = False

    # Falsy containers are replaced by fresh empty dicts.
    self.headers = headers if headers else {}
    self.metadata = metadata if metadata else {}

    # Only ``None`` triggers the class-level fallback; an explicitly passed
    # empty dict is kept as-is.
    if request_config is None:
        self.request_config = self.REQUEST_CONFIG
    else:
        self.request_config = request_config

    # ``ssl`` is taken out of kwargs before the remainder is stored.
    self.ssl = kwargs.pop('ssl', False)
    self.kwargs = kwargs

    self.logger = get_logger(name=self.name)
    self.retry_times = self.request_config.get('RETRIES', 3)
def __init__(self,
             url: str,
             method: str = 'GET',
             *,
             callback=None,
             encoding: Optional[str] = None,
             headers: Optional[dict] = None,
             metadata: Optional[dict] = None,
             request_config: Optional[dict] = None,
             request_session=None,
             res_type: str = 'text',
             **kwargs):
    """
    Initialization parameters

    :param url: stored unchanged as ``self.url``
    :param method: HTTP verb; upper-cased and checked against ``self.METHOD``
    :param callback: stored unchanged as ``self.callback``
    :param encoding: stored unchanged as ``self.encoding``
    :param headers: stored as ``self.headers``; a fresh empty dict is created
        per instance when omitted
    :param metadata: stored as ``self.metadata``; a fresh empty dict is
        created per instance when omitted
    :param request_config: stored as ``self.request_config``;
        ``self.REQUEST_CONFIG`` is used when omitted or empty
    :param request_session: stored unchanged as ``self.request_session``
    :param res_type: stored unchanged as ``self.res_type``
    :param kwargs: stored unchanged as ``self.kwargs``
    :raises ValueError: if the upper-cased method is not in ``self.METHOD``
    """
    self.url = url
    self.method = method.upper()
    if self.method not in self.METHOD:
        raise ValueError('%s method is not supported' % self.method)

    self.callback = callback
    self.encoding = encoding
    # The defaults used to be ``{}`` literals in the signature. Those are
    # created once at definition time, so every request built without
    # headers shared (and could mutate) the same dict. A ``None`` sentinel
    # gives each instance its own container.
    self.headers = headers if headers is not None else {}
    self.metadata = metadata if metadata is not None else {}
    self.request_session = request_session
    # A missing or empty config falls back to the class-level default.
    self.request_config = request_config or self.REQUEST_CONFIG
    self.res_type = res_type
    self.kwargs = kwargs

    self.close_request_session = False
    self.logger = get_logger(name=self.name)
    self.retry_times = self.request_config.get('RETRIES', 3)
def __init__(
    self,
    middleware: typing.Union[typing.Iterable, Middleware] = None,
    loop=None,
    is_async_start: bool = False,
    cancel_tasks: bool = True,
    **spider_kwargs,
):
    """
    Init spider object.
    :param middleware: a list of or a single Middleware; a list is merged
        into one object with ``+``, and an empty list or ``None`` yields a
        fresh ``Middleware()``
    :param loop: asyncio event loop; installed as the current loop via
        ``asyncio.set_event_loop``
    :param is_async_start: start spider by using async
    :param cancel_tasks: stored unchanged as ``self.cancel_tasks``
    :param spider_kwargs: extra keyword arguments, stored unchanged as
        ``self.spider_kwargs``
    :raises ValueError: if ``start_urls`` is empty or not an iterable
    """
    if not self.start_urls or not isinstance(
        self.start_urls, collectionsAbc.Iterable
    ):
        raise ValueError(
            "Ruia spider must have a param named start_urls, eg: start_urls = ['https://www.github.com']"
        )

    self.loop = loop
    asyncio.set_event_loop(self.loop)

    # Init object-level properties: replace falsy values with per-instance
    # empty dicts. (The original assigned ``request_config`` twice; the
    # second assignment was redundant and has been dropped.)
    self.callback_result_map = self.callback_result_map or {}
    self.request_config = self.request_config or {}
    self.headers = self.headers or {}
    self.metadata = self.metadata or {}
    self.aiohttp_kwargs = self.aiohttp_kwargs or {}
    self.spider_kwargs = spider_kwargs
    self.request_session = ClientSession()

    self.cancel_tasks = cancel_tasks
    self.is_async_start = is_async_start

    # set logger
    self.logger = get_logger(name=self.name)

    # customize middleware
    # ``reduce`` without an initial value raises TypeError on an empty
    # sequence, so an empty list takes the fallback branch and ends up
    # with a default ``Middleware()`` like ``None`` does.
    if isinstance(middleware, list) and middleware:
        self.middleware = reduce(lambda x, y: x + y, middleware)
    else:
        self.middleware = middleware or Middleware()

    # async queue as a producer
    self.request_queue = asyncio.Queue()

    # semaphore, used for concurrency control
    self.sem = asyncio.Semaphore(self.concurrency)
def __init__(self, middleware=None, loop=None, is_async_start=False):
    """
    Init spider object.

    :param middleware: a list of or a single Middleware; a list is merged
        into one object with ``+``, and an empty list or ``None`` yields a
        fresh ``Middleware()``
    :param loop: installed as the current loop via
        ``asyncio.set_event_loop`` and stored as ``self.loop``
    :param is_async_start: stored unchanged as ``self.is_async_start``
    """
    self.is_async_start = is_async_start
    self.logger = get_logger(name=self.name)
    self.loop = loop
    asyncio.set_event_loop(self.loop)

    # customize middleware
    # ``reduce`` without an initial value raises TypeError on an empty
    # sequence, so an empty list takes the fallback branch and ends up
    # with a default ``Middleware()`` like ``None`` does.
    if isinstance(middleware, list) and middleware:
        self.middleware = reduce(lambda x, y: x + y, middleware)
    else:
        self.middleware = middleware or Middleware()

    # async queue
    self.request_queue = asyncio.Queue()
    # semaphore; the limit is the ``concurrency`` attribute when the spider
    # defines one, otherwise 3
    self.sem = asyncio.Semaphore(getattr(self, 'concurrency', 3))
def __init__(self, middleware=None, loop=None):
    """
    Init spider object.

    :param middleware: a list of or a single Middleware; a list is merged
        into one object with ``+``, and an empty list or ``None`` yields a
        fresh ``Middleware()``
    :param loop: event loop to use; a new one is created when omitted. The
        chosen loop is installed via ``asyncio.set_event_loop``
    :raises ValueError: if ``start_urls`` is empty or not a list
    """
    if not self.start_urls or not isinstance(self.start_urls, list):
        raise ValueError(
            "Spider must have a param named start_urls, eg: start_urls = ['https://www.github.com']"
        )
    self.logger = get_logger(name=self.name)
    self.loop = loop or asyncio.new_event_loop()
    asyncio.set_event_loop(self.loop)

    # customize middleware
    # ``reduce`` without an initial value raises TypeError on an empty
    # sequence, so an empty list takes the fallback branch and ends up
    # with a default ``Middleware()`` like ``None`` does.
    if isinstance(middleware, list) and middleware:
        self.middleware = reduce(lambda x, y: x + y, middleware)
    else:
        self.middleware = middleware or Middleware()

    # async queue
    self.request_queue = asyncio.Queue()
    # semaphore; the limit is the ``concurrency`` attribute when the spider
    # defines one, otherwise 3
    self.sem = asyncio.Semaphore(getattr(self, 'concurrency', 3))
def __init__(
    self,
    url: str,
    method: str = "GET",
    *,
    callback=None,
    encoding: Optional[str] = None,
    headers: dict = None,
    metadata: dict = None,
    request_config: dict = None,
    request_session=None,
    **aiohttp_kwargs,
):
    """
    Initialization parameters
    :param url: Target url
    :param method: HTTP method; upper-cased and validated against
        ``self.METHOD``
    :param callback: Callback func
    :param encoding: Html encoding
    :param headers: Request headers; an empty dict is used when falsy
    :param metadata: Send the data to callback func; an empty dict is used
        when falsy
    :param request_config: Manage the target request;
        ``self.REQUEST_CONFIG`` is used when ``None``
    :param request_session: aiohttp.ClientSession
    :param aiohttp_kwargs: ``ssl`` (default ``False``) is popped into
        ``self.ssl``; the remainder is stored as ``self.aiohttp_kwargs``
    :raises InvalidRequestMethod: if the upper-cased method is not in
        ``self.METHOD``
    """
    self.url = url

    upper_method = method.upper()
    self.method = upper_method
    if upper_method not in self.METHOD:
        raise InvalidRequestMethod(
            f"{self.method} method is not supported")

    self.callback = callback
    self.encoding = encoding

    # Falsy header/metadata arguments become fresh empty dicts.
    if not headers:
        headers = {}
    if not metadata:
        metadata = {}
    self.headers = headers
    self.metadata = metadata

    self.request_session = request_session

    # Only ``None`` selects the class-level default configuration.
    use_default_config = request_config is None
    self.request_config = (
        self.REQUEST_CONFIG if use_default_config else request_config
    )

    # ``ssl`` is removed from the aiohttp kwargs before they are stored.
    ssl_option = aiohttp_kwargs.pop("ssl", False)
    self.ssl = ssl_option
    self.aiohttp_kwargs = aiohttp_kwargs

    self.close_request_session = False
    self.logger = get_logger(name=self.name)
    self.retry_times = self.request_config.get("RETRIES", 3)
"BAIDU_BUSINESSLICENSE_TYPE", "BAIDU_RECEIPT_TYPE", "BAIDU_TRAINTICKET_TYPE", "BAIDU_TAXIRECEIPT_TYPE", "BAIDU_FORM_TYPE", "BAIDU_TABLERECOGNIZE_TYPE", "BAIDU_TABLERESULTGET_TYPE", "BAIDU_VINCODE_TYPE", "BAIDU_QUOTAINVOICE_TYPE", "BAIDU_HOUSEHOLDREGISTER_TYPE", "BAIDU_HKMACAUEXITENTRYPERMIT_TYPE", "BAIDU_TAIWANEXITENTRYPERMIT_TYPE", "BAIDU_BIRTHCERTIFICATE_TYPE", "BAIDU_VEHICLEINVOICE_TYPE", "BAIDU_VEHICLECERTIFICATE_TYPE", "BAIDU_INVOICE_TYPE", "BAIDU_AIRTICKET_TYPE", "BAIDU_INSURANCEDOCUMENTS_TYPE", "BAIDU_VATINVOICE_TYPE", "BAIDU_QRCODE_TYPE", "BAIDU_NUMBERS_TYPE", "BAIDU_LOTTERY_TYPE", "BAIDU_PASSPORT_TYPE", "BAIDU_BUSINESSCARD_TYPE", "BAIDU_HANDWRITING_TYPE", "BAIDU_CUSTOM_TYPE", "BAIDU_GENERALBASIC_PAYLOAD", "BAIDU_GENERAL_PAYLOAD", "BAIDU_ACCURATE_PAYLOAD", "BAIDU_ACCURATEBASIC_PAYLOAD" ] logger = get_logger('Ocr') baidu_ocr_urls = {} baidu_ocr_payloads = {} baidu_ocr_types = {} def register_baidu_service(global_obj): def wrapper(func): dic = func() global_obj.update(dic) return wrapper try: # Adaptive interface changes. It's recommended to do this via installed aip
class SpiderHook:
    """
    SpiderHook is used for extend spider
    """

    # Maps the type name of a callback result to the name of the method on
    # this object that processes it. May be left as None.
    callback_result_map: dict = None

    logger = get_logger(name='Spider')

    async def _run_spider_hook(self, hook_func):
        """
        Call ``hook_func`` with this object and await the result if it is
        awaitable. Non-callable values are ignored. Any exception raised by
        the hook is logged and not propagated.
        :param hook_func: aws function
        :return:
        """
        if not callable(hook_func):
            return
        try:
            aws_hook_func = hook_func(self)
            if isawaitable(aws_hook_func):
                await aws_hook_func
        except Exception as e:
            # Not every callable has ``__name__`` (e.g. functools.partial
            # objects); fall back to repr so the handler itself cannot fail.
            hook_name = getattr(hook_func, '__name__', repr(hook_func))
            self.logger.error(f'<Hook {hook_name}: {e}')

    async def process_failed_response(self, request, response):
        """
        Corresponding processing for the failed response
        :param request: Request
        :param response: Response
        :return:
        """
        pass

    async def process_succeed_response(self, request, response):
        """
        Corresponding processing for the succeed response
        :param request: Request
        :param response: Response
        :return:
        """
        pass

    async def process_item(self, item):
        """
        Corresponding processing for the Item type
        :param item: Item
        :return:
        """
        pass

    async def process_callback_result(self, callback_result):
        """
        Dispatch a callback result to the method registered for its type name
        in ``callback_result_map``.
        :param callback_result: the object to process
        :return:
        :raises InvalidCallbackResult: if no method is registered for the
            result's type name, or the registered name is not an attribute
            of this object
        """
        callback_result_name = type(callback_result).__name__
        # The class-level default for the map is None; treat that as "no
        # handlers registered" so the caller gets InvalidCallbackResult
        # rather than an AttributeError on ``None.get``.
        result_map = self.callback_result_map or {}
        process_func_name = result_map.get(callback_result_name, '')
        process_func = getattr(self, process_func_name, None)
        if process_func is None:
            raise InvalidCallbackResult(
                f'<Parse invalid callback result type: {callback_result_name}>'
            )
        await process_func(callback_result)
def make(settings, images, out, header, type, pattern, pattern_clean, number,
         debug, coordinate, lag):
    """
    Run OCR over one image or a directory of images and report the results.

    :param settings: ``;``-separated ``key=value`` pairs. When given they are
        written to the ``[baiduocr]`` section of ``~/ruia_ocr.cfg``; when
        empty the credentials are read from that file instead
    :param images: path to an image or a directory, resolved with
        ``parse_path``
    :param out: optional output path; a name ending in ``txt`` or ``xlsx``
        selects the output format
    :param header: optional comma-separated column names for the xlsx output
    :param type: key looked up in ``type_dic`` to choose the OCR service type
    :param pattern: forwarded to ``adapter_item``
    :param pattern_clean: forwarded to ``adapter_item``
    :param number: ``-1`` to take every file of a directory, otherwise passed
        to ``get_file_paths`` as ``num``
    :param debug: when true, results go to the logger instead of
        ``click.echo`` and the root log level is not raised during the crawl
    :param coordinate: forwarded to ``adapter_spider`` as ``range``
    :param lag: used as the ``DELAY`` entry of the spider's request config
    """
    cp = ConfigParser()
    logger = get_logger('Results')
    cfg_path = os.path.join(os.path.expanduser('~'), 'ruia_ocr.cfg')

    if settings:
        cp.add_section('baiduocr')
        # The original called ``.split(';')`` a second time on the list
        # returned by the first split, which always raised AttributeError.
        for setting in settings.split(';'):
            if not setting.strip():
                # Tolerate a trailing or doubled ';'.
                continue
            # Split on the first '=' only so values may contain '='.
            key, v = setting.split('=', 1)
            cp.set('baiduocr', key.strip(), v.strip())
        # Parse first, then write: a malformed setting no longer leaves a
        # truncated config file behind, and the handle is always closed.
        with open(cfg_path, 'w+') as cfg_file:
            cp.write(cfg_file)
    else:
        cp.read(cfg_path)

    service = BaiduOcrService(cp.get('baiduocr', 'app_id'),
                              cp.get('baiduocr', 'api_key'),
                              cp.get('baiduocr', 'secret_key'),
                              type_dic.get(type),
                              seq='\n')
    imgs = parse_path(images)

    if not debug:
        # Silence everything below FATAL while the spider runs.
        logging.root.setLevel(logging.FATAL)

    if number == -1:
        urls = get_file_paths(imgs) if os.path.isdir(imgs) else [imgs]
    else:
        urls = get_file_paths(
            imgs, num=int(number)) if os.path.isdir(imgs) else [imgs]

    @adapter_item(pattern=pattern, pattern_clean=pattern_clean)
    class OcrItem(Item):
        pass

    @adapter_spider(range=coordinate)
    class OcrSpider(Spider):
        ocr_service = service
        ocr_options = {'region': ''}
        start_urls = urls
        concurrency = 1
        request_config = {'TIMEOUT': 40, 'DELAY': lag}
        # One row per processed image: [file name, field, field, ...]
        dump_res: List[List[str]] = []

        async def parse(self, response):
            item = await OcrItem.get_item(html=response.ocr_html)
            item.path = os.path.basename(response.metadata.get('image'))
            yield item

        async def process_item(self, item: Item):
            dumps = [getattr(item, attr) for attr in OcrItem.collect]
            dumps.insert(0, item.path)
            self.dump_res.append(dumps)

    spider = OcrSpider.start(middleware=ocr_middle)
    logging.root.setLevel(logging.DEBUG)

    for res in spider.dump_res:
        if debug:
            logger.info(res[0] + ': ' + ' || '.join(res[1:]))
        else:
            click.echo(res[0] + ': ' + ' || '.join(res[1:]))

    if out:
        parsed_out = parse_path(out)
        if out.endswith('txt'):
            with open(parsed_out, 'w+', encoding='utf8') as out_file:
                # Drop the leading file name; write one field per line.
                contents = [pic_contents[1:]
                            for pic_contents in spider.dump_res]
                for content in contents:
                    out_file.write('\n'.join(content) + '\n')
        elif out.endswith('xlsx'):
            wk = Workbook()
            sh = wk.active
            if header:
                spider.dump_res.insert(0, ['picture'] + header.split(','))
            for res in spider.dump_res:
                sh.append(res)
            wk.save(parsed_out)
        if debug:
            logger.info('save as %s' % os.path.abspath(out))
        else:
            click.echo('save as %s' % os.path.abspath(out))
from asyncio import Semaphore from inspect import isawaitable from urllib.parse import urlparse, quote, urlencode from ruia import Request from ruia.utils import get_logger from .exceptions import ServicePayloadsError, ImageTypeError from .configs import * service_type_dic = { BAIDU_ACCURATEBASIC_TYPE: BAIDU_ACCURATEBASIC_PAYLOAD, BAIDU_ACCURATE_TYPE: BAIDU_ACCURATE_PAYLOAD, BAIDU_GENERALBASIC_TYPE: BAIDU_GENERALBASIC_PAYLOAD, BAIDU_GENERAL_TYPE: BAIDU_GENERAL_PAYLOAD } logger = get_logger('Spider') try: # Adaptive interface changes. It's recommended to do this from aip.base import AipBase _access_token_url = AipBase._AipBase__accessTokenUrl except: # Fixed api implementation, not recommended _access_token_url = 'https://aip.baidubce.com/oauth/2.0/token' __all__ = ['BaiduOcrService', 'BaseOcrService'] def getAuthrHeaders(method, url,
#!/usr/bin/env python import os from importlib import util from ruia.utils import get_logger logger = get_logger('settings') class SettingsWrapper(object): """ SettingsWrapper returns a spider config """ def __init__(self, settings_name='settings.py'): self.my_settings = {} self.settings_name = settings_name self._load_settings() def __call__(self): return self.my_settings def settings(self): return self.my_settings def load_with_file(self, file_path): file_name = os.path.basename(file_path) if file_name[-3:] != '.py': logger.error("module name must be python file, such as : example.py")
'payloads': BAIDU_ACCURATEBASIC_PAYLOAD, 'fail_images': [], 'image_hook_kwargs': {}, 'region': '' } service_type_dic = { BAIDU_ACCURATEBASIC_TYPE: BAIDU_ACCURATEBASIC_PAYLOAD, BAIDU_ACCURATE_TYPE: BAIDU_ACCURATE_PAYLOAD, BAIDU_GENERALBASIC_TYPE: BAIDU_GENERALBASIC_PAYLOAD, BAIDU_GENERAL_TYPE: BAIDU_GENERAL_PAYLOAD } ocr_middle = Middleware() logger = get_logger('Spider') logger_ocr = get_logger('Ocr') @classmethod def extension_start(cls, middleware: typing.Union[typing.Iterable, Middleware] = None, loop=None, after_start=None, before_stop=None, close_event_loop=True, **kwargs) -> Spider: try: loop = loop or asyncio.new_event_loop() spider_ins = cls(middleware=middleware, loop=loop, **kwargs)