def request(
    self,
    url: str,
    method: str = "GET",
    *,
    callback=None,
    encoding: typing.Optional[str] = None,
    headers: typing.Optional[dict] = None,
    metadata: typing.Optional[dict] = None,
    request_config: typing.Optional[dict] = None,
    request_session=None,
    **aiohttp_kwargs,
):
    """Init a Request class for crawling html.

    Per-call values take precedence over the spider-level defaults
    (``self.headers`` / ``self.request_config`` / ``self.aiohttp_kwargs``),
    and the dicts passed by the caller are never mutated.

    :param url: target URL to crawl.
    :param method: HTTP method, defaults to ``GET``.
    :param callback: coroutine/function invoked with the response.
    :param encoding: optional response encoding override.
    :param headers: per-request HTTP headers (merged over ``self.headers``).
    :param metadata: arbitrary data carried along with the request.
    :param request_config: per-request config (merged over ``self.request_config``).
    :param request_session: session to reuse; falls back to ``self.request_session``.
    :param aiohttp_kwargs: extra kwargs forwarded to aiohttp
        (merged over ``self.aiohttp_kwargs``).
    :return: a configured ``Request`` instance.
    """
    # Start from the spider-level defaults, then overlay the per-call
    # values so explicit arguments win.  The previous code updated in the
    # opposite direction, silently clobbering per-call headers/config with
    # the spider defaults, and mutated the caller's dicts in place.
    merged_headers = {**self.headers, **(headers or {})}
    merged_config = {**self.request_config, **(request_config or {})}
    merged_aiohttp = {**self.aiohttp_kwargs, **aiohttp_kwargs}
    return Request(
        url=url,
        method=method,
        callback=callback,
        encoding=encoding,
        headers=merged_headers,
        metadata=metadata or {},
        request_config=merged_config,
        request_session=request_session or self.request_session,
        **merged_aiohttp,
    )
async def _get_html(cls, html, url, **kwargs):
    """Return an lxml element tree built from *html*, fetching *url* first
    when no html string is provided.

    :param html: raw HTML string (may be empty/None to force a fetch).
    :param url: URL to fetch when *html* is not supplied.
    :param kwargs: extra arguments forwarded to ``Request``.
    :raises ValueError: when neither *html* nor *url* is supplied.
    :return: the parsed ``etree`` root element.
    """
    # The old guard was `html is None and not url`, which let an
    # empty-string html with an empty url slip through and trigger a
    # bogus fetch of "".  Use a consistent falsy check instead.
    if not html and not url:
        raise ValueError("html(url or html_etree) is expected")
    if not html:
        request = Request(url, **kwargs)
        response = await request.fetch()
        html = response.html
    return etree.HTML(html)
async def _get_html(cls, html: str = '', url: str = '', **kwargs):
    """Return an lxml element tree built from *html*, fetching *url*
    (optionally rate-limited by a ``sem`` kwarg) when a url is given.

    :param html: raw HTML string.
    :param url: URL to fetch; takes precedence over *html* when both are set.
    :param kwargs: extra arguments forwarded to ``Request``; ``sem`` is
        popped out and used as a semaphore for ``fetch_callback``.
    :raises ValueError: when neither *html* nor *url* is supplied.
    :return: the parsed ``etree`` root element.
    """
    # BUG FIX: the original built `ValueError(...)` without `raise`, so
    # calling with neither html nor url silently returned None.
    if not html and not url:
        raise ValueError("html(url or html_etree) is expected")
    if url:
        sem = kwargs.pop('sem', None)
        request = Request(url, **kwargs)
        if sem:
            # fetch_callback returns (callback_result, response).
            _, response = await request.fetch_callback(sem=sem)
        else:
            response = await request.fetch()
        html = response.html
    return etree.HTML(html)
async def _get_html(cls, html: str = "", url: str = "", **kwargs): if html and url: raise ValueError("<Item: html *or* url expected, not both.") if html or url: if url: sem = kwargs.pop("sem", None) request = Request(url, **kwargs) if sem: _, response = await request.fetch_callback(sem=sem) else: response = await request.fetch() html = response.html return etree.HTML(html) else: raise ValueError("<Item: html(url or html_etree) is expected.")
async def _get_html(cls, html: str = "", url: str = "", **kwargs): if html and url: raise ValueError("<Item: html *or* url expected, not both.") if html or url: if url: async with aiohttp.ClientSession() as session: sem = kwargs.pop("sem", None) request = Request(url, request_session=session, **kwargs) if sem: _, response = await request.fetch_callback(sem=sem) else: response = await request.fetch() html = await response.text() return etree.HTML(html) else: raise ValueError("<Item: html(url or html_etree) is expected.")
async def start_master(self):
    """Enqueue one request per start url, spawn worker tasks to consume
    the queue, wait for it to drain, then stop the spider.

    Spider-level attributes (headers, metadata, request_config, ...)
    provide the per-request options; ``request_config`` is mandatory on
    the spider (no getattr default, so its absence raises AttributeError).
    """
    for url in self.start_urls:
        request_ins = Request(
            url=url,
            callback=self.parse,
            headers=getattr(self, 'headers', {}),
            metadata=getattr(self, 'metadata', {}),
            request_config=getattr(self, 'request_config'),
            request_session=getattr(self, 'request_session', None),
            res_type=getattr(self, 'res_type', 'text'),
            **getattr(self, 'kwargs', {}))
        self.request_queue.put_nowait(self.handle_request(request_ins))
    # Keep strong references to the worker tasks so they are not
    # garbage-collected while the queue is being consumed.
    workers = [
        asyncio.ensure_future(self.start_worker()) for _ in range(2)
    ]
    await self.request_queue.join()
    await self.stop(SIGINT)
def make_requests_from_url(self, url, **kwargs):
    """Construct a ``Request`` for *url*, taking each option from
    ``kwargs`` when present and falling back to the corresponding
    spider-level attribute otherwise.

    Note: ``request_config`` has no fallback default, so a spider
    without that attribute raises ``AttributeError`` here.
    """
    options = {
        'headers': default_pop(kwargs, 'headers', getattr(self, 'headers', {})),
        'metadata': default_pop(kwargs, 'metadata', getattr(self, 'metadata', {})),
        'request_config': default_pop(kwargs, 'request_config', getattr(self, 'request_config')),
        'request_session': default_pop(kwargs, 'request_session', getattr(self, 'request_session', None)),
        'res_type': default_pop(kwargs, 'res_type', getattr(self, 'res_type', 'text')),
    }
    # Spider-level extra kwargs are layered over whatever remains in the
    # per-call kwargs, matching the original precedence.
    kwargs.update(getattr(self, 'kwargs', {}))
    return Request(url=url, callback=self.parse, **options, **kwargs)