Esempio n. 1
0
    def request(
        self,
        url: str,
        method: str = "GET",
        *,
        callback=None,
        encoding: typing.Optional[str] = None,
        headers: typing.Optional[dict] = None,
        metadata: typing.Optional[dict] = None,
        request_config: typing.Optional[dict] = None,
        request_session=None,
        **aiohttp_kwargs,
    ):
        """Build a Request for ``url``, merged with this object's defaults.

        ``headers``, ``request_config`` and ``aiohttp_kwargs`` are combined
        with ``self.headers``, ``self.request_config`` and
        ``self.aiohttp_kwargs``. On a key clash the value stored on ``self``
        wins. The dictionaries passed in by the caller are never modified.

        Args:
            url: Target URL, forwarded to Request.
            method: HTTP method name, forwarded to Request.
            callback: Forwarded to Request unchanged.
            encoding: Forwarded to Request unchanged.
            headers: Per-request headers; ``None`` or empty means none.
            metadata: Forwarded to Request; ``None`` or empty becomes ``{}``.
            request_config: Per-request configuration overrides.
            request_session: Session to use; falls back to
                ``self.request_session`` when falsy.
            **aiohttp_kwargs: Extra keyword arguments forwarded to Request.

        Returns:
            The newly constructed Request instance.
        """
        # Merge into fresh dicts. Calling ``.update()`` on the argument itself
        # would write this object's defaults into the caller's dictionary.
        merged_headers = {**(headers or {}), **self.headers}
        merged_config = {**(request_config or {}), **self.request_config}
        merged_kwargs = {**aiohttp_kwargs, **self.aiohttp_kwargs}

        return Request(
            url=url,
            method=method,
            callback=callback,
            encoding=encoding,
            headers=merged_headers,
            metadata=metadata or {},
            request_config=merged_config,
            request_session=request_session or self.request_session,
            **merged_kwargs,
        )
Esempio n. 2
0
 async def _get_html(cls, html, url, **kwargs):
     """Return the parsed tree for ``html``, fetching ``url`` if needed.

     When ``html`` is truthy it is parsed directly and ``url`` is ignored.
     Otherwise ``url`` is requested through ``Request`` (with ``kwargs``
     forwarded to its constructor) and the ``html`` attribute of the
     response is parsed.

     Raises:
         ValueError: If neither ``html`` nor ``url`` is provided.
     """
     # Use the same truthiness test as the fetch branch below. With the old
     # ``html is None`` check, an empty string passed the guard and led to
     # a Request being built with an empty url.
     if not html and not url:
         raise ValueError("html(url or html_etree) is expected")
     if not html:
         request = Request(url, **kwargs)
         response = await request.fetch()
         html = response.html
     return etree.HTML(html)
Esempio n. 3
0
 async def _get_html(cls, html: str = '', url: str = '', **kwargs):
     """Return the parsed tree for ``html`` or for the page at ``url``.

     When ``url`` is given the page is requested and the ``html`` attribute
     of the response replaces any ``html`` argument. An optional ``sem``
     entry is removed from ``kwargs``; if it is truthy the request goes
     through ``fetch_callback(sem=sem)``, otherwise through ``fetch()``.
     The remaining ``kwargs`` are forwarded to the ``Request`` constructor.

     Raises:
         ValueError: If neither ``html`` nor ``url`` is provided.
     """
     if not (html or url):
         # Previously the exception was built but never raised, so the
         # method silently returned None here.
         raise ValueError("html(url or html_etree) is expected")

     if url:
         sem = kwargs.pop('sem', None)
         request = Request(url, **kwargs)
         if sem:
             _, response = await request.fetch_callback(sem=sem)
         else:
             response = await request.fetch()
         html = response.html
     return etree.HTML(html)
Esempio n. 4
0
 async def _get_html(cls, html: str = "", url: str = "", **kwargs):
     """Parse ``html`` directly, or fetch ``url`` and parse the result.

     Exactly one of ``html`` and ``url`` must be supplied. When fetching,
     an optional ``sem`` entry is removed from ``kwargs``; a truthy value
     routes the request through ``fetch_callback(sem=sem)``, otherwise
     ``fetch()`` is used. Remaining ``kwargs`` go to ``Request``.

     Raises:
         ValueError: If both inputs are given, or if neither is.
     """
     # Validate the inputs up front so the happy path stays flat.
     if html and url:
         raise ValueError("<Item: html *or* url expected, not both.")
     if not html and not url:
         raise ValueError("<Item: html(url or html_etree) is expected.")

     # Raw markup was supplied: nothing to download.
     if not url:
         return etree.HTML(html)

     semaphore = kwargs.pop("sem", None)
     req = Request(url, **kwargs)
     if semaphore:
         _, resp = await req.fetch_callback(sem=semaphore)
     else:
         resp = await req.fetch()
     return etree.HTML(resp.html)
Esempio n. 5
0
 async def _get_html(cls, html: str = "", url: str = "", **kwargs):
     """Parse ``html`` directly, or fetch ``url`` and parse the result.

     Exactly one of ``html`` and ``url`` must be supplied. When fetching,
     a fresh ``aiohttp.ClientSession`` is opened for the single request and
     passed to ``Request`` as ``request_session``. An optional ``sem`` entry
     is removed from ``kwargs``; a truthy value routes the request through
     ``fetch_callback(sem=sem)``, otherwise ``fetch()`` is used.

     Raises:
         ValueError: If both inputs are given, or if neither is.
     """
     if html and url:
         raise ValueError("<Item: html *or* url expected, not both.")
     if not html and not url:
         raise ValueError("<Item: html(url or html_etree) is expected.")

     if url:
         semaphore = kwargs.pop("sem", None)
         async with aiohttp.ClientSession() as client:
             req = Request(url, request_session=client, **kwargs)
             if semaphore:
                 _, resp = await req.fetch_callback(sem=semaphore)
             else:
                 resp = await req.fetch()
             # The body is read while the session is still open.
             html = await resp.text()

     return etree.HTML(html)
Esempio n. 6
0
 async def start_master(self):
     """Queue the start URLs, launch the workers and wait for completion.

     For every entry in ``self.start_urls`` a ``Request`` is built from the
     attributes found on ``self`` (``request_config`` is required; the
     others have fallbacks) and the result of ``self.handle_request`` for
     that request is placed on ``self.request_queue``. Two
     ``start_worker`` tasks are then scheduled, the queue is awaited until
     it is fully processed, and finally ``self.stop(SIGINT)`` is awaited.
     """
     for start_url in self.start_urls:
         default_headers = getattr(self, 'headers', {})
         default_metadata = getattr(self, 'metadata', {})
         # No fallback here: a missing attribute raises AttributeError.
         config = getattr(self, 'request_config')
         session = getattr(self, 'request_session', None)
         response_type = getattr(self, 'res_type', 'text')
         extra_kwargs = getattr(self, 'kwargs', {})

         initial_request = Request(
             url=start_url,
             callback=self.parse,
             headers=default_headers,
             metadata=default_metadata,
             request_config=config,
             request_session=session,
             res_type=response_type,
             **extra_kwargs)
         self.request_queue.put_nowait(self.handle_request(initial_request))

     # The list holds references to the scheduled worker tasks while this
     # coroutine waits on the queue.
     workers = []
     for _ in range(2):
         workers.append(asyncio.ensure_future(self.start_worker()))

     await self.request_queue.join()
     await self.stop(SIGINT)
Esempio n. 7
0
 def make_requests_from_url(self, url, **kwargs):
     """Build a ``Request`` for ``url`` with ``self.parse`` as callback.

     Each of ``headers``, ``metadata``, ``request_config``,
     ``request_session`` and ``res_type`` is resolved through
     ``default_pop`` on ``kwargs``, with the attribute of the same name on
     ``self`` passed as the default (``request_config`` must exist on
     ``self``; the others have fallbacks). Whatever is left in ``kwargs``
     is then updated with ``self.kwargs`` (if present) and forwarded to
     ``Request`` as extra keyword arguments.
     """
     # NOTE(review): default_pop is defined elsewhere; presumably it pops
     # the key from kwargs and returns the default when absent - confirm.
     request_options = {
         'headers': default_pop(
             kwargs, 'headers', getattr(self, 'headers', {})),
         'metadata': default_pop(
             kwargs, 'metadata', getattr(self, 'metadata', {})),
         'request_config': default_pop(
             kwargs, 'request_config', getattr(self, 'request_config')),
         'request_session': default_pop(
             kwargs, 'request_session',
             getattr(self, 'request_session', None)),
         'res_type': default_pop(
             kwargs, 'res_type', getattr(self, 'res_type', 'text')),
     }

     # Entries from self.kwargs overwrite same-named caller kwargs.
     kwargs.update(getattr(self, 'kwargs', {}))

     return Request(url=url, callback=self.parse, **request_options, **kwargs)