def __init__(
    self,
    base_domain: str,
    domains: set[str],
    base_url: URL,
    directory: str,
    deep: int,
    save: bool,
    pool_count: int,
    disallow_ends: tuple,
):
    self.scheme = base_url.scheme
    self.base_domain = base_domain
    self.domains = domains
    self.queue = Queue()
    self.crawled = []
    self.working_links = set()
    self.disallow_ends = disallow_ends
    self.directory = directory + "/"
    self.save = save
    self.deep = deep
    self.count = 0
    self.pool = ThreadPool(pool_count)
    self.robots_parser = RobotsTxtParser(base_url.human_repr())
    try:
        self.robots_parser.get_robots_txt()
    except CrawlerError as e:
        logging.warning(f"-RobotsTxT {e.message}")
        self.robots_parser.death = True
    self.crawl_page(base_url.human_repr())
def test_human_repr_ipv6():
    url = URL("http://[::1]:8080/path")
    s = url.human_repr()
    url2 = URL(s)
    assert url2 == url
    assert url2.host == "::1"
    assert s == "http://[::1]:8080/path"
def _req(client: httpx.Client, url: URL, headers: Dict):
    logger.debug(f'request url is {url}')
    res = client.get(url.human_repr(), headers=headers)
    if res.status_code != 200:
        res.raise_for_status()
    return res
def file_from_url(path: str, url: yarl.URL, *, mode: int = 0o644) -> dict:
    return {
        "path": path,
        "contents": {"source": url.human_repr()},
        "mode": mode,
        "overwrite": True,
    }
def find_links(self, html: str, url: URL) -> set:
    result = set()
    host_name = str(url.origin())
    soup = BeautifulSoup(html, "html.parser")
    for obj in soup.find_all("a", href=True):
        link = obj["href"].lower()
        if link.endswith(self.disallow_ends) or link.startswith("#"):
            continue
        link = URL(link)
        if self.check_domains(link) and link.human_repr().startswith(url.scheme):
            result.add(self.get_normal_link(link))
        elif link.scheme == "":
            if not link.human_repr().startswith("/"):
                result.add(url.human_repr() + link.human_repr())
            else:
                result.add(f'{host_name}{link.human_repr()}')
    return result
def _download_img(self, url: URL) -> bytes:
    if not url.is_absolute():
        url = self.url.join(url)
    logger.info(f'downloading image from [{url}]')
    remote_img = httpx.get(url.human_repr(), headers=header(url))
    if remote_img.status_code != 200:
        remote_img.raise_for_status()
    return remote_img.content
def get_links(self, url: URL) -> set:
    site_info = self.get_content(url.human_repr())
    if site_info.status_code != 200:
        logging.warning(
            f"{site_info.status_code} status code in get_links"
            f"({url.human_repr()})"
        )
        return set()
    html = site_info.text
    if self.save:
        path = Path.cwd() / self.directory / url.host
        FileWorker.save_link(path, url, html)
    return self.find_links(html, url)
def _make_request(
    self,
    method: HttpMethod,
    url: URL,
    params: Optional[Dict[str, Any]] = None,
    json: Optional[Dict[str, Any]] = None,
    data: Optional[Union[str, bytes, Mapping[Any, Any]]] = None,
    auth: Optional[AuthBase] = None,
    raise_for_status: bool = True,
) -> requests.Response:
    response = requests.request(
        method=method.value,
        url=url.human_repr(),
        params=params,
        json=json,
        data=data,
        auth=auth,
        timeout=self._settings.timeout,
    )
    if raise_for_status:
        response.raise_for_status()
    return response
def test_human_repr():
    url = URL("http://вася:пароль@хост.домен:8080/путь/сюда?арг=вал#фраг")
    s = url.human_repr()
    assert URL(s) == url
    assert s == "http://вася:пароль@хост.домен:8080/путь/сюда?арг=вал#фраг"
def test_human_repr_default_port():
    url = URL("http://вася:пароль@хост.домен/путь/сюда?арг=вал#фраг")
    s = url.human_repr()
    assert s == "http://вася:пароль@хост.домен/путь/сюда?арг=вал#фраг"
def test_human_repr_defaults():
    url = URL("путь")
    s = url.human_repr()
    assert s == "путь"
def test_human_repr_defaults():
    url = URL('путь')
    s = url.human_repr()
    assert s == 'путь'
def test_human_repr():
    url = URL('http://вася:пароль@хост.домен:8080/путь/сюда?арг=вал#фраг')
    s = url.human_repr()
    assert s == 'http://вася:пароль@хост.домен:8080/путь/сюда?арг=вал#фраг'
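# The tests above turn on the difference between str(url) and url.human_repr():
# str() yields the encoded form (IDNA host, percent-escaped path/query), while
# human_repr() returns a readable, decoded string. A minimal sketch of that
# contrast; the printed shapes are illustrative, not taken from yarl's test suite.
from yarl import URL

url = URL("http://хост.домен/путь?арг=вал")
print(str(url))          # encoded: punycode host, %-escaped path and query
print(url.human_repr())  # decoded: "http://хост.домен/путь?арг=вал"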
class XMLTVListing(Hashable):  # pylint: disable=too-many-instance-attributes
    """Hold the data and functions required to obtain an XMLTVListing.

    Currently it only supports the XML TV Listings format which holds
    Channel and Programme scheduling information. See the dtd_ for details.

    .. _dtd: https://github.com/AlekSi/xmltv/blob/master/xmltv.dtd

    Note:
        The :class:`~.channel.Channel` class provides the **primary** interface
        to channel data through the :class:`~.epg.EPG` class. This class
        provides the means to download and parse XML data to do with channels,
        but more importantly, programming schedules.

        While it can be used stand-alone, it is most effective when injected
        into the :class:`~.epg.EPG` object using
        :meth:`.epg.EPG.apply_XMLTVListing()`.

    """

    def __init__(
            self,
            url: URL,
            path: PathLike = Path('.epg_data'),
    ) -> None:
        """Instantiate the object with the URL to fetch."""
        self._url: URL
        self._path: Path

        if not isinstance(url, URL):
            self._url = URL(url)
        else:
            self._url = url

        # Validate path
        if not isinstance(path, Path) and not isinstance(path, str):
            raise TypeError('path must be a string or Path object.')

        # coerce it to a Path if it's a string
        if isinstance(path, str):
            self._path = Path(path)
        else:
            self._path = path

        if not self._path.is_dir():
            self._path.mkdir()

        str_binary = str(self._url.human_repr()).encode('utf8')
        hashobj = hashlib.sha256(str_binary)
        self._hash = int.from_bytes(hashobj.digest(), 'big')
        self._filename = f'{hashobj.hexdigest()}.xml'
        self._full_path = self._path.joinpath(self._filename)
        self._last_modified: Optional[datetime] = None
        self._downloaded: bool = False
        self._downloading: bool = False

        LOGGER.debug(f'XMLTVListing initialised: {self}')

    def __hash__(self) -> int:
        """Define hash function for a Hashable object."""
        return self._hash

    def __eq__(self, other) -> bool:
        """Define equality test for a Hashable object."""
        # relying on lazy boolean evaluation here.
        return isinstance(other, XMLTVListing) and hash(other._url) == hash(
            self._url)  # pylint: disable=protected-access

    def __repr__(self):
        """Return a human-friendly representation of this object."""
        return f"<XMLTVListing: url='{self._url}', path='{self._path}', filename='{self._filename}'>"

    @property
    def last_modified(self):
        """Return the last modified date from the HTTP header of the last download.

        Returns the date and time when the data was last modified. Taken
        directly from the ``HTTP-Header``. If the header was not provided,
        it returns ``None``.

        Returns:
            datetime: A :py:class:`datetime.datetime` object or ``None``.

        """
        return self._last_modified

    @property
    def url(self) -> URL:
        """Return the url of this XMLTVListing.

        Returns:
            :py:class:`yarl.URL`

        """
        return self._url

    @property
    def downloaded(self) -> bool:
        """Return the status of XMLTV file download.

        Returns:
            bool: ``True`` if the file was downloaded successfully

        """
        return self._downloaded

    @property
    def downloading(self) -> bool:
        """Return the status of XMLTV file download.

        Returns:
            bool: ``True`` if the file is currently being downloaded

        """
        return self._downloading

    @property
    def file_path(self) -> Path:
        """Return the full file path of this listing's XML file.

        Returns:
            :py:class:`pathlib.Path`: A `Path` to the location of the XML file
                (whether it has yet been :meth:`fetch`'ed or not).

        """
        return self._full_path

    # TODO -- add error handling for
    #      -- HTTP headers missing
    #      -- timeouts
    #      -- badURL, etc, etc.
    # TODO -- add retry support

    async def fetch(self) -> None:
        """Fetch the XMLTVListing file.

        This async method will download the (XML) file specified by the URL
        passed at instantiation. If the server does not support streaming
        downloads the method will fall back to a more memory-intensive,
        vanilla HTTP download.

        Args:
            timeout (int): timeout in seconds for the HTTP session.
                Defaults to 60 seconds.
            range_size (int): the size, in bytes, of each chunk.
                Defaults to 256k (256*1024 bytes).

        Returns:
            None

        """
        LOGGER.debug(f'Fetch({self}) call started.')
        self._downloading = True
        newfile = self._full_path.with_suffix('.tmp')

        # We support gzip encoding as www.xmltv.co.uk sends stuff gzipped.
        # The zlib decompression object supports streaming decompression, which
        # is why we use it.
        # We should probably support more encoding algorithms at some point.
        decompression_obj = zlib.decompressobj(wbits=16 + zlib.MAX_WBITS)
        compressed_payload = True  # assume payload is compressed first.

        async def chunk_processor(decomp_obj, bytechunk):
            nonlocal compressed_payload
            async with await trio.open_file(newfile, 'ab') as output_file:
                if compressed_payload:
                    try:
                        LOGGER.debug(
                            f'Got zipped chunk. size = {len(bytechunk)}. Unzipping...'
                        )
                        await output_file.write(
                            decomp_obj.decompress(bytechunk))
                        LOGGER.debug(
                            f'Wrote unzipped chunk. size = {len(bytechunk)}')
                    except zlib.error:
                        await output_file.write(bytechunk)
                        LOGGER.debug(f'Wrote chunk. size = {len(bytechunk)}')
                        compressed_payload = False  # it wasn't compressed, so fall back
                else:
                    await output_file.write(bytechunk)
                    LOGGER.debug(f'Wrote chunk. size = {len(bytechunk)}')

        resp = await asks.get(
            str(self._url),
            headers={'Accept-Encoding': 'gzip'},
            # callback takes bytes only
            callback=partial(chunk_processor, decompression_obj))

        LOGGER.debug(f'Got headers = {resp.headers}')

        # h11 makes header keys lowercase so we can rely on this
        if resp.headers.get("last-modified"):
            self._last_modified = parse_http_date(
                resp.headers.get("last-modified"))
            LOGGER.debug(
                f'Content last modified on: {resp.headers.get("last-modified")}.'
            )

        shutil.move(newfile, self._full_path)
        self._downloading = False
        self._downloaded = True
        LOGGER.debug(f'Fetch finished on {self}')

    def parse(self) -> Iterator[Union[Channel, Programme]]:
        """Parse the XMLTVListing XML file and create an iterator over the data in it.

        Yields:
            (Any): Either a :class:`~pyskyq.channel.Channel` or a
                :class:`~pyskyq.programme.Programme` object is yielded.

        """
        if not self.downloaded and not self.downloading:
            raise OSError(
                'File not downloaded, or download is currently in flight.')
        else:
            LOGGER.debug(f'in parse_channels. file = {self.file_path}')
            for xml_type, xml_item in _xml_parse_and_remove(
                    self._full_path):  # type: ignore
                if xml_type == 'channel':
                    LOGGER.debug('yielding channel...')
                    yield channel_from_xmltv_list(xml_item)
                if xml_type == 'programme':
                    LOGGER.debug('yielding programme...')
                    yield programme_from_xmltv_list(xml_item)
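# A minimal stand-alone usage sketch for XMLTVListing, assuming it is run under
# trio (fetch() awaits trio/asks primitives). The feed URL below is a
# placeholder; per the class docstring, the usual path is to inject the listing
# into an EPG via EPG.apply_XMLTVListing() rather than drive it directly.
import trio
from yarl import URL

async def main():
    listing = XMLTVListing(URL('http://www.xmltv.co.uk/feed/0000'))  # hypothetical feed URL
    await listing.fetch()
    for item in listing.parse():  # yields Channel and Programme objects
        print(item)

trio.run(main)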
class SpiderCrawler:
    def __init__(self, start_url, database, depth):
        self.client = httpx.AsyncClient()
        self.url = URL(start_url)
        self.db = database
        self.depth = depth

    @timer
    async def get_data_from_url(self):
        calls = 0

        @not_retries
        async def load(url_: URL, level_):
            nonlocal calls
            calls += 1
            try:
                title, html_body, soup = await self._load_and_parse(url_)
            except TypeError:  # Can't download
                return
            asyncio.ensure_future(
                self.db.save_to_db(url_, title, html_body,
                                   parent=self.url.human_repr()))
            if level_ >= self.depth:
                return
            refs = self._ref_generator(soup.findAll('a'))
            todos = [load(ref, level_ + 1) for ref in refs]
            await asyncio.gather(*todos)

        try:
            await load(self.url, 0)
        finally:
            print("CALLS: ", calls)
            await self.client.aclose()
            await self.db.pg.pool.close()

    async def _load_and_parse(self, url_: URL):
        try:
            res = await self.client.get(str(url_))
        except httpx.HTTPError:
            return
        except ValueError:
            return
        soup = BeautifulSoup(res, 'lxml')
        try:
            title = soup.title.text
        except AttributeError:
            title = None
        html_body = res.text
        return title, html_body, soup

    def _ref_generator(self, bs_result_set):
        for ref in bs_result_set:
            try:
                href = URL(ref.attrs['href'])
                if href.query_string:  # Without QS
                    continue
                if not href.is_absolute():
                    href = self.url.join(href)
                if href != self.url:
                    yield href
            except KeyError:
                continue
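# A rough driver sketch for SpiderCrawler. It assumes the timer/not_retries
# decorators used above are importable, and stubs out the database object,
# which only needs the save_to_db() coroutine and pg.pool.close() used by the
# crawler. Names here (_StubDB, example.com) are illustrative placeholders.
import asyncio

class _StubDB:
    class _PG:
        class _Pool:
            async def close(self):
                pass
        pool = _Pool()
    pg = _PG()

    async def save_to_db(self, url, title, html, parent=None):
        print(f'saved {url} (parent={parent})')

async def main():
    crawler = SpiderCrawler('https://example.com', _StubDB(), depth=1)
    await crawler.get_data_from_url()

asyncio.run(main())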
def check_domains(self, link: URL) -> bool:
    for domain in self.domains:
        if domain in link.human_repr():
            return True
    return False
class Request(Task):
    """Request is a Task that executes the :meth:`fetch` method.

    Attributes:
        url:
        callback: should be a callable function or a list of functions.
            It will be passed to the corresponding response task.
        family: this family will be appended in families and also passed to
            the corresponding response task.
        status_allowed: a list of allowed status integers. Otherwise any
            response task with `status!=200` will fail and retry.
        meta: a dictionary to deliver information. It will be passed to
            :attr:`Response.meta`.
        request_config: a dictionary, will be passed as keyword arguments to
            :meth:`aiohttp.ClientSession.request`.

            acceptable keywords:

                params - Dictionary or bytes to be sent in the query string of the new request
                data - Dictionary, bytes, or file-like object to send in the body of the request
                json - Any json compatible python object
                headers - Dictionary of HTTP Headers to send with the request
                cookies - Dict object to send with the request
                allow_redirects - If set to False, do not follow redirects
                timeout - Optional ClientTimeout settings structure, 5min total timeout by default.
    """

    def __init__(
        self,
        url: _LooseURL,
        callback: _Functions = None,
        method: str = "GET",
        request_config: dict = None,
        status_allowed: list = None,
        encoding=None,
        links_to_abs=True,
        # Below are params for the parent class
        dont_filter: bool = False,
        ignore_exception: bool = False,
        meta: dict = None,
        priority: int = 0,
        family=None,
        family_for_response=None,
        recrawl=0,
        exetime=0,
        **kwargs,
    ):
        super().__init__(
            dont_filter=dont_filter,
            ignore_exception=ignore_exception,
            priority=priority,
            meta=meta,
            family=family,
            recrawl=recrawl,
            exetime=exetime,
            **kwargs,
        )

        self.url = URL(url)
        self.method = method
        self.status_allowed = status_allowed
        self.callbacks = []
        if callback:
            self.add_callback(callback)
        self.request_config = request_config if request_config else {}
        self.session = None
        self.client = None
        self.response: Response = None
        self.family_for_response = family_for_response
        self.encoding = encoding
        self.links_to_abs = links_to_abs

        self.inprogress = False  # has this request started execution; for counter

    @property
    def url_str(self):
        return self.url.human_repr()

    @property
    def url_str_canonicalized(self):
        query_str = "&".join(sorted(self.url.raw_query_string.split("&")))
        return (
            str(self.url)
            .replace(self.url.raw_query_string, query_str)
            .replace("#" + self.url.raw_fragment, "")
        )

    def add_callback(self, func: _Function):
        if isinstance(func, Iterable):
            for f in func:
                self.callbacks.append(f)
        else:
            self.callbacks.append(func)

    def reset_callback(self):
        self.callbacks = []

    def _fingerprint(self):
        """Fingerprint for a request task.

        .. todo:: write a better hashing function for request.
        """
        fp = hashlib.sha1()
        fp.update(self.url_str_canonicalized.encode())
        fp.update(self.method.encode())
        return fp.hexdigest()

    async def _execute(self, **kwargs):
        """Wraps :meth:`fetch`"""
        yield await self.fetch()

    async def send(self):
        """This method is used for independent usage of Request without Crawler."""
        resp = None
        async for task in self.execute():
            if isinstance(task, Response):
                resp = task
        return resp

    async def fetch(self):
        """Send a request and return the response as a task."""
        to_close = False

        if self.session is None:
            self.session = aiohttp.ClientSession()
            to_close = True
        try:
            async with self.session.request(
                self.method, self.url, **self.request_config
            ) as cresp:

                body = await cresp.read()
                encoding = self.encoding or cresp.get_encoding()
                self.response = Response(
                    url=cresp.url,
                    status=cresp.status,
                    cookies=cresp.cookies,
                    headers=cresp.headers.copy(),
                    body=body,
                    encoding=encoding,
                    links_to_abs=self.links_to_abs,
                    callbacks=self.callbacks.copy(),
                    request=self,
                    family=self.family_for_response,
                )
                rt = self.response
                logger.info(f"<{self.response.status}> {self.response.url_str}")
                return rt
        except Exception as e:
            raise e
        finally:
            if to_close:
                await self.session.close()

    def __str__(self):
        return f"<Task {self.primary_family}> ({self.url.human_repr()})"

    def __getstate__(self):
        state = super().__getstate__()
        state.pop("session", None)
        state.pop("client", None)
        if 'exceptions' in state:
            state['exceptions'] = []
        return state

    def __setstate__(self, state):
        super().__setstate__(state)
        self.__dict__["session"] = None
        self.__dict__["client"] = None