def extract_links(link: Link, html: typing.Union[str, bytes]) -> typing.List[str]:
    """Extract links from HTML document.

    Args:
        link: Original link of the HTML document.
        html: Content of the HTML document.

    Returns:
        List of extracted links.

    See Also:
        * :func:`darc.parse.extract_links`

    """
    soup = bs4.BeautifulSoup(html, 'html5lib')

    link_list = list()
    for child in soup.find_all(
            lambda tag: tag.has_attr('href') or tag.has_attr('src')):
        if (href := child.get('href', child.get('src'))) is None:
            continue
        temp_link = urljoin(link.url, href)
        link_list.append(temp_link)
    return link_list
def extract_links(link: 'darc_link.Link', html: 'Union[str, bytes]',
                  check: bool = CHECK) -> 'List[darc_link.Link]':
    """Extract links from HTML document.

    Args:
        link: Original link of the HTML document.
        html: Content of the HTML document.
        check: Whether to perform checks on the extracted links;
            defaults to :data:`~darc.const.CHECK`.

    Returns:
        List of extracted links.

    See Also:
        * :func:`darc.parse._check`
        * :func:`darc.parse._check_ng`

    """
    soup = bs4.BeautifulSoup(html, 'html5lib')

    temp_list = []
    for child in soup.find_all(
            lambda tag: tag.has_attr('href') or tag.has_attr('src')):
        if (href := child.get('href', child.get('src'))) is None:
            continue
        temp_link = parse_link(urljoin(link.url, href), backref=link)
        temp_list.append(temp_link)

    # optionally filter the extracted links before returning them
    if check:
        return _check(temp_list)
    return temp_list
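# Illustrative usage sketch (not part of the original module): how the newer
# extract_links() above might be driven. It assumes darc.link.parse_link() is
# importable; the URL and HTML literals below are placeholders only.
def _example_extract_links() -> None:
    from darc.link import parse_link  # pylint: disable=import-outside-toplevel

    page = parse_link('https://www.example.com/index.html')
    html_doc = '<a href="/about">About</a><img src="/static/logo.png">'

    # check=False returns the raw list without the extra filtering
    # described in the docstring (see darc.parse._check)
    for extracted in extract_links(page, html_doc, check=False):
        # each item is a Link whose URL was resolved against page.url,
        # e.g. https://www.example.com/about
        print(extracted.url)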
def fetch_hosts(link: 'darc_link.Link', force: bool = False) -> None:
    """Fetch ``hosts.txt``.

    Args:
        link: Link object to fetch for its ``hosts.txt``.
        force: Force refetch ``hosts.txt``.

    """
    if force:
        logger.warning('[HOSTS] Force refetch %s', link.url)

    hosts_path = None if force else have_hosts(link)
    if hosts_path is not None:
        logger.warning('[HOSTS] Cached %s', link.url)  # pylint: disable=no-member

        with open(hosts_path) as hosts_file:
            hosts_text = hosts_file.read()
    else:
        from darc.requests import i2p_session  # pylint: disable=import-outside-toplevel

        hosts_link = parse_link(urljoin(link.url, '/hosts.txt'), backref=link)
        logger.info('[HOSTS] Subscribing %s', hosts_link.url)

        with i2p_session() as session:
            try:
                response = session.get(hosts_link.url)
            except requests.RequestException:
                logger.pexc(message=f'[HOSTS] Failed on {hosts_link.url}')
                return

        if not response.ok:
            logger.error('[HOSTS] Failed on %s [%d]', hosts_link.url, response.status_code)
            return

        ct_type = get_content_type(response)
        if ct_type not in ['text/text', 'text/plain']:
            logger.error('[HOSTS] Unresolved content type on %s (%s)', hosts_link.url, ct_type)
            return

        hosts_text = response.text
        save_hosts(hosts_link, hosts_text)

        logger.info('[HOSTS] Subscribed %s', hosts_link.url)

    from darc.db import save_requests  # pylint: disable=import-outside-toplevel

    # add link to queue
    save_requests(read_hosts(link, hosts_text))
def fetch_hosts(link: Link):
    """Fetch ``hosts.txt``.

    Args:
        link: Link object to fetch for its ``hosts.txt``.

    """
    hosts_path = have_hosts(link)
    if hosts_path is not None:
        print(stem.util.term.format(f'[HOSTS] Cached {link.url}',
                                    stem.util.term.Color.YELLOW))  # pylint: disable=no-member

        with open(hosts_path) as hosts_file:
            hosts_text = hosts_file.read()
    else:
        from darc.requests import i2p_session  # pylint: disable=import-outside-toplevel

        hosts_link = parse_link(urljoin(link.url, '/hosts.txt'))
        print(f'[HOSTS] Subscribing {hosts_link.url}')

        with i2p_session() as session:
            try:
                response = session.get(hosts_link.url)
            except requests.RequestException as error:
                print(render_error(f'[HOSTS] Failed on {hosts_link.url} <{error}>',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                return

        if not response.ok:
            print(render_error(f'[HOSTS] Failed on {hosts_link.url} [{response.status_code}]',
                               stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
            return

        ct_type = get_content_type(response)
        if ct_type not in ['text/text', 'text/plain']:
            print(render_error(f'[HOSTS] Unresolved content type on {hosts_link.url} ({ct_type})',
                               stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
            return

        hosts_text = response.text
        save_hosts(hosts_link, hosts_text)

        print(f'[HOSTS] Subscribed {hosts_link.url}')

    from darc.db import save_requests  # pylint: disable=import-outside-toplevel

    # add link to queue
    save_requests(read_hosts(hosts_text))
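# Illustrative usage sketch (not part of the original module): fetch_hosts() is
# called with an I2P Link so that the eepsite's hosts.txt address book is fetched
# (or read from cache) and the discovered links are queued via darc.db.save_requests().
# It assumes darc.link.parse_link() is importable; the .i2p URL below is a placeholder.
def _example_fetch_hosts() -> None:
    from darc.link import parse_link  # pylint: disable=import-outside-toplevel

    seed = parse_link('http://example.i2p/')
    # works with either variant above; the older one simply has no ``force`` flag
    fetch_hosts(seed)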