Ejemplo n.º 1
0
    def extract_links(link: Link,
                      html: typing.Union[str, bytes]) -> typing.List[str]:
        """Extract links from HTML document.

        Every tag carrying an ``href`` or ``src`` attribute is visited and
        the attribute value is joined with ``link.url`` through ``urljoin``
        before being collected.

        Args:
            link: Original link of the HTML document; its ``url`` attribute
                is used as the base when joining extracted references.
            html: Content of the HTML document.

        Returns:
            List of extracted links (annotated as a list of strings).

        Note:
            The visible portion of this function ends right after the
            collection loop; no ``return`` statement is in view here.

        See Also:
            * :func:`darc.parse.extract_links`

        """
        # Parse the document with the ``html5lib`` parser.
        soup = bs4.BeautifulSoup(html, 'html5lib')

        link_list = list()
        # Select every tag that has an ``href`` or a ``src`` attribute.
        for child in soup.find_all(
                lambda tag: tag.has_attr('href') or tag.has_attr('src')):
            # Prefer ``href``; fall back to ``src`` when ``href`` is absent.
            if (href := child.get('href', child.get('src'))) is None:
                continue
            temp_link = urljoin(link.url, href)
            link_list.append(temp_link)
Ejemplo n.º 2
0
def extract_links(link: 'darc_link.Link',
                  html: 'Union[str, bytes]',
                  check: bool = CHECK) -> 'List[darc_link.Link]':
    """Extract links from HTML document.

    Every tag carrying an ``href`` or ``src`` attribute is visited; the
    attribute value is joined with ``link.url`` through ``urljoin`` and the
    result is passed to ``parse_link`` with ``link`` as its ``backref``.

    Args:
        link: Original link of the HTML document; its ``url`` attribute is
            used as the base when joining extracted references.
        html: Content of the HTML document.
        check: If perform checks on extracted links,
            default to :data:`~darc.const.CHECK`.

    Returns:
        List of extracted links.

    Note:
        The visible portion of this function ends right after the
        collection loop; neither the use of ``check`` nor a ``return``
        statement is in view here.

    See Also:
        * :func:`darc.parse._check`
        * :func:`darc.parse._check_ng`

    """
    # Parse the document with the ``html5lib`` parser.
    soup = bs4.BeautifulSoup(html, 'html5lib')

    temp_list = []
    # Select every tag that has an ``href`` or a ``src`` attribute.
    for child in soup.find_all(
            lambda tag: tag.has_attr('href') or tag.has_attr('src')):
        # Prefer ``href``; fall back to ``src`` when ``href`` is absent.
        if (href := child.get('href', child.get('src'))) is None:
            continue
        # Keep a back-reference to the page the link was found on.
        temp_link = parse_link(urljoin(link.url, href), backref=link)
        temp_list.append(temp_link)
Ejemplo n.º 3
0
def fetch_hosts(link: 'darc_link.Link', force: bool = False) -> None:
    """Fetch ``hosts.txt`` for a link and enqueue what it lists.

    The text is read from the local cache when ``have_hosts`` reports a
    path for ``link`` (and ``force`` is not set); otherwise ``/hosts.txt``
    is requested through an I2P session and stored with ``save_hosts``.
    Either way the text is handed to ``read_hosts`` and the result is
    passed to ``save_requests``.

    Args:
        link: Link object to fetch for its ``hosts.txt``.
        force: Skip the cache lookup and refetch ``hosts.txt``.

    Returns:
        ``None``. The function also returns early, after logging, when the
        request raises, the response is not OK, or the content type is not
        ``text/text`` / ``text/plain``.

    """
    cached_path = None
    if force:
        logger.warning('[HOSTS] Force refetch %s', link.url)
    else:
        cached_path = have_hosts(link)

    if cached_path is None:

        # imported lazily, only when a network fetch is actually required
        from darc.requests import i2p_session  # pylint: disable=import-outside-toplevel

        remote = parse_link(urljoin(link.url, '/hosts.txt'), backref=link)
        logger.info('[HOSTS] Subscribing %s', remote.url)

        with i2p_session() as session:
            try:
                reply = session.get(remote.url)
            except requests.RequestException:
                logger.pexc(message=f'[HOSTS] Failed on {remote.url}')
                return

        if not reply.ok:
            logger.error('[HOSTS] Failed on %s [%d]', remote.url, reply.status_code)
            return

        mime_type = get_content_type(reply)
        if mime_type not in ('text/text', 'text/plain'):
            logger.error('[HOSTS] Unresolved content type on %s (%s)', remote.url, mime_type)
            return

        content = reply.text
        save_hosts(remote, content)

        logger.info('[HOSTS] Subscribed %s', remote.url)

    else:

        logger.warning('[HOSTS] Cached %s', link.url)  # pylint: disable=no-member
        with open(cached_path) as cache_file:
            content = cache_file.read()

    from darc.db import save_requests  # pylint: disable=import-outside-toplevel

    # add link to queue
    parsed = read_hosts(link, content)
    save_requests(parsed)
Ejemplo n.º 4
0
def fetch_hosts(link: Link) -> None:
    """Fetch ``hosts.txt`` for a link and enqueue what it lists.

    The text is read from the local cache when ``have_hosts`` reports a
    path for ``link``; otherwise ``/hosts.txt`` is requested through an I2P
    session and stored with ``save_hosts``. Either way the text is handed
    to ``read_hosts`` and the result is passed to ``save_requests``.

    Args:
        link: Link object to fetch for its ``hosts.txt``.

    Returns:
        ``None``. The function also returns early, after printing an error
        to ``stderr``, when the request raises, the response is not OK, or
        the content type is not ``text/text`` / ``text/plain``.

    """
    hosts_path = have_hosts(link)
    if hosts_path is not None:

        print(
            stem.util.term.format(f'[HOSTS] Cached {link.url}',
                                  stem.util.term.Color.YELLOW))  # pylint: disable=no-member

        with open(hosts_path) as hosts_file:
            hosts_text = hosts_file.read()

    else:

        # imported lazily, only when a network fetch is actually required
        from darc.requests import i2p_session  # pylint: disable=import-outside-toplevel

        hosts_link = parse_link(urljoin(link.url, '/hosts.txt'))
        print(f'[HOSTS] Subscribing {hosts_link.url}')

        with i2p_session() as session:
            try:
                response = session.get(hosts_link.url)
            except requests.RequestException as error:
                print(render_error(
                    f'[HOSTS] Failed on {hosts_link.url} <{error}>',
                    stem.util.term.Color.RED),
                      file=sys.stderr)  # pylint: disable=no-member
                return

        if not response.ok:
            print(render_error(
                f'[HOSTS] Failed on {hosts_link.url} [{response.status_code}]',
                stem.util.term.Color.RED),
                  file=sys.stderr)  # pylint: disable=no-member
            return

        ct_type = get_content_type(response)
        if ct_type not in ['text/text', 'text/plain']:
            # the message previously lacked its closing parenthesis
            print(render_error(
                f'[HOSTS] Unresolved content type on {hosts_link.url} ({ct_type})',
                stem.util.term.Color.RED),
                  file=sys.stderr)  # pylint: disable=no-member
            return

        hosts_text = response.text
        save_hosts(hosts_link, hosts_text)

        print(f'[HOSTS] Subscribed {hosts_link.url}')

    from darc.db import save_requests  # pylint: disable=import-outside-toplevel

    # add link to queue
    save_requests(read_hosts(hosts_text))