Example #1
def zeronet_bootstrap() -> None:
    """Bootstrap wrapper for ZeroNet.

    The function will bootstrap the ZeroNet proxy. It will retry for
    :data:`~darc.proxy.zeronet.ZERONET_RETRY` times in case of failure.

    Also, it will **NOT** re-bootstrap the proxy as is guaranteed by
    :data:`~darc.proxy.zeronet._ZERONET_BS_FLAG`.

    Warns:
        ZeroNetBootstrapFailed: If failed to bootstrap ZeroNet proxy.

    Raises:
        :exc:`UnsupportedPlatform`: If the system is not supported, i.e. not macOS or Linux.

    See Also:
        * :func:`darc.proxy.zeronet._zeronet_bootstrap`
        * :data:`darc.proxy.zeronet.ZERONET_RETRY`
        * :data:`darc.proxy.zeronet._ZERONET_BS_FLAG`

    """
    if _unsupported:
        raise UnsupportedPlatform(f'unsupported system: {platform.system()}')

    # don't re-bootstrap
    if _ZERONET_BS_FLAG:
        return

    logger.info('-*- ZeroNet Bootstrap -*-')
    for _ in range(ZERONET_RETRY+1):
        try:
            _zeronet_bootstrap()
            break
        except Exception:
            if DEBUG:
                logger.ptb('[Error bootstrapping ZeroNet proxy]')
            logger.pexc(LOG_WARNING, category=ZeroNetBootstrapFailed, line='zeronet_bootstrap()')
    logger.pline(LOG_INFO, logger.horizon)
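
A note on the guard: the re-bootstrap protection only works if the internal helper flips the module-level flag after a successful run. A minimal sketch of that contract, assuming :func:`darc.proxy.zeronet._zeronet_bootstrap` behaves like its Tor/Freenet siblings (launch details omitted, names per the docstring above):

_ZERONET_BS_FLAG = False

def _zeronet_bootstrap() -> None:
    """Hypothetical sketch of the internal bootstrap helper."""
    global _ZERONET_BS_FLAG  # pylint: disable=global-statement
    ...  # launch the ZeroNet proxy process and wait until it is reachable
    _ZERONET_BS_FLAG = True  # set only on success, so the wrapper skips future calls
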
Example #2
def have_hostname(link: 'Link') -> 'Tuple[bool, bool]':
    """Check if current link is a new host.

    Args:
        link: Link to check against.

    Returns:
        A tuple of two :obj:`bool` values indicating whether the
        link's host is already known and whether it needs a force
        refetch, respectively.

    See Also:
        * :func:`darc.db._have_hostname_db`
        * :func:`darc.db._have_hostname_redis`

    """
    if FLAG_DB:
        with database.connection_context():
            try:
                return _have_hostname_db(link)
            except Exception:
                logger.pexc(LOG_WARNING,
                            category=DatabaseOperaionFailed,
                            line=f'_have_hostname_db({link.url})')
                return False, False
    return _have_hostname_redis(link)
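
The two flags are normally unpacked together; the crawler in Example #18 below consumes them as follows:

# From the crawler flow (cf. Example #18): a host is handled as new
# when it is not yet known, or when a force refetch was requested.
flag_have, force_fetch = have_hostname(link)
if not flag_have or force_fetch:
    fetch_sitemap(link, force=force_fetch)
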
Example #3
def _db_operation(operation: 'Callable[..., _T]', *args: 'Any',
                  **kwargs: 'Any') -> '_T':
    """Retry operation on database.

    Args:
        operation: Callable / method to perform.
        *args: Arbitrary positional arguments.

    Keyword Args:
        **kwargs: Arbitrary keyword arguments.

    Returns:
        Any return value from a successful
        ``operation`` call.

    """
    _arg_msg = None

    while True:
        try:
            value = operation(*args, **kwargs)
        except peewee.PeeweeException:
            if _arg_msg is None:
                _arg_msg = _gen_arg_msg(*args, **kwargs)

            model = cast('MethodType', operation).__self__.__class__.__name__
            logger.pexc(LOG_WARNING,
                        category=DatabaseOperaionFailed,
                        line=f'{model}.{operation.__name__}({_arg_msg})')

            if RETRY_INTERVAL is not None:
                time.sleep(RETRY_INTERVAL)
            continue
        break
    return value
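
Example #17 below routes schema creation through this wrapper; a minimal usage sketch (models as in that example):

with DB:
    _db_operation(DB.create_tables, [
        HostnameQueueModel,
        RequestsQueueModel,
        SeleniumQueueModel,
    ])
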
Example #4
def tor_bootstrap() -> None:
    """Bootstrap wrapper for Tor.

    The function will bootstrap the Tor proxy. It will retry for
    :data:`~darc.proxy.tor.TOR_RETRY` times in case of failure.

    Also, it will **NOT** re-bootstrap the proxy as is guaranteed by
    :data:`~darc.proxy.tor._TOR_BS_FLAG`.

    Warns:
        TorBootstrapFailed: If failed to bootstrap Tor proxy.

    See Also:
        * :func:`darc.proxy.tor._tor_bootstrap`
        * :data:`darc.proxy.tor.TOR_RETRY`
        * :data:`darc.proxy.tor._TOR_BS_FLAG`

    """
    # don't re-bootstrap
    if _TOR_BS_FLAG:
        return

    logger.info('-*- Tor Bootstrap -*-')
    for _ in range(TOR_RETRY + 1):
        try:
            _tor_bootstrap()
            break
        except Exception:
            if DEBUG:
                logger.ptb('[Error bootstrapping Tor proxy]')
            logger.pexc(LOG_WARNING,
                        category=TorBootstrapFailed,
                        line='tor_bootstrap()')
    logger.pline(LOG_INFO, logger.horizon)
Example #5
def _check_ng(temp_list: 'List[darc_link.Link]') -> 'List[darc_link.Link]':
    """Check content type of links through ``HEAD`` requests.

    Args:
        temp_list: List of links to be checked.

    Returns:
        List of links that match the requirements.

    See Also:
        * :func:`darc.parse.match_host`
        * :func:`darc.parse.match_proxy`
        * :func:`darc.parse.match_mime`

    """
    from darc.crawl import request_session  # pylint: disable=import-outside-toplevel

    session_map = {}  # type: Dict[str, FuturesSession]
    result_list = []
    for link in temp_list:
        if match_host(link.host):
            continue
        if match_proxy(link.proxy):
            continue

        # get session
        session = session_map.get(link.proxy)
        if session is None:
            session = request_session(link, futures=True)
            session_map[link.proxy] = session

        result = session.head(link.url, allow_redirects=True)
        result_list.append(result)

        logger.info('[HEAD] Checking content type from %s', link.url)

    link_list = []
    for result in concurrent.futures.as_completed(result_list):  # type: ignore
        try:
            response = result.result()  # type: Response
        except requests.RequestException as error:
            if error.response is None:
                logger.pexc(message='[HEAD] Checking failed')
                continue
            logger.pexc(message=f'[HEAD] Failed on {error.response.url}')
            link_list.append(parse_link(error.response.url))  # keep Link objects, matching the return type
            continue
        ct_type = get_content_type(response)

        logger.info('[HEAD] Checked content type from %s (%s)', response.url,
                    ct_type)

        if match_mime(ct_type):
            continue
        temp_link = parse_link(response.request.url)  # type: ignore
        link_list.append(temp_link)
    return link_list
Example #6
def fetch_hosts(link: 'darc_link.Link', force: bool = False) -> None:
    """Fetch ``hosts.txt``.

    Args:
        link: Link object to fetch for its ``hosts.txt``.
        force: Force refetch ``hosts.txt``.

    """
    if force:
        logger.warning('[HOSTS] Force refetch %s', link.url)

    hosts_path = None if force else have_hosts(link)
    if hosts_path is not None:

        logger.warning('[HOSTS] Cached %s', link.url)  # pylint: disable=no-member
        with open(hosts_path) as hosts_file:
            hosts_text = hosts_file.read()

    else:

        from darc.requests import i2p_session  # pylint: disable=import-outside-toplevel

        hosts_link = parse_link(urljoin(link.url, '/hosts.txt'), backref=link)
        logger.info('[HOSTS] Subscribing %s', hosts_link.url)

        with i2p_session() as session:
            try:
                response = session.get(hosts_link.url)
            except requests.RequestException:
                logger.pexc(message=f'[HOSTS] Failed on {hosts_link.url}')
                return

        if not response.ok:
            logger.error('[HOSTS] Failed on %s [%d]', hosts_link.url, response.status_code)
            return

        ct_type = get_content_type(response)
        if ct_type not in ['text/text', 'text/plain']:
            logger.error('[HOSTS] Unresolved content type on %s (%s)', hosts_link.url, ct_type)
            return

        hosts_text = response.text
        save_hosts(hosts_link, hosts_text)

        logger.info('[HOSTS] Subscribed %s', hosts_link.url)

    from darc.db import save_requests  # pylint: disable=import-outside-toplevel

    # add link to queue
    save_requests(read_hosts(link, hosts_text))
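
For context, a comment-only sketch of what is being fetched here; the exact parsing in read_hosts() is not shown above and the sample values are made up:

# An I2P hosts.txt is a plain-text address book with one
# "name=destination" pair per line, e.g.:
#
#     example.i2p=AbCd...base64-destination...AAAA
#
# read_hosts() is presumed to turn such entries into Link objects,
# which save_requests() above then adds to the queue.
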
Example #7
def renew_tor_session() -> None:
    """Renew Tor session."""
    global _TOR_CTRL  # pylint: disable=global-statement

    try:
        # Tor controller process
        if _TOR_CTRL is None:
            _TOR_CTRL = stem.control.Controller.from_port(port=int(TOR_CTRL))
            _TOR_CTRL.authenticate(TOR_PASS)
        _TOR_CTRL.signal(stem.Signal.NEWNYM)  # pylint: disable=no-member
    except Exception:
        logger.pexc(LOG_WARNING, category=TorRenewFailed,
                    line='_TOR_CTRL = stem.control.Controller.from_port(port=int(TOR_CTRL))')
Example #8
def save_requests(entries: 'Union[Link, List[Link]]',
                  single: bool = False,
                  score: 'Optional[float]' = None,
                  nx: bool = False,
                  xx: bool = False) -> None:
    """Save link to the :mod:`requests` database.

    The function updates the ``queue_requests`` database.

    Args:
        entries: Links to be added to the :mod:`requests` database.
            It can be either a :obj:`list` of links, or a single
            link string (if ``single`` set as :data:`True`).
        single: Indicate if ``entries`` is a :obj:`list` of links
            or a single link string.
        score: Score for the Redis sorted set.
        nx: Only create new elements; do not update
            scores of elements that already exist.
        xx: Only update scores of elements that
            already exist; do not add new elements.

    Notes:
        The ``entries`` will be dumped through :mod:`pickle` so that
        :mod:`darc` does not need to parse them again.

    When ``entries`` is a list of :class:`~darc.link.Link` instances,
    the function performs *bulk* updates to ease memory consumption.
    The *bulk* size is defined by :data:`~darc.db.BULK_SIZE`.

    See Also:
        * :func:`darc.db._save_requests_db`
        * :func:`darc.db._save_requests_redis`

    """
    if FLAG_DB:
        with database.connection_context():
            try:
                return _save_requests_db(entries, single, score, nx, xx)  # type: ignore[call-overload]
            except Exception:
                _arg_msg = _gen_arg_msg(entries, single, score, nx, xx)
                logger.pexc(LOG_WARNING,
                            category=DatabaseOperaionFailed,
                            line=f'_save_requests_db({_arg_msg})')
                return None
    return _save_requests_redis(entries, single, score, nx, xx)
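
Typical call patterns, as they appear elsewhere in this collection (cf. Examples #17 and #18):

save_requests(link, single=True)            # re-queue a single failed link
save_requests(link_pool, score=0, nx=True)  # seed new links without touching existing scores
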
Example #9
def process_crawler() -> None:
    """A worker to run the :func:`~darc.crawl.crawler` process.

    Warns:
        HookExecutionFailed: When hook function raises an error.

    """
    logger.info('[CRAWLER] Starting mainloop...')
    logger.debug('[CRAWLER] Starting first round...')

    # start mainloop
    while True:
        # requests crawler
        link_pool = load_requests()
        if not link_pool:
            if DARC_WAIT is not None:
                time.sleep(DARC_WAIT)
            continue

        for link in link_pool:
            crawler(link)

        time2break = False
        for hook in _HOOK_REGISTRY:
            try:
                hook('crawler', link_pool)
            except WorkerBreak:
                time2break = True
            except Exception:
                logger.pexc(LOG_WARNING, '[CRAWLER] hook execution failed',
                            HookExecutionFailed)

        # marked to break by hook function
        if time2break:
            break

        # quit in reboot mode
        if REBOOT:
            break

        # renew Tor session
        renew_tor_session()
        logger.debug('[CRAWLER] Starting next round...')

    logger.info('[CRAWLER] Stopping mainloop...')
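
A hook receives the node type and the processed link pool, and may raise WorkerBreak to end the mainloop. A hypothetical hook matching the call signature above (how it gets registered into _HOOK_REGISTRY is assumed, not shown):

_ROUND = 0

def stop_after_ten_rounds(node_type: str, link_pool: 'List[Link]') -> None:
    """Hypothetical hook: leave the mainloop after ten crawler rounds."""
    global _ROUND  # pylint: disable=global-statement
    _ROUND += 1
    if node_type == 'crawler' and _ROUND >= 10:
        raise WorkerBreak
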
Example #10
def _load_selenium_redis() -> 'List[Link]':
    """Load link from the :mod:`selenium` database.

    The function reads the ``queue_selenium`` database.

    Returns:
        List of loaded links from the :mod:`selenium` database.

    Note:
        At runtime, the function will load at most :data:`~darc.db.MAX_POOL`
        links to limit the memory usage.

    """
    now = time.time()
    if TIME_CACHE is None:
        sec_delta = 0  # type: float
        max_score = now
    else:
        sec_delta = TIME_CACHE.total_seconds()
        max_score = now - sec_delta

    try:
        with _redis_get_lock('queue_selenium'):
            temp_pool = [
                _redis_command('get', name)
                for name in _redis_command('zrangebyscore', 'queue_selenium',
                                           min=0, max=max_score,
                                           start=0, num=MAX_POOL)
            ]  # type: List[bytes]
            link_pool = [pickle.loads(link) for link in filter(None, temp_pool)]  # nosec: B301
            if TIME_CACHE is not None:
                new_score = now + sec_delta
                _save_selenium_redis(link_pool,
                                     score=new_score)  # force update records
    except pottery_exceptions.PotteryError:
        logger.pexc(
            LOG_WARNING,
            f'[SELENIUM] Failed to acquire Redis lock after {LOCK_TIMEOUT} second(s)',
            LockWarning, "_redis_get_lock('queue_selenium')")
        link_pool = []
    return link_pool
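
The sorted-set score is a UNIX timestamp, which yields a simple visibility-timeout scheme; a worked illustration, assuming TIME_CACHE is one hour:

# Only members scored at least one hour in the past are picked up,
# and picked members are re-scored one hour into the future so that
# concurrent workers skip them in the meantime.
now = time.time()
max_score = now - 3600   # select: score <= now - TIME_CACHE
new_score = now + 3600   # defer:  re-queued at now + TIME_CACHE
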
Example #11
    def crawler(
        timestamp: 'datetime', session: 'Session',
        link: 'darc_link.Link') -> 'NoReturn':  # pylint: disable=unused-argument
        """Crawler hook for data URIs.

        Args:
            timestamp: Timestamp of the worker node reference.
            session (:class:`requests.Session`): Session object with proxy settings.
            link: Link object to be crawled.

        Raises:
            LinkNoReturn: This link has no return response.

        """
        try:
            save_data(link)
        except ValueError:
            logger.pexc(
                message=f'[REQUESTS] Failed to save data URI from {link.url}')
        raise LinkNoReturn(link)
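
On the caller's side, LinkNoReturn is the signal to stop processing the link; the crawler in Example #18 handles it like this:

try:
    response = crawler_hook(timestamp, session, link)
except LinkNoReturn as error:
    if error.drop:
        drop_requests(link)
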
Example #12
def _redis_command(command: str, *args: 'Any', **kwargs: 'Any') -> 'Any':
    """Wrapper function for Redis command.

    Args:
        command: Command name.
        *args: Arbitrary arguments for the Redis command.

    Keyword Args:
        **kwargs: Arbitrary keyword arguments for the Redis command.

    Returns:
        Values returned from the Redis command.

    Warns:
        RedisCommandFailed: Warns at each round when the command failed.

    Note:
        Between each retry, the function sleeps for :data:`~darc.db.RETRY_INTERVAL`
        second(s) if such value is **NOT** :data:`None`.

    """
    _arg_msg = None

    method = getattr(redis, command)
    while True:
        try:
            value = method(*args, **kwargs)
        except (redis_lib.exceptions.RedisError,
                pottery_exceptions.PotteryError):
            if _arg_msg is None:
                _arg_msg = _gen_arg_msg(*args, **kwargs)

            logger.pexc(LOG_WARNING,
                        category=RedisCommandFailed,
                        line=f'value = redis.{command}({_arg_msg})')

            if RETRY_INTERVAL is not None:
                time.sleep(RETRY_INTERVAL)
            continue
        break
    return value
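
Any redis-py command can be routed through the wrapper by name; usage as seen in Examples #10 and #17:

_redis_command('set', 'darc', pid)
names = _redis_command('zrangebyscore', 'queue_selenium',
                       min=0, max=time.time(), start=0, num=MAX_POOL)
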
Example #13
def drop_selenium(link: 'Link') -> None:  # pylint: disable=inconsistent-return-statements
    """Remove link from the :mod:`selenium` database.

    Args:
        link: Link to be removed.

    See Also:
        * :func:`darc.db._drop_selenium_db`
        * :func:`darc.db._drop_selenium_redis`

    """
    if FLAG_DB:
        with database.connection_context():
            try:
                return _drop_selenium_db(link)
            except Exception:
                logger.pexc(LOG_WARNING,
                            category=DatabaseOperaionFailed,
                            line=f'_drop_selenium_db({link.url})')
                return None
    return _drop_selenium_redis(link)
Example #14
def drop_hostname(link: 'Link') -> None:
    """Remove link from the hostname database.

    Args:
        link: Link to be removed.

    See Also:
        * :func:`darc.db._drop_hostname_db`
        * :func:`darc.db._drop_hostname_redis`

    """
    if FLAG_DB:
        with database.connection_context():
            try:
                return _drop_hostname_db(link)
            except Exception:
                logger.pexc(LOG_WARNING,
                            category=DatabaseOperaionFailed,
                            line=f'_drop_hostname_db({link.url})')
                return None
    return _drop_hostname_redis(link)
Example #15
def freenet_bootstrap() -> None:
    """Bootstrap wrapper for Freenet.

    The function will bootstrap the Freenet proxy. It will retry for
    :data:`~darc.proxy.freenet.FREENET_RETRY` times in case of failure.

    Also, it will **NOT** re-bootstrap the proxy as is guaranteed by
    :data:`~darc.proxy.freenet._FREENET_BS_FLAG`.

    Warns:
        FreenetBootstrapFailed: If failed to bootstrap Freenet proxy.

    Raises:
        :exc:`UnsupportedPlatform`: If the system is not supported, i.e. not macOS or Linux.

    See Also:
        * :func:`darc.proxy.freenet._freenet_bootstrap`
        * :data:`darc.proxy.freenet.FREENET_RETRY`
        * :data:`darc.proxy.freenet._FREENET_BS_FLAG`

    """
    if _unsupported:
        raise UnsupportedPlatform(f'unsupported system: {platform.system()}')

    # don't re-bootstrap
    if _FREENET_BS_FLAG:
        return

    logger.info('-*- Freenet Bootstrap -*-')
    for _ in range(FREENET_RETRY + 1):
        try:
            _freenet_bootstrap()
            break
        except Exception:
            if DEBUG:
                logger.ptb('[Error bootstrapping Freenet proxy]')
            logger.pexc(LOG_WARNING,
                        category=FreenetBootstrapFailed,
                        line='freenet_bootstrap()')
    logger.pline(LOG_INFO, logger.horizon)
Example #16
def load_requests(check: bool = CHECK) -> 'List[Link]':
    """Load link from the :mod:`requests` database.

    Args:
        check: Whether to perform checks on loaded links;
            defaults to :data:`~darc.const.CHECK`.

    Returns:
        List of loaded links from the :mod:`requests` database.

    Note:
        At runtime, the function will load at most :data:`~darc.db.MAX_POOL`
        links to limit the memory usage.

    See Also:
        * :func:`darc.db._load_requests_db`
        * :func:`darc.db._load_requests_redis`

    """
    if FLAG_DB:
        with database.connection_context():
            try:
                link_pool = _load_requests_db()
            except Exception:
                logger.pexc(LOG_WARNING,
                            category=DatabaseOperaionFailed,
                            line='_load_requests_db()')
                link_pool = []
    else:
        link_pool = _load_requests_redis()

    if check:
        link_pool = _check(link_pool)

    logger.plog(LOG_VERBOSE,
                '-*- [REQUESTS] LINK POOL -*-',
                object=sorted(link.url for link in link_pool))
    return link_pool
Example #17
def main(argv: 'Optional[List[str]]' = None) -> int:
    """Entrypoint.

    Args:
        argv: Optional command line arguments.

    Returns:
        Exit code.

    """
    parser = get_parser()
    args = parser.parse_args(argv)

    pid = os.getpid()
    with open(PATH_ID, 'w') as file:
        print(pid, file=file)

    # wait for Redis
    if _WAIT_REDIS:
        if not FLAG_DB:
            _redis_command('set', 'darc', pid)

    if FLAG_DB:
        while True:
            try:
                with DB:
                    _db_operation(DB.create_tables, [
                        HostnameQueueModel,
                        RequestsQueueModel,
                        SeleniumQueueModel,
                    ])
            except Exception:
                logger.pexc(LOG_WARNING,
                            category=DatabaseOperaionFailed,
                            line='DB.create_tables([HostnameQueueModel, ...])')
                continue
            break

    if SAVE_DB:
        while True:
            try:
                with DB_WEB:
                    _db_operation(DB_WEB.create_tables, [
                        HostnameModel,
                        URLModel,
                        URLThroughModel,
                        RobotsModel,
                        SitemapModel,
                        HostsModel,
                        RequestsModel,
                        RequestsHistoryModel,
                        SeleniumModel,
                    ])
            except Exception:
                logger.pexc(LOG_WARNING,
                            category=DatabaseOperaionFailed,
                            line='DB_WEB.create_tables([HostnameModel, ...])')
                continue
            break

    logger.debug('-*- Initialisation -*-')
    if DEBUG and not FLAG_DB:
        # nuke the db
        _redis_command('delete', 'queue_hostname')
        _redis_command('delete', 'queue_requests')
        _redis_command('delete', 'queue_selenium')

    link_list = []
    for link in filter(None, map(lambda s: s.strip(), args.link)):  # type: ignore[name-defined,var-annotated]
        logger.pline(LOG_DEBUG, link)
        link_list.append(link)

    if args.file is not None:
        for path in args.file:
            with open(path) as file:
                for line in filter(None, map(lambda s: s.strip(), file)):
                    if line.startswith('#'):
                        continue
                    logger.pline(LOG_DEBUG, line)
                    link_list.append(line)

    # write to database
    link_pool = [parse_link(link, backref=None) for link in link_list]
    save_requests(link_pool, score=0, nx=True)
    logger.pline(LOG_DEBUG, logger.horizon)

    # init link file
    if not os.path.isfile(PATH_LN):
        with open(PATH_LN, 'w') as file:
            print('proxy,scheme,host,hash,link', file=file)

    try:
        process(args.type)
    except BaseException:
        traceback.print_exc()
    _exit()

    return 0
Example #18
def crawler(link: 'darc_link.Link') -> None:
    """Single :mod:`requests` crawler for an entry link.

    Args:
        link: Link object to be crawled by :mod:`requests`.

    The function will first parse the URL using
    :func:`~darc.link.parse_link`, and check if need to crawl the
    URL (c.f. :data:`~darc.const.PROXY_WHITE_LIST`, :data:`~darc.const.PROXY_BLACK_LIST`,
    :data:`~darc.const.LINK_WHITE_LIST` and :data:`~darc.const.LINK_BLACK_LIST`);
    if true, then crawl the URL with :mod:`requests`.

    If the URL is from a brand new host, :mod:`darc` will first try
    to fetch and save ``robots.txt`` and sitemaps of the host
    (c.f. :func:`~darc.proxy.null.save_robots` and :func:`~darc.proxy.null.save_sitemap`),
    and extract then save the links from sitemaps (c.f. :func:`~darc.proxy.null.read_sitemap`)
    into link database for future crawling (c.f. :func:`~darc.db.save_requests`).

    .. note::

       A host is treated as new if the first value returned by
       :func:`~darc.db.have_hostname` is :data:`False`.

       If :func:`darc.proxy.null.fetch_sitemap` and/or :func:`darc.proxy.i2p.fetch_hosts`
       failed when fetching such documents, the host will be removed from the
       hostname database through :func:`~darc.db.drop_hostname`, and considered
       as new when next encounter.

    Also, if the submission API is provided, :func:`~darc.submit.submit_new_host`
    will be called and submit the documents just fetched.

    If ``robots.txt`` presented, and :data:`~darc.const.FORCE` is
    :data:`False`, :mod:`darc` will check if allowed to crawl the URL.

    .. note::

        The root path (e.g. ``/`` in https://www.example.com/) will always
        be crawled ignoring ``robots.txt``.

    At this point, :mod:`darc` will call the customised hook function
    from :mod:`darc.sites` to crawl and get the final response object.
    :mod:`darc` will save the session cookies and header information,
    using :func:`~darc.save.save_headers`.

    .. note::

        If :exc:`requests.exceptions.InvalidSchema` is raised, the link
        will be saved by :func:`~darc.proxy.null.save_invalid`. Further
        processing is dropped, and the link will be removed from the
        :mod:`requests` database through :func:`~darc.db.drop_requests`.

        If :exc:`~darc.error.LinkNoReturn` is raised, the link will be
        removed from the :mod:`requests` database through
        :func:`~darc.db.drop_requests`.

    If the content type of response document is not ignored (c.f.
    :data:`~darc.const.MIME_WHITE_LIST` and :data:`~darc.const.MIME_BLACK_LIST`),
    :func:`~darc.submit.submit_requests` will be called and submit the document
    just fetched.

    If the response document is HTML (``text/html`` and ``application/xhtml+xml``),
    :func:`~darc.parse.extract_links` will be called then to extract
    all possible links from the HTML document and save such links into
    the database (c.f. :func:`~darc.db.save_requests`).

    If the response status code is between ``400`` and ``600``,
    the URL will be saved back to the link database
    (c.f. :func:`~darc.db.save_requests`). Otherwise, the URL will
    be saved into the :mod:`selenium` link database to proceed to
    the next steps (c.f. :func:`~darc.db.save_selenium`).

    """
    logger.info('[REQUESTS] Requesting %s', link.url)
    try:
        if match_proxy(link.proxy):
            logger.warning('[REQUESTS] Ignored proxy type from %s (%s)',
                           link.url, link.proxy)
            drop_requests(link)
            return

        if match_host(link.host):
            logger.warning('[REQUESTS] Ignored hostname from %s (%s)',
                           link.url, link.host)
            drop_requests(link)
            return

        # timestamp
        timestamp = datetime.now()

        # get the session object in advance
        session = request_session(link)

        # check whether schema supported by :mod:`requests`
        try:
            session.get_adapter(link.url)  # test for adapter
            requests_supported = True
        except requests.exceptions.InvalidSchema:
            requests_supported = False

        # if need to test for new host
        if requests_supported:
            # if it's a new host
            flag_have, force_fetch = have_hostname(link)
            if not flag_have or force_fetch:
                partial = False

                if link.proxy not in ('zeronet', 'freenet'):
                    # fetch sitemap.xml
                    try:
                        fetch_sitemap(link, force=force_fetch)
                    except Exception:
                        logger.ptb('[Error fetching sitemap of %s]', link.url)
                        partial = True

                if link.proxy == 'i2p':
                    # fetch hosts.txt
                    try:
                        fetch_hosts(link, force=force_fetch)
                    except Exception:
                        logger.ptb('[Error subscribing hosts from %s]',
                                   link.url)
                        partial = True

                # submit data / drop hostname from db
                if partial:
                    drop_hostname(link)
                submit_new_host(timestamp,
                                link,
                                partial=partial,
                                force=force_fetch)

            if not FORCE and not check_robots(link):
                logger.warning('[REQUESTS] Robots disallowed link from %s',
                               link.url)
                return

        # reuse the session object
        with session:
            try:
                # requests session hook
                response = crawler_hook(timestamp, session, link)
            except requests.exceptions.InvalidSchema:
                logger.pexc(message=f'[REQUESTS] Fail to crawl {link.url}')
                save_invalid(link)
                drop_requests(link)
                return
            except requests.RequestException:
                logger.pexc(message=f'[REQUESTS] Fail to crawl {link.url}')
                save_requests(link, single=True)
                return
            except LinkNoReturn as error:
                logger.pexc(LOG_WARNING,
                            f'[REQUESTS] Removing from database: {link.url}')
                if error.drop:
                    drop_requests(link)
                return

            # save headers
            save_headers(timestamp, link, response, session)

            # check content type
            ct_type = get_content_type(response)
            if ct_type not in ['text/html', 'application/xhtml+xml']:
                logger.warning('[REQUESTS] Generic content type from %s (%s)',
                               link.url, ct_type)

                # probably hosts.txt
                if link.proxy == 'i2p' and ct_type in ['text/plain', 'text/text']:
                    text = response.text
                    save_requests(read_hosts(link, text))

                if match_mime(ct_type):
                    drop_requests(link)
                    return

                # submit data
                data = response.content
                submit_requests(timestamp,
                                link,
                                response,
                                session,
                                data,
                                mime_type=ct_type,
                                html=False)

                return

            html = response.content
            if not html:
                logger.error('[REQUESTS] Empty response from %s', link.url)
                save_requests(link, single=True)
                return

            # submit data
            submit_requests(timestamp,
                            link,
                            response,
                            session,
                            html,
                            mime_type=ct_type,
                            html=True)

            # add link to queue
            save_requests(extract_links(link, html), score=0, nx=True)

            if not response.ok:
                logger.error('[REQUESTS] Failed on %s [%d]', link.url,
                             response.status_code)
                save_requests(link, single=True)
                return

            # add link to queue
            save_selenium(link, single=True, score=0, nx=True)
    except Exception:
        if SAVE_DB:
            with contextlib.suppress(Exception):
                host = HostnameModel.get_or_none(
                    HostnameModel.hostname == link.host)  # type: Optional[HostnameModel]
                if host is not None:
                    host.alive = False
                    host.save()

            with contextlib.suppress(Exception):
                url = URLModel.get_or_none(
                    URLModel.hash == link.name)  # type: Optional[URLModel]
                if url is not None:
                    url.alias = False
                    url.save()

        logger.ptb('[Error from %s]', link.url)
        save_requests(link, single=True)

    logger.info('[REQUESTS] Requested %s', link.url)
Example #19
def fetch_sitemap(link: 'darc_link.Link', force: bool = False) -> None:
    """Fetch sitemap.

    The function will first fetch the ``robots.txt``, then
    fetch the sitemaps accordingly.

    Args:
        link: Link object to fetch for its sitemaps.
        force: Force refetch its sitemaps.

    See Also:
        * :func:`darc.proxy.null.read_robots`
        * :func:`darc.proxy.null.read_sitemap`
        * :func:`darc.parse.get_sitemap`

    """
    if force:
        logger.warning('[ROBOTS] Force refetch %s', link.url)

    robots_path = None if force else have_robots(link)
    if robots_path is not None:

        logger.warning('[ROBOTS] Cached %s', link.url)
        with open(robots_path) as file:
            robots_text = file.read()

    else:

        robots_link = parse_link(urljoin(link.url, '/robots.txt'), backref=link)
        logger.info('[ROBOTS] Checking %s', robots_link.url)

        with request_session(robots_link) as session:
            try:
                response = session.get(robots_link.url)
            except requests.RequestException:
                logger.pexc(message=f'[ROBOTS] Failed on {robots_link.url}')
                return

        if response.ok:
            ct_type = get_content_type(response)
            if ct_type not in ['text/text', 'text/plain']:
                logger.error('[ROBOTS] Unresolved content type on %s (%s)', robots_link.url, ct_type)
                robots_text = ''
            else:
                robots_text = response.text
                save_robots(robots_link, robots_text)
                logger.info('[ROBOTS] Checked %s', robots_link.url)
        else:
            logger.error('[ROBOTS] Failed on %s [%d]', robots_link.url, response.status_code)
            robots_text = ''

    if force:
        logger.warning('[SITEMAP] Force refetch %s', link.url)

    sitemaps = read_robots(link, robots_text, host=link.host)
    for sitemap_link in sitemaps:
        sitemap_path = None if force else have_sitemap(sitemap_link)
        if sitemap_path is not None:

            logger.warning('[SITEMAP] Cached %s', sitemap_link.url)
            with open(sitemap_path) as file:
                sitemap_text = file.read()

        else:

            logger.info('[SITEMAP] Fetching %s', sitemap_link.url)

            with request_session(sitemap_link) as session:
                try:
                    response = session.get(sitemap_link.url)
                except requests.RequestException:
                    logger.pexc(message=f'[SITEMAP] Failed on {sitemap_link.url}')
                    continue

            if not response.ok:
                logger.error('[SITEMAP] Failed on %s [%d]', sitemap_link.url, response.status_code)
                continue

            # check content type
            ct_type = get_content_type(response)
            if ct_type == 'application/gzip':
                try:
                    sitemap_text = gzip.decompress(response.content).decode()
                except UnicodeDecodeError:
                    sitemap_text = response.text
            elif ct_type in ['text/xml', 'text/html']:
                sitemap_text = response.text
                save_sitemap(sitemap_link, sitemap_text)
            else:
                logger.error('[SITEMAP] Unresolved content type on %s (%s)', sitemap_link.url, ct_type)
                continue

            logger.info('[SITEMAP] Fetched %s', sitemap_link.url)

        # get more sitemaps; extending the list while iterating is
        # intentional, so nested sitemap indexes are processed in the
        # same loop
        sitemaps.extend(get_sitemap(sitemap_link, sitemap_text, host=link.host))

        # add link to queue
        save_requests(read_sitemap(link, sitemap_text))
Example #20
def loader(link: 'darc_link.Link') -> None:
    """Single :mod:`selenium` loader for an entry link.

    Args:
        link: Link object to be loaded by :mod:`selenium`.

    The function will first parse the URL using :func:`~darc.link.parse_link`
    and start loading the URL using :mod:`selenium` with Google Chrome.

    At this point, :mod:`darc` will call the customised hook function
    from :mod:`darc.sites` to load and return the original
    :class:`selenium.webdriver.chrome.webdriver.WebDriver` object.

    .. note::

        If :exc:`~darc.error.LinkNoReturn` is raised, the link will be
        removed from the :mod:`selenium` database through
        :func:`~darc.db.drop_selenium`.

    If successful, the rendered source HTML document will be saved, and a
    full-page screenshot will be taken and saved.

    .. note::

       When taking full-page screenshot, :func:`~darc.crawl.loader` will
       use :javascript:`document.body.scrollHeight` to get the total
       height of web page. If the page height is *less than* **1,000 pixels**,
       then :mod:`darc` will by default set the height as **1,000 pixels**.

       Later :mod:`darc` will tell :mod:`selenium` to resize the window (in
       *headless* mode) to **1,024 pixels** in width and **110%** of the
       page height in height, and take a *PNG* screenshot.

    If the submission API is provided, :func:`~darc.submit.submit_selenium`
    will be called and submit the document just loaded.

    Later, :func:`~darc.parse.extract_links` will be called then to
    extract all possible links from the HTML document and save such
    links into the :mod:`requests` database (c.f. :func:`~darc.db.save_requests`).

    .. seealso::

       * :data:`darc.const.SE_EMPTY`
       * :data:`darc.const.SE_WAIT`

    """
    logger.info('[SELENIUM] Loading %s', link.url)
    try:
        # timestamp
        timestamp = datetime.now()

        # retrieve source from Chrome
        with request_driver(link) as driver:
            try:
                # selenium driver hook
                driver = loader_hook(timestamp, driver, link)
            except urllib3_exceptions.HTTPError:
                logger.pexc(message=f'[SELENIUM] Fail to load {link.url}')
                save_selenium(link, single=True)
                return
            except selenium_exceptions.WebDriverException:
                logger.pexc(message=f'[SELENIUM] Fail to load {link.url}')
                save_selenium(link, single=True)
                return
            except LinkNoReturn as error:
                logger.pexc(LOG_WARNING,
                            f'[SELENIUM] Removing from database: {link.url}')
                if error.drop:
                    drop_selenium(link)
                return

            # get HTML source
            html = driver.page_source

            if html == SE_EMPTY:
                logger.error('[SELENIUM] Empty page from %s', link.url)
                save_selenium(link, single=True)
                return

            screenshot = None
            try:
                # get maximum height
                height = driver.execute_script(
                    'return document.body.scrollHeight')

                # resize window: 1024 px wide, 110% of the page height
                # (clamped to at least 1000 px, cf. the note above)
                driver.set_window_size(1024,
                                       math.ceil(max(height, 1000) * 1.1))

                # take a full page screenshot
                screenshot = driver.get_screenshot_as_base64()
            except Exception:
                logger.pexc(message=f'[SELENIUM] Fail to save screenshot from {link.url}')

            # submit data
            submit_selenium(timestamp, link, html, screenshot)

            # add link to queue
            save_requests(extract_links(link, html), score=0, nx=True)
    except Exception:
        logger.ptb('[Error from %s]', link.url)
        save_selenium(link, single=True)

    logger.info('[SELENIUM] Loaded %s', link.url)
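
A worked example of the resize rule from the note above: an 800 px tall page is clamped to 1,000 px, so the window becomes 1024 x 1100 before the PNG screenshot is taken.

math.ceil(max(800, 1000) * 1.1)   # -> 1100
driver.set_window_size(1024, 1100)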