Example 1
def _check_ng(temp_list: typing.List[Link]) -> typing.List[Link]:
    """Check content type of links through ``HEAD`` requests.

    Args:
        temp_list: List of links to be checked.

    Returns:
        List of links that match the requirements.

    See Also:
        * :func:`darc.parse.match_host`
        * :func:`darc.parse.match_proxy`
        * :func:`darc.parse.match_mime`

    """
    from darc.crawl import request_session  # pylint: disable=import-outside-toplevel

    session_map = dict()
    result_list = list()
    for link in temp_list:
        if match_host(link.host):
            continue
        if match_proxy(link.proxy):
            continue

        # get session
        session = session_map.get(link.proxy)
        if session is None:
            session = request_session(link, futures=True)
            session_map[link.proxy] = session

        result = session.head(link.url, allow_redirects=True)
        result_list.append(result)

        print(f'[HEAD] Checking content type from {link.url}')

    link_list = list()
    for result in concurrent.futures.as_completed(result_list):
        try:
            response: typing.Response = result.result()
        except requests.RequestException as error:
            if error.response is None:
                print(render_error(f'[HEAD] Checking failed <{error}>',
                                   stem.util.term.Color.RED))  # pylint: disable=no-member
                continue
            print(render_error(f'[HEAD] Failed on {error.response.url} <{error}>',
                               stem.util.term.Color.RED))  # pylint: disable=no-member
            link_list.append(error.response.url)
            continue
        ct_type = get_content_type(response)

        print(f'[HEAD] Checked content type from {response.url} ({ct_type})')

        if match_mime(ct_type):
            continue
        temp_link = parse_link(response.request.url)
        link_list.append(temp_link)
    return link_list
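
# --- Illustrative sketch (not part of darc) ---
# The snippet above relies on darc internals (request_session, match_host,
# match_proxy, match_mime, parse_link).  The same futures-based HEAD check can
# be reproduced with requests-futures alone; the helper below is an assumption
# for demonstration only, as is the MIME reject pattern.
import concurrent.futures
import re

from requests_futures.sessions import FuturesSession


def head_filter(urls, reject_mime=re.compile(r'^(image|video)/')):
    """Return URLs whose content type does not match ``reject_mime``."""
    session = FuturesSession()
    futures = [session.head(url, allow_redirects=True) for url in urls]

    kept = []
    for future in concurrent.futures.as_completed(futures):
        try:
            response = future.result()
        except Exception:  # network failure; skip and keep going
            continue
        ct_type = response.headers.get('Content-Type', 'text/html').split(';')[0].strip()
        if reject_mime.match(ct_type) is None:
            kept.append(response.url)
    return kept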
Example 2
    def process_loader(
            cls, timestamp: typing.Datetime, driver: typing.Driver, link: Link,
            record: CacheRecord) -> None:  # pylint: disable=unused-argument
        """Process the :class:`WebDriver <selenium.webdriver.Chrome>` object.

        Args:
            timestamp: Timestamp of the worker node reference.
            driver (selenium.webdriver.Chrome): Web driver object with proxy settings
                and cookies presets.
            link: Link object to be loaded.
            record: Cached record from the remote database.

        """
        driver.get(link.url)

        # wait for page to finish loading
        if SE_WAIT is not None:
            time.sleep(SE_WAIT)

        # get HTML source
        html = driver.page_source
        if html == SE_EMPTY:
            print(render_error(f'[SELENIUM] Empty page from {link.url}',
                               stem.util.term.Color.RED),
                  file=sys.stderr)  # pylint: disable=no-member
            save_selenium(link, single=True)
            return

        screenshot = None
        try:
            # get maximum height
            height = driver.execute_script('return document.body.scrollHeight')

            # resize window (with some magic numbers)
            if height < 1000:
                height = 1000
            driver.set_window_size(1024, math.ceil(height * 1.1))

            # take a full page screenshot
            screenshot = driver.get_screenshot_as_base64()
        except Exception as error:
            print(render_error(
                f'[SELENIUM] Fail to save screenshot from {link.url} <{error}>',
                stem.util.term.Color.RED),
                  file=sys.stderr)  # pylint: disable=no-member

        # submit data
        submit_selenium(timestamp, link, html, screenshot)

        # add link to queue
        extracted_links = cls._extract_links(link, html)
        save_requests(extracted_links, score=0, nx=True)
Example 3
def freenet_bootstrap() -> None:
    """Bootstrap wrapper for Freenet.

    The function will bootstrap the Freenet proxy. It will retry for
    :data:`~darc.proxy.freenet.FREENET_RETRY` times in case of failure.

    Also, it will **NOT** re-bootstrap the proxy as is guaranteed by
    :data:`~darc.proxy.freenet._FREENET_BS_FLAG`.

    Warns:
        FreenetBootstrapFailed: If failed to bootstrap Freenet proxy.

    Raises:
        :exc:`UnsupportedPlatform`: If the system is not supported, i.e. not macOS or Linux.

    See Also:
        * :func:`darc.proxy.freenet._freenet_bootstrap`
        * :data:`darc.proxy.freenet.FREENET_RETRY`
        * :data:`darc.proxy.freenet._FREENET_BS_FLAG`

    """
    if _unsupported:
        raise UnsupportedPlatform(f'unsupported system: {platform.system()}')

    # don't re-bootstrap
    if _FREENET_BS_FLAG:
        return

    print(
        stem.util.term.format('-*- Freenet Bootstrap -*-',
                              stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
    for _ in range(FREENET_RETRY + 1):
        try:
            _freenet_bootstrap()
            break
        except Exception as error:
            if DEBUG:
                message = '[Error bootstrapping Freenet proxy]' + os.linesep + traceback.format_exc()
                print(render_error(message, stem.util.term.Color.RED),
                      end='',
                      file=sys.stderr)  # pylint: disable=no-member

            warning = warnings.formatwarning(str(error),
                                             FreenetBootstrapFailed, __file__,
                                             147, 'freenet_bootstrap()')
            print(render_error(warning, stem.util.term.Color.YELLOW),
                  end='',
                  file=sys.stderr)  # pylint: disable=no-member
    print(
        stem.util.term.format('-' * shutil.get_terminal_size().columns,
                              stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
Example 4
def have_hostname(link: Link) -> typing.Tuple[bool, bool]:
    """Check if current link is a new host.

    Args:
        link: Link to check against.

    Returns:
        A tuple of two :obj:`bool` values indicating whether
        the link is a known host and whether it needs a forced
        refetch, respectively.

    See Also:
        * :func:`darc.db._have_hostname_db`
        * :func:`darc.db._have_hostname_redis`

    """
    if FLAG_DB:
        with database.connection_context():
            try:
                return _have_hostname_db(link)
            except Exception as error:
                warning = warnings.formatwarning(str(error), DatabaseOperaionFailed, __file__, 236,
                                                 f'_have_hostname_db({link})')
                print(render_error(warning, stem.util.term.Color.YELLOW), end='', file=sys.stderr)  # pylint: disable=no-member
                return False, False
    return _have_hostname_redis(link)
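
# --- Illustrative usage sketch (not part of darc) ---
# ``have_hostname`` returns two booleans: whether the host is already known,
# and whether it is due for a forced refetch.  ``parse_link`` is assumed to be
# importable from darc.link, as in the other examples in this listing.
from darc.link import parse_link


def needs_host_fetch(url: str) -> bool:
    """Decide whether robots.txt / sitemaps should be (re)fetched for ``url``."""
    link = parse_link(url)
    known, force_refetch = have_hostname(link)
    return (not known) or force_refetch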
Example 5
def _load_requests_redis() -> typing.List[Link]:
    """Load link from the :mod:`requests` database.

    The function reads the ``queue_requests`` database.

    Returns:
        List of loaded links from the :mod:`requests` database.

    Note:
        At runtime, the function will load at most :data:`~darc.db.MAX_POOL`
        links to limit the memory usage.

    """
    now = time.time()
    if TIME_CACHE is None:
        sec_delta = 0  # type: ignore
        max_score = now
    else:
        sec_delta = TIME_CACHE.total_seconds()
        max_score = now - sec_delta

    try:
        with _redis_get_lock('lock_queue_requests', blocking_timeout=LOCK_TIMEOUT):  # type: ignore
            temp_pool: typing.List[bytes] = [_redis_command('get', name) for name in _redis_command('zrangebyscore', 'queue_requests',  # pylint: disable=line-too-long
                                                                                                    min=0, max=max_score, start=0, num=MAX_POOL)]  # pylint: disable=line-too-long
            link_pool = [pickle.loads(link) for link in filter(None, temp_pool)]  # nosec
            if TIME_CACHE is not None:
                new_score = now + sec_delta
                _save_requests_redis(link_pool, score=new_score)  # force update records
    except redis_lock.LockError:
        warning = warnings.formatwarning(f'[REQUESTS] Failed to acquire Redis lock after {LOCK_TIMEOUT} second(s)',
                                         LockWarning, __file__, 949, "_redis_get_lock('lock_queue_requests')")
        print(render_error(warning, stem.util.term.Color.YELLOW), end='', file=sys.stderr)  # pylint: disable=no-member
        link_pool = list()
    return link_pool
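
# --- Illustrative sketch (not part of darc) ---
# The queue above is a Redis sorted set scored by timestamp: an entry becomes
# "due" once its score drops below the current time, and loaded entries are
# re-scored into the future so crashed workers retry them automatically.  A
# minimal standalone variant of that pattern (key name, 300-second cache
# window and pickled members stored directly in the set are assumptions):
import pickle
import time

import redis

r = redis.Redis()


def enqueue(items, delay=0.0):
    """Add items to the queue, due ``delay`` seconds from now."""
    r.zadd('demo_queue', {pickle.dumps(item): time.time() + delay for item in items})


def dequeue(limit=100, cache=300.0):
    """Pop up to ``limit`` due items and push them ``cache`` seconds into the future."""
    now = time.time()
    raw = r.zrangebyscore('demo_queue', min=0, max=now, start=0, num=limit)
    items = [pickle.loads(member) for member in raw]
    if items:
        # re-score instead of deleting, so unfinished work is retried later
        r.zadd('demo_queue', {pickle.dumps(item): now + cache for item in items})
    return items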
Example 6
def submit(api: str, domain: Domain, data: typing.Dict[str, typing.Any]):
    """Submit data.

    Args:
        api: API URL.
        domain (``'new_host'``, ``'requests'`` or ``'selenium'``): Domain of the submit data.
        data: Submit data.

    See Also:
        * :data:`darc.submit.API_RETRY`
        * :func:`darc.submit.save_submit`
        * :func:`darc.submit.submit_new_host`
        * :func:`darc.submit.submit_requests`
        * :func:`darc.submit.submit_selenium`

    """
    with null_session() as session:
        for _ in range(API_RETRY + 1):
            try:
                response = session.post(api, json=data)
                if response.ok:
                    return
            except requests.RequestException as error:
                warning = warnings.formatwarning(
                    error, APIRequestFailed, __file__, 150,
                    f'[{domain.upper()}] response = requests.post(api, json=data)'
                )
                print(render_error(warning, stem.util.term.Color.YELLOW),
                      end='',
                      file=sys.stderr)  # pylint: disable=no-member
    save_submit(domain, data)
Example 7
def _db_operation(operation: typing.Callable[..., typing.T], *args, **kwargs) -> typing.T:  # type: ignore
    """Retry operation on database.

    Args:
        operation: Callable / method to perform.
        *args: Arbitrary positional arguments.

    Keyword Args:
        **kwargs: Arbitrary keyword arguments.

    Returns:
        Any return value from a successful
        ``operation`` call.

    """
    _arg_msg = None

    while True:
        try:
            value = operation(*args, **kwargs)
        except Exception as error:
            if _arg_msg is None:
                _arg_msg = _gen_arg_msg(*args, **kwargs)

            model = typing.cast(typing.MethodType, operation).__self__.__class__.__name__
            warning = warnings.formatwarning(str(error), DatabaseOperaionFailed, __file__, 166,
                                             f'{model}.{operation.__name__}({_arg_msg})')
            print(render_error(warning, stem.util.term.Color.YELLOW), end='', file=sys.stderr)  # pylint: disable=no-member

            if RETRY_INTERVAL is not None:
                time.sleep(RETRY_INTERVAL)
            continue
        break
    return value
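
# --- Illustrative usage sketch (not part of darc) ---
# ``_db_operation`` retries a bound database method until it succeeds,
# sleeping RETRY_INTERVAL seconds between failed attempts.  A hypothetical
# call on a peewee model instance ``record`` might look like:
#
#     _db_operation(record.save)
#     _db_operation(record.delete_instance)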
Example 8
def _zeronet_bootstrap():
    """ZeroNet bootstrap.

    The bootstrap arguments are defined as :data:`~darc.proxy.zeronet._ZERONET_ARGS`.

    Raises:
        subprocess.CalledProcessError: If the return code of :data:`~darc.proxy.zeronet._ZERONET_PROC` is non-zero.

    See Also:
        * :func:`darc.proxy.zeronet.zeronet_bootstrap`
        * :data:`darc.proxy.zeronet.BS_WAIT`
        * :data:`darc.proxy.zeronet._ZERONET_BS_FLAG`
        * :data:`darc.proxy.zeronet._ZERONET_PROC`

    """
    global _ZERONET_BS_FLAG, _ZERONET_PROC

    # launch Tor first
    tor_bootstrap()

    # launch ZeroNet process
    _ZERONET_PROC = subprocess.Popen(
        _ZERONET_ARGS,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )

    try:
        stdout, stderr = _ZERONET_PROC.communicate(timeout=BS_WAIT)
    except subprocess.TimeoutExpired as error:
        stdout, stderr = error.stdout, error.stderr
    if VERBOSE:
        if stdout is not None:
            print(render_error(stdout, stem.util.term.Color.BLUE))  # pylint: disable=no-member
    if stderr is not None:
        print(render_error(stderr, stem.util.term.Color.RED))  # pylint: disable=no-member

    returncode = _ZERONET_PROC.returncode
    if returncode is not None and returncode != 0:
        raise subprocess.CalledProcessError(returncode, _ZERONET_ARGS,
                                            _ZERONET_PROC.stdout,
                                            _ZERONET_PROC.stderr)

    # update flag
    _ZERONET_BS_FLAG = True
Example 9
def tor_bootstrap():
    """Bootstrap wrapper for Tor.

    The function will bootstrap the Tor proxy. It will retry for
    :data:`~darc.proxy.tor.TOR_RETRY` times in case of failure.

    Also, it will **NOT** re-bootstrap the proxy as is guaranteed by
    :data:`~darc.proxy.tor._TOR_BS_FLAG`.

    Warns:
        TorBootstrapFailed: If failed to bootstrap Tor proxy.

    See Also:
        * :func:`darc.proxy.tor._tor_bootstrap`
        * :data:`darc.proxy.tor.TOR_RETRY`
        * :data:`darc.proxy.tor._TOR_BS_FLAG`

    """
    # don't re-bootstrap
    if _TOR_BS_FLAG:
        return

    print(
        stem.util.term.format('-*- Tor Bootstrap -*-',
                              stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
    for _ in range(TOR_RETRY + 1):
        try:
            _tor_bootstrap()
            break
        except Exception as error:
            if DEBUG:
                message = '[Error bootstrapping Tor proxy]' + os.linesep + traceback.format_exc()
                print(render_error(message, stem.util.term.Color.RED),
                      end='',
                      file=sys.stderr)  # pylint: disable=no-member

            warning = warnings.formatwarning(error, TorBootstrapFailed,
                                             __file__, 170, 'tor_bootstrap()')
            print(render_error(warning, stem.util.term.Color.YELLOW),
                  end='',
                  file=sys.stderr)  # pylint: disable=no-member
    print(
        stem.util.term.format('-' * shutil.get_terminal_size().columns,
                              stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
Example 10
def _freenet_bootstrap() -> None:
    """Freenet bootstrap.

    The bootstrap arguments are defined as :data:`~darc.proxy.freenet._FREENET_ARGS`.

    Raises:
        subprocess.CalledProcessError: If the return code of :data:`~darc.proxy.freenet._FREENET_PROC` is non-zero.

    See Also:
        * :func:`darc.proxy.freenet.freenet_bootstrap`
        * :data:`darc.proxy.freenet.BS_WAIT`
        * :data:`darc.proxy.freenet._FREENET_BS_FLAG`
        * :data:`darc.proxy.freenet._FREENET_PROC`

    """
    global _FREENET_BS_FLAG, _FREENET_PROC  # pylint: disable=global-statement

    # launch Freenet process
    _FREENET_PROC = subprocess.Popen(  # nosec
        _FREENET_ARGS,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )

    try:
        stdout, stderr = _FREENET_PROC.communicate(timeout=BS_WAIT)
    except subprocess.TimeoutExpired as error:
        stdout, stderr = error.stdout, error.stderr
    if VERBOSE:
        if stdout is not None:
            print(render_error(stdout, stem.util.term.Color.BLUE))  # pylint: disable=no-member
    if stderr is not None:
        print(render_error(stderr, stem.util.term.Color.RED))  # pylint: disable=no-member

    returncode = _FREENET_PROC.returncode
    if returncode != 0:
        raise subprocess.CalledProcessError(
            returncode, _FREENET_ARGS,
            typing.cast(typing.IO[bytes], _FREENET_PROC.stdout).read(),
            typing.cast(typing.IO[bytes], _FREENET_PROC.stderr).read())

    # update flag
    _FREENET_BS_FLAG = True
Example 11
def load_selenium(check: bool = CHECK) -> typing.List[Link]:
    """Load link from the :mod:`selenium` database.

    Args:
        check: Whether to perform checks on loaded links;
            defaults to :data:`~darc.const.CHECK`.

    Returns:
        List of loaded links from the :mod:`selenium` database.

    Note:
        At runtime, the function will load at most :data:`~darc.db.MAX_POOL`
        links to limit the memory usage.

    See Also:
        * :func:`darc.db._load_selenium_db`
        * :func:`darc.db._load_selenium_redis`

    """
    if FLAG_DB:
        with database.connection_context():
            try:
                link_pool = _load_selenium_db()
            except Exception as error:
                warning = warnings.formatwarning(str(error), DatabaseOperaionFailed, __file__, 983,
                                                 '_load_selenium_db()')
                print(render_error(warning, stem.util.term.Color.YELLOW), end='', file=sys.stderr)  # pylint: disable=no-member
                link_pool = list()
    else:
        link_pool = _load_selenium_redis()

    if check:
        link_pool = _check(link_pool)

    if VERBOSE:
        print(stem.util.term.format('-*- [SELENIUM] LINK POOL -*-',
                                    stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        print(render_error(pprint.pformat(sorted(link.url for link in link_pool)),
                           stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        print(stem.util.term.format('-' * shutil.get_terminal_size().columns,
                                    stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
    return link_pool
Example 12
def _redis_command(command: str, *args, **kwargs) -> typing.Any:
    """Wrapper function for Redis command.

    Args:
        command: Command name.
        *args: Arbitrary arguments for the Redis command.

    Keyword Args:
        **kwargs: Arbitrary keyword arguments for the Redis command.

    Returns:
        Values returned from the Redis command.

    Warns:
        RedisCommandFailed: Warns at each round when the command failed.

    See Also:
        Between each retry, the function sleeps for :data:`~darc.db.REDIS_RETRY`
        second(s) if such value is **NOT** :data:`None`.

    """
    _arg_msg = None

    method = getattr(redis, command)
    while True:
        try:
            value = method(*args, **kwargs)
        except Exception as error:
            if _arg_msg is None:
                _args = ', '.join(map(repr, args))
                _kwargs = ', '.join(f'{k}={v!r}' for k, v in kwargs.items())
                if _kwargs:
                    if _args:
                        _args += ', '
                    _args += _kwargs
                _arg_msg = textwrap.shorten(_args,
                                            shutil.get_terminal_size().columns)

            warning = warnings.formatwarning(
                error, RedisCommandFailed, __file__, 85,
                f'value = redis.{command}({_arg_msg})')
            print(render_error(warning, stem.util.term.Color.YELLOW),
                  end='',
                  file=sys.stderr)  # pylint: disable=no-member

            if REDIS_RETRY is not None:
                time.sleep(REDIS_RETRY)
            continue
        break
    return value
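
# --- Illustrative usage sketch (not part of darc) ---
# ``_redis_command`` looks the command up on the module-level ``redis`` client
# via getattr and retries until it succeeds, so any redis-py method can be
# driven by name.  Hypothetical calls (key names are assumptions):
#
#     _redis_command('set', 'darc', os.getpid())
#     _redis_command('zadd', 'queue_requests', {name: time.time()}, nx=True)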
Example 13
def _load_selenium_redis() -> typing.List[Link]:
    """Load link from the :mod:`selenium` database.

    The function reads the ``queue_selenium`` database.

    Returns:
        List of loaded links from the :mod:`selenium` database.

    Note:
        At runtime, the function will load at most :data:`~darc.db.MAX_POOL`
        links to limit the memory usage.

    """
    now = time.time()
    if TIME_CACHE is None:
        sec_delta = 0
        max_score = now
    else:
        sec_delta = TIME_CACHE.total_seconds()
        max_score = now - sec_delta

    try:
        with _redis_get_lock('lock_queue_selenium',
                             blocking_timeout=LOCK_TIMEOUT):
            link_pool = [
                pickle.loads(link) for link in _redis_command('zrangebyscore',
                                                              'queue_selenium',
                                                              min=0,
                                                              max=max_score,
                                                              start=0,
                                                              num=MAX_POOL)
            ]
            if TIME_CACHE is not None:
                new_score = now + sec_delta
                _save_selenium_redis(link_pool,
                                     score=new_score)  # force update records
    except redis_lock.LockError:
        warning = warnings.formatwarning(
            f'[SELENIUM] Failed to acquire Redis lock after {LOCK_TIMEOUT} second(s)',
            LockWarning, __file__, 299,
            "_redis_get_lock('lock_queue_selenium')")
        print(render_error(warning, stem.util.term.Color.YELLOW),
              end='',
              file=sys.stderr)  # pylint: disable=no-member
        link_pool = list()
    return link_pool
Example 14
def renew_tor_session():
    """Renew Tor session."""
    global _TOR_CTRL

    try:
        # Tor controller process
        if _TOR_CTRL is None:
            _TOR_CTRL = stem.control.Controller.from_port(port=int(TOR_CTRL))
            _TOR_CTRL.authenticate(TOR_PASS)
        _TOR_CTRL.signal(stem.Signal.NEWNYM)  # pylint: disable=no-member
    except Exception as error:
        warning = warnings.formatwarning(
            error, TorRenewFailed, __file__, 88,
            '_TOR_CTRL = stem.control.Controller.from_port(port=int(TOR_CTRL))'
        )
        print(render_error(warning, stem.util.term.Color.YELLOW),
              end='',
              file=sys.stderr)  # pylint: disable=no-member
Example 15
def crawler(session: typing.Session, link: Link) -> typing.NoReturn:  # pylint: disable=unused-argument
    """Crawler hook for data URIs.

    Args:
        session (:class:`requests.Session`): Session object with proxy settings.
        link: Link object to be crawled.

    Raises:
        LinkNoReturn: This link has no return response.

    """
    try:
        save_data(link)
    except ValueError as error:
        print(render_error(
            f'[REQUESTS] Failed to save data URI from {link.url} <{error}>',
            stem.util.term.Color.RED),
              file=sys.stderr)  # pylint: disable=no-member
    raise LinkNoReturn
Example 16
def save_requests(entries: typing.Union[Link, typing.List[Link]], single: bool = False,  # pylint: disable=inconsistent-return-statements
                  score: typing.Optional[float] = None, nx: bool = False, xx: bool = False) -> None:
    """Save link to the :mod:`requests` database.

    The function updates the ``queue_requests`` database.

    Args:
        entries: Links to be added to the :mod:`requests` database.
            It can be either a :obj:`list` of links, or a single
            link string (if ``single`` set as :data:`True`).
        single: Indicate if ``entries`` is a :obj:`list` of links
            or a single link string.
        score: Score for the Redis sorted set.
        nx: Only create new elements; do not
            update scores for elements that already exist.
        xx: Only update scores of elements that
            already exist. New elements will not be added.

    Notes:
        The ``entries`` will be dumped through :mod:`pickle` so that
        :mod:`darc` does not need to parse them again.

        When ``entries`` is a list of :class:`~darc.link.Link` instances,
        we try to perform a *bulk* update to ease the memory consumption.
        The *bulk* size is defined by :data:`~darc.db.BULK_SIZE`.

    See Also:
        * :func:`darc.db._save_requests_db`
        * :func:`darc.db._save_requests_redis`

    """
    if FLAG_DB:
        with database.connection_context():
            try:
                return _save_requests_db(entries, single, score, nx, xx)
            except Exception as error:
                warning = warnings.formatwarning(str(error), DatabaseOperaionFailed, __file__, 505,
                                                 '_save_requests_db(...)')
                print(render_error(warning, stem.util.term.Color.YELLOW), end='', file=sys.stderr)  # pylint: disable=no-member
                return
    return _save_requests_redis(entries, single, score, nx, xx)
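
# --- Illustrative usage sketch (not part of darc) ---
# With the Redis backend, ``nx=True`` maps onto ``ZADD NX`` semantics: newly
# discovered links are enqueued with score 0 (immediately due), while links
# already queued keep their existing score.  The seed URLs below are
# assumptions; ``parse_link`` is assumed importable from darc.link.
from darc.link import parse_link

seeds = [parse_link(url) for url in ('https://example.com/', 'https://example.org/')]
save_requests(seeds, score=0, nx=True)                       # enqueue new links only
save_requests(parse_link('https://example.net/'), single=True)  # enqueue one link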
Example 17
def drop_selenium(link: Link) -> None:  # pylint: disable=inconsistent-return-statements
    """Remove link from the :mod:`selenium` database.

    Args:
        link: Link to be removed.

    See Also:
        * :func:`darc.db._drop_selenium_db`
        * :func:`darc.db._drop_selenium_redis`

    """
    if FLAG_DB:
        with database.connection_context():
            try:
                return _drop_selenium_db(link)
            except Exception as error:
                warning = warnings.formatwarning(str(error), DatabaseOperaionFailed, __file__, 433,
                                                 f'_drop_selenium_db({link})')
                print(render_error(warning, stem.util.term.Color.YELLOW), end='', file=sys.stderr)  # pylint: disable=no-member
                return
    return _drop_selenium_redis(link)
Example 18
def load_requests(check: bool = CHECK) -> typing.List[Link]:
    """Load link from the :mod:`requests` database.

    Args:
        check: Whether to perform checks on loaded links;
            defaults to :data:`~darc.const.CHECK`.

    Returns:
        List of loaded links from the :mod:`requests` database.

    Note:
        At runtime, the function will load at most :data:`~darc.db.MAX_POOL`
        links to limit the memory usage.

    See Also:
        * :func:`darc.db._load_requests_db`
        * :func:`darc.db._load_requests_redis`

    """
    if FLAG_DB:
        with database.connection_context():
            link_pool = _load_requests_db()
    else:
        link_pool = _load_requests_redis()

    if check:
        link_pool = _check(link_pool)

    if VERBOSE:
        print(
            stem.util.term.format('-*- [REQUESTS] LINK POOL -*-',
                                  stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        print(
            render_error(
                pprint.pformat(sorted(link.url for link in link_pool)),
                stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        print(
            stem.util.term.format('-' * shutil.get_terminal_size().columns,
                                  stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
    return link_pool
Example 19
def _redis_command(command: str, *args, **kwargs) -> typing.Any:  # type: ignore
    """Wrapper function for Redis command.

    Args:
        command: Command name.
        *args: Arbitrary arguments for the Redis command.

    Keyword Args:
        **kwargs: Arbitrary keyword arguments for the Redis command.

    Returns:
        Values returned from the Redis command.

    Warns:
        RedisCommandFailed: Warns at each round when the command failed.

    See Also:
        Between each retry, the function sleeps for :data:`~darc.db.RETRY_INTERVAL`
        second(s) if such value is **NOT** :data:`None`.

    """
    _arg_msg = None

    method = getattr(redis, command)
    while True:
        try:
            value = method(*args, **kwargs)
        except Exception as error:
            if _arg_msg is None:
                _arg_msg = _gen_arg_msg(*args, **kwargs)

            warning = warnings.formatwarning(str(error), RedisCommandFailed, __file__, 131,
                                             f'value = redis.{command}({_arg_msg})')
            print(render_error(warning, stem.util.term.Color.YELLOW), end='', file=sys.stderr)  # pylint: disable=no-member

            if RETRY_INTERVAL is not None:
                time.sleep(RETRY_INTERVAL)
            continue
        break
    return value
Example 20
        _FREENET_ARGS = list()
else:
    _FREENET_ARGS = [os.path.join(FREENET_PATH, 'run.sh'), 'start']
_FREENET_ARGS.extend(FREENET_ARGS)

if DEBUG:
    print(
        stem.util.term.format('-*- FREENET PROXY -*-',
                              stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
    if _unsupported:
        print(
            stem.util.term.format(f'unsupported system: {platform.system()}',
                                  stem.util.term.Color.RED))  # pylint: disable=no-member
    else:
        print(
            render_error(pprint.pformat(_FREENET_ARGS),
                         stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
    print(
        stem.util.term.format('-' * shutil.get_terminal_size().columns,
                              stem.util.term.Color.MAGENTA))  # pylint: disable=no-member


def _freenet_bootstrap() -> None:
    """Freenet bootstrap.

    The bootstrap arguments are defined as :data:`~darc.proxy.freenet._FREENET_ARGS`.

    Raises:
        subprocess.CalledProcessError: If the return code of :data:`~darc.proxy.freenet._FREENET_PROC` is non-zero.

    See Also:
        * :func:`darc.proxy.freenet.freenet_bootstrap`
Example 21
def loader(link: Link):
    """Single :mod:`selenium` loader for a entry link.

    Args:
        link: URL to be loaded by :mod:`selenium`.

    The function will first parse the URL using :func:`~darc.link.parse_link`
    and start loading the URL using :mod:`selenium` with Google Chrome.

    At this point, :mod:`darc` will call the customised hook function
    from :mod:`darc.sites` to load and return the original
    :class:`selenium.webdriver.Chrome` object.

    .. note::

        If :exc:`~darc.error.LinkNoReturn` is raised, the link will be
        removed from the :mod:`selenium` database through
        :func:`~darc.db.drop_selenium`.

    If successful, the rendered source HTML document will be saved, and a
    full-page screenshot will be taken and saved.

    .. note::

       When taking full-page screenshot, :func:`~darc.crawl.loader` will
       use :javascript:`document.body.scrollHeight` to get the total
       height of web page. If the page height is *less than* **1,000 pixels**,
       then :mod:`darc` will by default set the height as **1,000 pixels**.

       Later :mod:`darc` will tell :mod:`selenium` to resize the window (in
       *headless* mode) to **1,024 pixels** in width and **110%** of the
       page height in height, and take a *PNG* screenshot.

    If the submission API is provided, :func:`~darc.submit.submit_selenium`
    will be called and submit the document just loaded.

    Later, :func:`~darc.parse.extract_links` will be called then to
    extract all possible links from the HTML document and save such
    links into the :mod:`requests` database (c.f. :func:`~darc.db.save_requests`).

    .. seealso::

       * :data:`darc.const.SE_EMPTY`
       * :data:`darc.const.SE_WAIT`

    """
    print(f'[SELENIUM] Loading {link.url}')
    try:
        # timestamp
        timestamp = datetime.now()

        # retrieve source from Chrome
        with request_driver(link) as driver:
            try:
                # selenium driver hook
                driver = loader_hook(link, driver)
            except urllib3.exceptions.HTTPError as error:
                print(render_error(
                    f'[SELENIUM] Fail to load {link.url} <{error}>',
                    stem.util.term.Color.RED),
                      file=sys.stderr)  # pylint: disable=no-member
                save_selenium(link, single=True)
                return
            except selenium.common.exceptions.WebDriverException as error:
                print(render_error(
                    f'[SELENIUM] Fail to load {link.url} <{error}>',
                    stem.util.term.Color.RED),
                      file=sys.stderr)  # pylint: disable=no-member
                save_selenium(link, single=True)
                return
            except LinkNoReturn:
                print(render_error(
                    f'[SELENIUM] Removing from database: {link.url}',
                    stem.util.term.Color.YELLOW),
                      file=sys.stderr)  # pylint: disable=no-member
                drop_selenium(link)
                return

            # get HTML source
            html = driver.page_source

            if html == SE_EMPTY:
                print(render_error(f'[SELENIUM] Empty page from {link.url}',
                                   stem.util.term.Color.RED),
                      file=sys.stderr)  # pylint: disable=no-member
                save_selenium(link, single=True)
                return

            screenshot = None
            try:
                # get maximum height
                height = driver.execute_script(
                    'return document.body.scrollHeight')

                # resize window (with some magic numbers)
                if height < 1000:
                    height = 1000
                driver.set_window_size(1024, math.ceil(height * 1.1))

                # take a full page screenshot
                screenshot = driver.get_screenshot_as_base64()
            except Exception as error:
                print(render_error(
                    f'[SELENIUM] Fail to save screenshot from {link.url} <{error}>',
                    stem.util.term.Color.RED),
                      file=sys.stderr)  # pylint: disable=no-member

            # submit data
            submit_selenium(timestamp, link, html, screenshot)

            # add link to queue
            save_requests(extract_links(link, html), score=0, nx=True)
    except Exception:
        error = f'[Error from {link.url}]' + os.linesep + traceback.format_exc() + '-' * shutil.get_terminal_size().columns  # pylint: disable=line-too-long
        print(render_error(error, stem.util.term.Color.CYAN), file=sys.stderr)  # pylint: disable=no-member
        save_selenium(link, single=True)
    print(f'[SELENIUM] Loaded {link.url}')
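
# --- Illustrative sketch (not part of darc) ---
# ``get_screenshot_as_base64`` yields a base64-encoded PNG string, which is
# what ``submit_selenium`` forwards.  Decoding it back into an image file on
# the consuming side is a one-liner; the default filename is an assumption.
import base64


def save_screenshot_png(screenshot_b64: str, path: str = 'screenshot.png') -> None:
    """Write a base64-encoded screenshot (as produced above) to ``path``."""
    with open(path, 'wb') as png_file:
        png_file.write(base64.b64decode(screenshot_b64))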
Example 22
def fetch_hosts(link: Link):
    """Fetch ``hosts.txt``.

    Args:
        link: Link object to fetch for its ``hosts.txt``.

    Returns:
        Content of the ``hosts.txt`` file.

    """
    hosts_path = have_hosts(link)
    if hosts_path is not None:

        print(
            stem.util.term.format(f'[HOSTS] Cached {link.url}',
                                  stem.util.term.Color.YELLOW))  # pylint: disable=no-member

        with open(hosts_path) as hosts_file:
            hosts_text = hosts_file.read()

    else:

        from darc.requests import i2p_session  # pylint: disable=import-outside-toplevel

        hosts_link = parse_link(urljoin(link.url, '/hosts.txt'))
        print(f'[HOSTS] Subscribing {hosts_link.url}')

        with i2p_session() as session:
            try:
                response = session.get(hosts_link.url)
            except requests.RequestException as error:
                print(render_error(
                    f'[HOSTS] Failed on {hosts_link.url} <{error}>',
                    stem.util.term.Color.RED),
                      file=sys.stderr)  # pylint: disable=no-member
                return

        if not response.ok:
            print(render_error(
                f'[HOSTS] Failed on {hosts_link.url} [{response.status_code}]',
                stem.util.term.Color.RED),
                  file=sys.stderr)  # pylint: disable=no-member
            return

        ct_type = get_content_type(response)
        if ct_type not in ['text/text', 'text/plain']:
            print(render_error(
                f'[HOSTS] Unresolved content type on {hosts_link.url} ({ct_type})',
                stem.util.term.Color.RED),
                  file=sys.stderr)  # pylint: disable=no-member
            return

        hosts_text = response.text
        save_hosts(hosts_link, hosts_text)

        print(f'[HOSTS] Subscribed {hosts_link.url}')

    from darc.db import save_requests  # pylint: disable=import-outside-toplevel

    # add link to queue
    save_requests(read_hosts(hosts_text))
Example 23
def submit_new_host(time: typing.Datetime, link: Link, partial: bool = False):
    """Submit new host.

    When a new host is discovered, the :mod:`darc` crawler will submit the
    host information. Such includes ``robots.txt`` (if exists) and
    ``sitemap.xml`` (if any).

    Args:
        time (datetime.datetime): Timestamp of submission.
        link: Link object of submission.
        partial: If the data is not complete, i.e. failed when fetching
            ``robots.txt``, ``hosts.txt`` and/or sitemaps.

    If :data:`~darc.submit.API_NEW_HOST` is :data:`None`, the data for submission
    will directly be saved through :func:`~darc.submit.save_submit`.

    The data submitted should have the following format::

        {
            // partial flag - true / false
            "$PARTIAL$": ...,
            // metadata of URL
            "[metadata]": {
                // original URL - <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
                "url": ...,
                // proxy type - null / tor / i2p / zeronet / freenet
                "proxy": ...,
                // hostname / netloc, c.f. ``urllib.parse.urlparse``
                "host": ...,
                // base folder, relative path (to data root path ``PATH_DATA``) in container - <proxy>/<scheme>/<host>
                "base": ...,
                // sha256 of URL as name for saved files (timestamp is in ISO format)
                //   JSON log as this one - <base>/<name>_<timestamp>.json
                //   HTML from requests - <base>/<name>_<timestamp>_raw.html
                //   HTML from selenium - <base>/<name>_<timestamp>.html
                //   generic data files - <base>/<name>_<timestamp>.dat
                "name": ...
            },
            // requested timestamp in ISO format as in name of saved file
            "Timestamp": ...,
            // original URL
            "URL": ...,
            // robots.txt from the host (if not exists, then ``null``)
            "Robots": {
                // path of the file, relative path (to data root path ``PATH_DATA``) in container
                //   - <proxy>/<scheme>/<host>/robots.txt
                "path": ...,
                // content of the file (**base64** encoded)
                "data": ...,
            },
            // sitemaps from the host (if none, then ``null``)
            "Sitemaps": [
                {
                    // path of the file, relative path (to data root path ``PATH_DATA``) in container
                    //   - <proxy>/<scheme>/<host>/sitemap_<name>.txt
                    "path": ...,
                    // content of the file (**base64** encoded)
                    "data": ...,
                },
                ...
            ],
            // hosts.txt from the host (if proxy type is ``i2p``; if not exists, then ``null``)
            "Hosts": {
                // path of the file, relative path (to data root path ``PATH_DATA``) in container
                //   - <proxy>/<scheme>/<host>/hosts.txt
                "path": ...,
                // content of the file (**base64** encoded)
                "data": ...,
            }
        }

    See Also:
        * :data:`darc.submit.API_NEW_HOST`
        * :func:`darc.submit.submit`
        * :func:`darc.submit.save_submit`
        * :func:`darc.submit.get_metadata`
        * :func:`darc.submit.get_robots`
        * :func:`darc.submit.get_sitemap`
        * :func:`darc.submit.get_hosts`

    """
    metadata = get_metadata(link)
    ts = time.isoformat()

    robots = get_robots(link)
    sitemap = get_sitemap(link)
    hosts = get_hosts(link)

    if SAVE_DB:
        model, _ = HostnameModel.get_or_create(
            hostname=link.host,
            defaults=dict(
                proxy=HostnameModel.Proxy[link.proxy.upper()],
                discovery=time,
                last_seen=time,
                alive=False,
                since=time,
            ))

        if robots is not None:
            RobotsModel.create(
                host=model,
                timestamp=time,
                document=base64.b64decode(robots['data']).decode(),
            )

        if sitemap is not None:
            SitemapModel.create(
                host=model,
                timestamp=time,
                document=base64.b64decode(sitemap['data']).decode(),
            )

        if hosts is not None:
            HostsModel.create(
                host=model,
                timestamp=time,
                document=base64.b64decode(hosts['data']).decode(),
            )

    data = {
        '$PARTIAL$': partial,
        '[metadata]': metadata,
        'Timestamp': ts,
        'URL': link.url,
        'Robots': robots,
        'Sitemaps': sitemap,
        'Hosts': hosts,
    }

    if DEBUG:
        print(
            stem.util.term.format('-*- NEW HOST DATA -*-',
                                  stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        print(render_error(pprint.pformat(data), stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        print(
            stem.util.term.format('-' * shutil.get_terminal_size().columns,
                                  stem.util.term.Color.MAGENTA))  # pylint: disable=no-member

    if API_NEW_HOST is None:
        save_submit('new_host', data)
        return

    # submit data
    submit(API_NEW_HOST, 'new_host', data)
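
# --- Illustrative sketch (not part of darc) ---
# On the receiving side of API_NEW_HOST, the ``data`` fields of "Robots",
# "Sitemaps" and "Hosts" carry base64-encoded file contents (see the format
# documented above).  A hypothetical decoder for such a payload:
import base64


def decode_new_host(payload: dict) -> dict:
    """Map relative file paths to decoded text for a new_host submission."""
    files = {}
    entries = [payload.get('Robots'), payload.get('Hosts'), *(payload.get('Sitemaps') or [])]
    for entry in entries:
        if entry is not None:
            files[entry['path']] = base64.b64decode(entry['data']).decode()
    return files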
Example 24
def submit_requests(time: typing.Datetime,
                    link: Link,
                    response: typing.Response,
                    session: typing.Session,
                    content: bytes,
                    mime_type: str,
                    html: bool = True):
    """Submit requests data.

    When crawling, we'll first fetch the URL using :mod:`requests`, to check
    its availability and to save its HTTP headers information. Such information
    will be submitted to the web UI.

    Args:
        time (datetime.datetime): Timestamp of submission.
        link: Link object of submission.
        response (requests.Response): Response object of submission.
        session (requests.Session): Session object of submission.
        content: Raw content of the response.
        mime_type: Content type.
        html: Whether the current document is HTML (as opposed to another file type).

    If :data:`~darc.submit.API_REQUESTS` is :data:`None`, the data for submission
    will directly be saved through :func:`~darc.submit.save_submit`.

    The data submitted should have the following format::

        {
            // metadata of URL
            "[metadata]": {
                // original URL - <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
                "url": ...,
                // proxy type - null / tor / i2p / zeronet / freenet
                "proxy": ...,
                // hostname / netloc, c.f. ``urllib.parse.urlparse``
                "host": ...,
                // base folder, relative path (to data root path ``PATH_DATA``) in container - <proxy>/<scheme>/<host>
                "base": ...,
                // sha256 of URL as name for saved files (timestamp is in ISO format)
                //   JSON log as this one - <base>/<name>_<timestamp>.json
                //   HTML from requests - <base>/<name>_<timestamp>_raw.html
                //   HTML from selenium - <base>/<name>_<timestamp>.html
                //   generic data files - <base>/<name>_<timestamp>.dat
                "name": ...
            },
            // requested timestamp in ISO format as in name of saved file
            "Timestamp": ...,
            // original URL
            "URL": ...,
            // request method
            "Method": "GET",
            // response status code
            "Status-Code": ...,
            // response reason
            "Reason": ...,
            // response cookies (if any)
            "Cookies": {
                ...
            },
            // session cookies (if any)
            "Session": {
                ...
            },
            // request headers (if any)
            "Request": {
                ...
            },
            // response headers (if any)
            "Response": {
                ...
            },
            // Content type
            "Content-Type": ...,
            // requested file (if not exists, then ``null``)
            "Document": {
                // path of the file, relative path (to data root path ``PATH_DATA``) in container
                //   - <proxy>/<scheme>/<host>/<name>_<timestamp>_raw.html
                // or if the document is of generic content type, i.e. not HTML
                //   - <proxy>/<scheme>/<host>/<name>_<timestamp>.dat
                "path": ...,
                // content of the file (**base64** encoded)
                "data": ...,
            },
            // redirection history (if any)
            "History": [
                // same record data as the original response
                {"...": "..."}
            ]
        }

    See Also:
        * :data:`darc.submit.API_REQUESTS`
        * :func:`darc.submit.submit`
        * :func:`darc.submit.save_submit`
        * :func:`darc.submit.get_metadata`
        * :func:`darc.submit.get_raw`
        * :func:`darc.crawl.crawler`

    """
    metadata = get_metadata(link)
    ts = time.isoformat()

    if html:
        path = f'{link.base}/{link.name}_{ts}_raw.html'
    else:
        path = f'{link.base}/{link.name}_{ts}.dat'

    if SAVE_DB:
        url, _ = URLModel.get_or_create(
            hash=link.name,
            defaults=dict(
                url=link.url,
                hostname=HostnameModel.get(
                    HostnameModel.hostname == link.host),
                proxy=URLModel.Proxy[link.proxy.upper()],
                discovery=time,
                last_seen=time,
                alive=False,
                since=time,
            ))

        model = RequestsModel.create(
            url=url,
            timestamp=time,
            method=response.request.method,
            document=content,
            mime_type=mime_type,
            is_html=html,
            status_code=response.status_code,
            reason=response.reason,
            cookies=response.cookies.get_dict(),
            session=session.cookies.get_dict(),
            request=dict(response.request.headers),
            response=dict(response.headers),
        )

        for index, history in enumerate(response.history):
            RequestsHistoryModel.create(
                index=index,
                model=model,
                url=history.url,
                timestamp=time,
                method=history.request.method,
                document=history.content,
                status_code=history.status_code,
                reason=history.reason,
                cookies=history.cookies.get_dict(),
                request=dict(history.request.headers),
                response=dict(history.headers),
            )

    data = {
        '[metadata]': metadata,
        'Timestamp': ts,
        'URL': link.url,
        'Method': response.request.method,
        'Status-Code': response.status_code,
        'Reason': response.reason,
        'Cookies': response.cookies.get_dict(),
        'Session': session.cookies.get_dict(),
        'Request': dict(response.request.headers),
        'Response': dict(response.headers),
        'Content-Type': mime_type,
        'Document': dict(
            path=os.path.relpath(path, PATH_DB),
            data=base64.b64encode(content).decode(),
        ),
        'History': [{
            'URL': history.url,
            'Method': history.request.method,
            'Status-Code': history.status_code,
            'Reason': history.reason,
            'Cookies': history.cookies.get_dict(),
            'Request': dict(history.request.headers),
            'Response': dict(history.headers),
            'Document': base64.b64encode(history.content).decode(),
        } for history in response.history],
    }

    if DEBUG:
        print(
            stem.util.term.format('-*- REQUESTS DATA -*-',
                                  stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        print(render_error(pprint.pformat(data), stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        print(
            stem.util.term.format('-' * shutil.get_terminal_size().columns,
                                  stem.util.term.Color.MAGENTA))  # pylint: disable=no-member

    if API_REQUESTS is None:
        save_submit('requests', data)
        return

    # submit data
    submit(API_REQUESTS, 'requests', data)
Example 25
def submit_selenium(time: typing.Datetime, link: Link, html: str,
                    screenshot: typing.Optional[str]):
    """Submit selenium data.

    After crawling with :mod:`requests`, we'll then render the URL using
    :mod:`selenium` with Google Chrome and its web driver, to provide a fully
    rendered web page. Such information will be submitted to the web UI.

    Args:
        time (datetime.datetime): Timestamp of submission.
        link: Link object of submission.
        html: HTML source of the web page.
        screenshot: *base64* encoded screenshot.

    If :data:`~darc.submit.API_SELENIUM` is :data:`None`, the data for submission
    will directly be saved through :func:`~darc.submit.save_submit`.

    Note:
        This information is optional, only provided if the content type from
        :mod:`requests` is HTML, status code not between ``400`` and ``600``, and
        HTML data not empty.

    The data submitted should have the following format::

        {
            // metadata of URL
            "[metadata]": {
                // original URL - <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
                "url": ...,
                // proxy type - null / tor / i2p / zeronet / freenet
                "proxy": ...,
                // hostname / netloc, c.f. ``urllib.parse.urlparse``
                "host": ...,
                // base folder, relative path (to data root path ``PATH_DATA``) in container - <proxy>/<scheme>/<host>
                "base": ...,
                // sha256 of URL as name for saved files (timestamp is in ISO format)
                //   JSON log as this one - <base>/<name>_<timestamp>.json
                //   HTML from requests - <base>/<name>_<timestamp>_raw.html
                //   HTML from selenium - <base>/<name>_<timestamp>.html
                //   generic data files - <base>/<name>_<timestamp>.dat
                "name": ...
            },
            // requested timestamp in ISO format as in name of saved file
            "Timestamp": ...,
            // original URL
            "URL": ...,
            // rendered HTML document (if not exists, then ``null``)
            "Document": {
                // path of the file, relative path (to data root path ``PATH_DATA``) in container
                //   - <proxy>/<scheme>/<host>/<name>_<timestamp>.html
                "path": ...,
                // content of the file (**base64** encoded)
                "data": ...,
            },
            // web page screenshot (if not exists, then ``null``)
            "Screenshot": {
                // path of the file, relative path (to data root path ``PATH_DATA``) in container
                //   - <proxy>/<scheme>/<host>/<name>_<timestamp>.png
                "path": ...,
                // content of the file (**base64** encoded)
                "data": ...,
            }
        }

    See Also:
        * :data:`darc.submit.API_SELENIUM`
        * :func:`darc.submit.submit`
        * :func:`darc.submit.save_submit`
        * :func:`darc.submit.get_metadata`
        * :func:`darc.submit.get_html`
        * :func:`darc.submit.get_screenshot`
        * :func:`darc.crawl.loader`

    """
    metadata = get_metadata(link)
    ts = time.isoformat()

    if screenshot is None:
        ss = None
    else:
        ss = dict(
            path=os.path.relpath(f'{link.base}/{link.name}_{ts}.png', PATH_DB),
            data=screenshot,
        )

    if SAVE_DB:
        SeleniumModel.create(
            url=URLModel.get(URLModel.hash == link.name),
            timestamp=time,
            document=html,
            screenshot=base64.b64decode(screenshot) if screenshot is not None else None,
        )

    data = {
        '[metadata]': metadata,
        'Timestamp': ts,
        'URL': link.url,
        'Document': dict(
            path=os.path.relpath(f'{link.base}/{link.name}_{ts}.html', PATH_DB),
            data=base64.b64encode(html.encode()).decode(),
        ),
        'Screenshot': ss,
    }

    if DEBUG:
        print(
            stem.util.term.format('-*- SELENIUM DATA -*-',
                                  stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        print(render_error(pprint.pformat(data), stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        print(
            stem.util.term.format('-' * shutil.get_terminal_size().columns,
                                  stem.util.term.Color.MAGENTA))  # pylint: disable=no-member

    if API_SELENIUM is None:
        save_submit('selenium', data)
        return

    # submit data
    submit(API_SELENIUM, 'selenium', data)
Example 26
def main(argv: typing.Optional[typing.List[str]] = None) -> int:
    """Entrypoint.

    Args:
        argv: Optional command line arguments.

    Returns:
        Exit code.

    """
    parser = get_parser()
    args = parser.parse_args(argv)

    pid = os.getpid()
    with open(PATH_ID, 'w') as file:
        print(pid, file=file)

    # wait for Redis
    if _WAIT_REDIS:
        if not FLAG_DB:
            _redis_command('set', 'darc', pid)

    if FLAG_DB:
        while True:
            try:
                with DB:
                    _db_operation(DB.create_tables, [
                        HostnameQueueModel,
                        RequestsQueueModel,
                        SeleniumQueueModel,
                    ])
            except Exception as error:
                warning = warnings.formatwarning(
                    error,
                    DatabaseOperaionFailed,
                    __file__,
                    102,  # type: ignore[arg-type]
                    'DB.create_tables([HostnameQueueModel, ...])')
                print(render_error(warning, stem.util.term.Color.YELLOW),
                      end='',
                      file=sys.stderr)  # pylint: disable=no-member
                continue
            break

    if SAVE_DB:
        while True:
            try:
                with DB_WEB:
                    _db_operation(DB_WEB.create_tables, [
                        HostnameModel,
                        URLModel,
                        RobotsModel,
                        SitemapModel,
                        HostsModel,
                        RequestsModel,
                        RequestsHistoryModel,
                        SeleniumModel,
                    ])
            except Exception as error:
                warning = warnings.formatwarning(
                    error,
                    DatabaseOperaionFailed,
                    __file__,
                    117,  # type: ignore[arg-type]
                    'DB.create_tables([HostnameModel, ...])')
                print(render_error(warning, stem.util.term.Color.YELLOW),
                      end='',
                      file=sys.stderr)  # pylint: disable=no-member
                continue
            break

    if DEBUG:
        print(
            stem.util.term.format('-*- Initialisation -*-',
                                  stem.util.term.Color.MAGENTA))  # pylint: disable=no-member

        # nuke the db
        if not FLAG_DB:
            _redis_command('delete', 'queue_hostname')
            _redis_command('delete', 'queue_requests')
            _redis_command('delete', 'queue_selenium')

    link_list = list()
    for link in filter(
            None, map(lambda s: s.strip(),
                      args.link)):  # type: ignore[name-defined,var-annotated]
        if DEBUG:
            print(stem.util.term.format(link, stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        link_list.append(link)

    if args.file is not None:
        for path in args.file:
            with open(path) as file:
                for line in filter(None, map(lambda s: s.strip(), file)):
                    if line.startswith('#'):
                        continue
                    if DEBUG:
                        print(
                            stem.util.term.format(
                                line, stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
                    link_list.append(line)

    # write to database
    link_pool = [parse_link(link) for link in link_list]
    save_requests(link_pool, score=0, nx=True)

    if DEBUG:
        print(
            stem.util.term.format('-' * shutil.get_terminal_size().columns,
                                  stem.util.term.Color.MAGENTA))  # pylint: disable=no-member

    # init link file
    if not os.path.isfile(PATH_LN):
        with open(PATH_LN, 'w') as file:
            print('proxy,scheme,host,hash,link', file=file)

    try:
        process(args.type)
    except BaseException:
        traceback.print_exc()
    _exit()

    return 0
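
# NB: ``get_parser()`` is not shown in this example.  A hedged reconstruction of
# what it presumably provides, based only on the attributes accessed above
# (``args.type``, ``args.file`` and ``args.link``) -- the actual option names
# in darc may differ.
import argparse

def _get_parser_sketch() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(prog='darc')
    parser.add_argument('-t', '--type', default=None,
                        help='worker type to run (passed to process())')
    parser.add_argument('-f', '--file', action='append', default=None,
                        help='file of seed links, one per line ("#" starts a comment)')
    parser.add_argument('link', nargs='*', default=[],
                        help='seed links to be crawled')
    return parser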
Esempio n. 27
0
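# Tor controller (lazily initialised in renew_tor_session below)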
_TOR_CTRL = None
# Tor daemon process
_TOR_PROC = None
# Tor bootstrap config
_TOR_CONFIG = {
    'SocksPort': TOR_PORT,
    'ControlPort': TOR_CTRL,
}
_TOR_CONFIG.update(TOR_CFG)

if DEBUG:
    print(
        stem.util.term.format('-*- TOR PROXY -*-',
                              stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
    print(
        render_error(pprint.pformat(_TOR_CONFIG),
                     stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
    print(
        stem.util.term.format('-' * shutil.get_terminal_size().columns,
                              stem.util.term.Color.MAGENTA))  # pylint: disable=no-member


def renew_tor_session():
    """Renew Tor session."""
    global _TOR_CTRL

    try:
        # Tor controller process
        if _TOR_CTRL is None:
            _TOR_CTRL = stem.control.Controller.from_port(port=int(TOR_CTRL))
            _TOR_CTRL.authenticate(TOR_PASS)
        _TOR_CTRL.signal(stem.Signal.NEWNYM)  # pylint: disable=no-member
    except Exception as error:
        # NB: the original example is truncated here; a minimal handler that
        # merely reports the failure is assumed
        print(render_error(f'[SESSION] Failed to renew Tor session <{error}>',
                           stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
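
# NB: a hedged sketch of how ``_TOR_CONFIG`` is presumably consumed -- the Tor
# daemon is bootstrapped through stem and its handle kept in ``_TOR_PROC``.
# The actual darc bootstrap may differ (extra logging, password hashing, etc.).
import stem.process

def _launch_tor_sketch():
    """Launch the Tor daemon once, reusing it afterwards."""
    global _TOR_PROC
    if _TOR_PROC is None:
        _TOR_PROC = stem.process.launch_tor_with_config(
            config=_TOR_CONFIG,   # SocksPort / ControlPort plus TOR_CFG extras
            take_ownership=True,  # terminate Tor when this process exits
        )
    return _TOR_PROC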
Esempio n. 28
0
os.makedirs(PATH_MISC, exist_ok=True)

# link file mapping
PATH_LN = os.path.join(PATH_DB, 'link.csv')

# PID file
PATH_ID = os.path.join(PATH_DB, 'darc.pid')

# extract link pattern
_LINK_WHITE_LIST = json.loads(os.getenv('LINK_WHITE_LIST', '[]'))
if DEBUG:
    print(
        stem.util.term.format('-*- LINK WHITE LIST -*-',
                              stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
    print(
        render_error(pprint.pformat(_LINK_WHITE_LIST),
                     stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
    print(
        stem.util.term.format('-' * shutil.get_terminal_size().columns,
                              stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
LINK_WHITE_LIST = [
    re.compile(link, re.IGNORECASE) for link in _LINK_WHITE_LIST
]

# link black list
_LINK_BLACK_LIST = json.loads(os.getenv('LINK_BLACK_LIST', '[]'))
if DEBUG:
    print(
        stem.util.term.format('-*- LINK BLACK LIST -*-',
                              stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
    print(
        render_error(pprint.pformat(_LINK_BLACK_LIST),
                     stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
    print(
        stem.util.term.format('-' * shutil.get_terminal_size().columns,
                              stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
# NB: the original example is truncated above; the remainder mirrors the
# white-list block and compiles the black-list patterns accordingly
LINK_BLACK_LIST = [
    re.compile(link, re.IGNORECASE) for link in _LINK_BLACK_LIST
]
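
# NB: a hedged usage sketch -- the white/black lists are configured through
# JSON-encoded environment variables, e.g.
#
#   export LINK_WHITE_LIST='["\\.onion$"]'
#   export LINK_BLACK_LIST='["^www\\.example\\.com$"]'
#
# and are presumably consulted by the link-matching logic.  ``_match_link_sketch``
# is a hypothetical helper for illustration, not part of the darc API; whether
# darc matches the full URL or only the host, and the precedence between the
# two lists, may differ in the real implementation.
def _match_link_sketch(host: str) -> bool:
    """Return True if ``host`` should be skipped."""
    if any(pattern.search(host) for pattern in LINK_BLACK_LIST):
        return True
    if LINK_WHITE_LIST and not any(pattern.search(host) for pattern in LINK_WHITE_LIST):
        return True
    return False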
Esempio n. 29
0
def fetch_sitemap(link: Link, force: bool = False) -> None:
    """Fetch sitemap.

    The function will first fetch the ``robots.txt``, then
    fetch the sitemaps accordingly.

    Args:
        link: Link object to fetch for its sitemaps.
        force: Force refetch its sitemaps.

    See Also:
        * :func:`darc.proxy.null.read_robots`
        * :func:`darc.proxy.null.read_sitemap`
        * :func:`darc.parse.get_sitemap`

    """
    if force:
        print(stem.util.term.format(f'[ROBOTS] Force refetch {link.url}',
                                    stem.util.term.Color.YELLOW))  # pylint: disable=no-member

    robots_path = None if force else have_robots(link)
    if robots_path is not None:

        print(stem.util.term.format(f'[ROBOTS] Cached {link.url}',
                                    stem.util.term.Color.YELLOW))  # pylint: disable=no-member
        with open(robots_path) as file:
            robots_text = file.read()

    else:

        robots_link = parse_link(urljoin(link.url, '/robots.txt'))
        print(f'[ROBOTS] Checking {robots_link.url}')

        with request_session(robots_link) as session:
            try:
                response = session.get(robots_link.url)
            except requests.RequestException as error:
                print(render_error(f'[ROBOTS] Failed on {robots_link.url} <{error}>',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                return

        if response.ok:
            ct_type = get_content_type(response)
            if ct_type not in ['text/text', 'text/plain']:
                print(render_error(f'[ROBOTS] Unresolved content type on {robots_link.url} ({ct_type})',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                robots_text = ''
            else:
                robots_text = response.text
                save_robots(robots_link, robots_text)
                print(f'[ROBOTS] Checked {robots_link.url}')
        else:
            print(render_error(f'[ROBOTS] Failed on {robots_link.url} [{response.status_code}]',
                               stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
            robots_text = ''

    if force:
        print(stem.util.term.format(f'[SITEMAP] Force refetch {link.url}',
                                    stem.util.term.Color.YELLOW))  # pylint: disable=no-member

    sitemaps = read_robots(link, robots_text, host=link.host)
    for sitemap_link in sitemaps:
        sitemap_path = None if force else have_sitemap(sitemap_link)
        if sitemap_path is not None:

            print(stem.util.term.format(f'[SITEMAP] Cached {sitemap_link.url}',
                                        stem.util.term.Color.YELLOW))  # pylint: disable=no-member
            with open(sitemap_path) as file:
                sitemap_text = file.read()

        else:

            print(f'[SITEMAP] Fetching {sitemap_link.url}')

            with request_session(sitemap_link) as session:
                try:
                    response = session.get(sitemap_link.url)
                except requests.RequestException as error:
                    print(render_error(f'[SITEMAP] Failed on {sitemap_link.url} <{error}>',
                                       stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                    continue

            if not response.ok:
                print(render_error(f'[SITEMAP] Failed on {sitemap_link.url} [{response.status_code}]',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                continue

            # check content type
            ct_type = get_content_type(response)
            if ct_type == 'application/gzip':
                try:
                    sitemap_text = gzip.decompress(response.content).decode()
                except UnicodeDecodeError:
                    sitemap_text = response.text
            elif ct_type in ['text/xml', 'text/html']:
                sitemap_text = response.text
                save_sitemap(sitemap_link, sitemap_text)
            else:
                print(render_error(f'[SITEMAP] Unresolved content type on {sitemap_link.url} ({ct_type})',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                continue

            print(f'[SITEMAP] Fetched {sitemap_link.url}')

        # get more sitemaps -- sitemap index files may reference further
        # sitemaps; appending to ``sitemaps`` while iterating over it is
        # intentional, so the nested ones are picked up by this same loop
        sitemaps.extend(get_sitemap(sitemap_link, sitemap_text, host=link.host))

        # add link to queue
        save_requests(read_sitemap(link, sitemap_text))
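
# NB: ``read_robots`` is not shown in this example.  An illustrative sketch of
# what it presumably does -- collect ``Sitemap:`` directives from the
# ``robots.txt`` text, falling back to ``/sitemap.xml`` when none are declared.
# The real implementation in darc.proxy.null may differ.
def _read_robots_sketch(link: Link, text: str,
                        host: typing.Optional[str] = None) -> typing.List[Link]:
    # ``host`` is kept for signature parity with the call above; unused here
    sitemaps = []
    for line in text.splitlines():
        if line.strip().lower().startswith('sitemap:'):
            sitemaps.append(line.split(':', 1)[1].strip())
    if not sitemaps:
        sitemaps = ['/sitemap.xml']
    return [parse_link(urljoin(link.url, url)) for url in sitemaps]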
Esempio n. 30
0
def crawler(link: Link):
    """Single :mod:`requests` crawler for a entry link.

    Args:
        link: URL to be crawled by :mod:`requests`.

    The function will first parse the URL using
    :func:`~darc.link.parse_link`, and check whether the URL needs to be
    crawled (c.f. :data:`~darc.const.PROXY_WHITE_LIST`, :data:`~darc.const.PROXY_BLACK_LIST`,
    :data:`~darc.const.LINK_WHITE_LIST` and :data:`~darc.const.LINK_BLACK_LIST`);
    if so, it will crawl the URL with :mod:`requests`.

    If the URL is from a brand new host, :mod:`darc` will first try
    to fetch and save the ``robots.txt`` and sitemaps of the host
    (c.f. :func:`~darc.proxy.null.save_robots` and :func:`~darc.proxy.null.save_sitemap`),
    then extract and save the links from the sitemaps (c.f. :func:`~darc.proxy.null.read_sitemap`)
    into the link database for future crawling (c.f. :func:`~darc.db.save_requests`).

    .. note::

       A host is considered new if :func:`~darc.db.have_hostname` returns :data:`False`.

       If :func:`darc.proxy.null.fetch_sitemap` and/or :func:`darc.proxy.i2p.fetch_hosts`
       fails when fetching such documents, the host will be removed from the
       hostname database through :func:`~darc.db.drop_hostname`, and considered
       new at the next encounter.

    Also, if the submission API is provided, :func:`~darc.submit.submit_new_host`
    will be called to submit the documents just fetched.

    If a ``robots.txt`` is present and :data:`~darc.const.FORCE` is
    :data:`False`, :mod:`darc` will check whether it is allowed to crawl the URL.

    .. note::

        The root path (e.g. ``/`` in https://www.example.com/) will always
        be crawled ignoring ``robots.txt``.

    At this point, :mod:`darc` will call the customised hook function
    from :mod:`darc.sites` to crawl and get the final response object.
    :mod:`darc` will save the session cookies and header information,
    using :func:`~darc.save.save_headers`.

    .. note::

        If :exc:`requests.exceptions.InvalidSchema` is raised, the link
        will be saved by :func:`~darc.proxy.null.save_invalid`. Further
        processing is dropped, and the link will be removed from the
        :mod:`requests` database through :func:`~darc.db.drop_requests`.

        If :exc:`~darc.error.LinkNoReturn` is raised, the link will be
        removed from the :mod:`requests` database through
        :func:`~darc.db.drop_requests`.

    If the content type of the response document is not ignored (c.f.
    :data:`~darc.const.MIME_WHITE_LIST` and :data:`~darc.const.MIME_BLACK_LIST`),
    :func:`~darc.submit.submit_requests` will be called to submit the document
    just fetched.

    If the response document is HTML (``text/html`` or ``application/xhtml+xml``),
    :func:`~darc.parse.extract_links` will then be called to extract
    all possible links from the HTML document and save such links into
    the database (c.f. :func:`~darc.db.save_requests`).

    If the response status code is between ``400`` and ``600``,
    the URL will be saved back to the :mod:`requests` link database
    (c.f. :func:`~darc.db.save_requests`). Otherwise, the URL will
    be saved into the :mod:`selenium` link database to proceed to the next steps
    (c.f. :func:`~darc.db.save_selenium`).

    """
    print(f'[REQUESTS] Requesting {link.url}')
    try:
        if match_proxy(link.proxy):
            print(render_error(
                f'[REQUESTS] Ignored proxy type from {link.url} ({link.proxy})',
                stem.util.term.Color.YELLOW),
                  file=sys.stderr)  # pylint: disable=no-member
            drop_requests(link)
            return

        if match_host(link.host):
            print(render_error(
                f'[REQUESTS] Ignored hostname from {link.url} ({link.proxy})',
                stem.util.term.Color.YELLOW),
                  file=sys.stderr)  # pylint: disable=no-member
            drop_requests(link)
            return

        # timestamp
        timestamp = datetime.now()

        # if it's a new host
        if not have_hostname(link):
            partial = False

            if link.proxy not in ('zeronet', 'freenet'):
                # fetch sitemap.xml
                try:
                    fetch_sitemap(link)
                except Exception:
                    error = f'[Error fetching sitemap of {link.url}]' + os.linesep + traceback.format_exc() + '-' * shutil.get_terminal_size().columns  # pylint: disable=line-too-long
                    print(render_error(error, stem.util.term.Color.CYAN),
                          file=sys.stderr)  # pylint: disable=no-member
                    partial = True

            if link.proxy == 'i2p':
                # fetch hosts.txt
                try:
                    fetch_hosts(link)
                except Exception:
                    error = f'[Error subscribing hosts from {link.url}]' + os.linesep + traceback.format_exc() + '-' * shutil.get_terminal_size().columns  # pylint: disable=line-too-long
                    print(render_error(error, stem.util.term.Color.CYAN),
                          file=sys.stderr)  # pylint: disable=no-member
                    partial = True

            # submit data / drop hostname from db
            if partial:
                drop_hostname(link)
            submit_new_host(timestamp, link, partial=partial)

        if not FORCE and not check_robots(link):
            print(render_error(
                f'[REQUESTS] Robots disallowed link from {link.url}',
                stem.util.term.Color.YELLOW),
                  file=sys.stderr)  # pylint: disable=no-member
            return

        with request_session(link) as session:
            try:
                # requests session hook
                response = crawler_hook(link, session)
            except requests.exceptions.InvalidSchema as error:
                print(render_error(
                    f'[REQUESTS] Failed on {link.url} <{error}>',
                    stem.util.term.Color.RED),
                      file=sys.stderr)  # pylint: disable=no-member
                save_invalid(link)
                drop_requests(link)
                return
            except requests.RequestException as error:
                print(render_error(
                    f'[REQUESTS] Failed on {link.url} <{error}>',
                    stem.util.term.Color.RED),
                      file=sys.stderr)  # pylint: disable=no-member
                save_requests(link, single=True)
                return
            except LinkNoReturn:
                print(render_error(
                    f'[REQUESTS] Removing from database: {link.url}',
                    stem.util.term.Color.YELLOW),
                      file=sys.stderr)  # pylint: disable=no-member
                drop_requests(link)
                return

            # save headers
            save_headers(timestamp, link, response, session)

            # check content type
            ct_type = get_content_type(response)
            if ct_type not in ['text/html', 'application/xhtml+xml']:
                print(render_error(
                    f'[REQUESTS] Generic content type from {link.url} ({ct_type})',
                    stem.util.term.Color.YELLOW),
                      file=sys.stderr)  # pylint: disable=no-member

                # probably hosts.txt
                if link.proxy == 'i2p' and ct_type in [
                        'text/plain', 'text/text'
                ]:
                    text = response.text
                    save_requests(read_hosts(text))

                if match_mime(ct_type):
                    drop_requests(link)
                    return

                # submit data
                data = response.content
                submit_requests(timestamp,
                                link,
                                response,
                                session,
                                data,
                                mime_type=ct_type,
                                html=False)

                return

            html = response.content
            if not html:
                print(render_error(
                    f'[REQUESTS] Empty response from {link.url}',
                    stem.util.term.Color.RED),
                      file=sys.stderr)  # pylint: disable=no-member
                save_requests(link, single=True)
                return

            # submit data
            submit_requests(timestamp,
                            link,
                            response,
                            session,
                            html,
                            mime_type=ct_type,
                            html=True)

            # add link to queue
            save_requests(extract_links(link, html), score=0, nx=True)

            if not response.ok:
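                # response.ok is False for 4xx/5xx responses: report and requeue for retry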
                print(render_error(
                    f'[REQUESTS] Failed on {link.url} [{response.status_code}]',
                    stem.util.term.Color.RED),
                      file=sys.stderr)  # pylint: disable=no-member
                save_requests(link, single=True)
                return

            # add link to queue
            save_selenium(link, single=True, score=0, nx=True)
    except Exception:
        error = f'[Error from {link.url}]' + os.linesep + traceback.format_exc() + '-' * shutil.get_terminal_size().columns  # pylint: disable=line-too-long
        print(render_error(error, stem.util.term.Color.CYAN), file=sys.stderr)  # pylint: disable=no-member
        save_requests(link, single=True)
    print(f'[REQUESTS] Requested {link.url}')
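
# NB: a hedged sketch of how ``crawler`` is presumably driven by the worker
# loop -- links are popped from the :mod:`requests` queue in batches and the
# Tor circuit is renewed between rounds.  ``load_requests`` is assumed to be
# the counterpart of ``save_requests`` used above; the real worker in
# darc.process may differ.
def _process_crawler_sketch():
    while True:
        link_pool = load_requests()  # assumed loader for the requests queue
        if not link_pool:
            break
        for link in link_pool:
            crawler(link)
        renew_tor_session()  # rotate the Tor circuit between batches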