def _check_ng(temp_list: typing.List[Link]) -> typing.List[Link]:
    """Check content type of links through ``HEAD`` requests.

    Links whose host or proxy type is excluded are dropped up front;
    the remaining links are probed concurrently with ``HEAD`` requests
    through per-proxy futures sessions, and only those whose content
    type passes :func:`darc.parse.match_mime` are kept.

    Args:
        temp_list: List of links to be checked.

    Returns:
        List of links matches the requirements.

    See Also:
        * :func:`darc.parse.match_host`
        * :func:`darc.parse.match_proxy`
        * :func:`darc.parse.match_mime`

    """
    from darc.crawl import request_session  # pylint: disable=import-outside-toplevel

    sessions = {}  # one shared futures session per proxy type
    futures = []   # pending HEAD requests
    for item in temp_list:
        # drop links with excluded hosts / proxy types
        if match_host(item.host) or match_proxy(item.proxy):
            continue

        # get (or lazily create) the session for this proxy type
        session = sessions.get(item.proxy)
        if session is None:
            session = request_session(item, futures=True)
            sessions[item.proxy] = session

        futures.append(session.head(item.url, allow_redirects=True))
        print(f'[HEAD] Checking content type from {item.url}')

    checked = []
    for future in concurrent.futures.as_completed(futures):
        try:
            response: typing.Response = future.result()
        except requests.RequestException as error:
            if error.response is None:
                print(render_error(f'[HEAD] Checking failed <{error}>',
                                   stem.util.term.Color.RED))  # pylint: disable=no-member
            else:
                print(render_error(f'[HEAD] Failed on {error.response.url} <{error}>',
                                   stem.util.term.Color.RED))  # pylint: disable=no-member
                checked.append(error.response.url)
            continue

        ct_type = get_content_type(response)
        print(f'[HEAD] Checked content type from {response.url} ({ct_type})')
        if not match_mime(ct_type):
            checked.append(parse_link(response.request.url))
    return checked
def process_loader(
        cls, timestamp: typing.Datetime, driver: typing.Driver, link: Link, record: CacheRecord) -> None:  # pylint: disable=unused-argument
    """Process the :class:`WebDriver <selenium.webdriver.Chrome>` object.

    Loads ``link`` in the driver, submits the rendered page (plus a
    full-page screenshot when one could be taken) and queues any links
    extracted from the rendered HTML.

    Args:
        timestamp: Timestamp of the worker node reference.
        driver (selenium.webdriver.Chrome): Web driver object with proxy settings
            and cookies presets.
        link: Link object to be loaded.
        record: Cached record from the remote database.

    """
    driver.get(link.url)

    # wait for page to finish loading
    if SE_WAIT is not None:
        time.sleep(SE_WAIT)

    # get HTML source
    html = driver.page_source
    if html == SE_EMPTY:
        # driver returned its "empty page" sentinel; requeue for retry
        print(render_error(f'[SELENIUM] Empty page from {link.url}',
                           stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
        save_selenium(link, single=True)
        return

    # screenshot stays None if taking it fails; submission below still happens
    screenshot = None
    try:
        # get maximum height
        height = driver.execute_script('return document.body.scrollHeight')
        # resize window (with some magic numbers)
        if height < 1000:
            height = 1000
        driver.set_window_size(1024, math.ceil(height * 1.1))
        # take a full page screenshot
        screenshot = driver.get_screenshot_as_base64()
    except Exception as error:
        print(render_error(f'[SELENIUM] Fail to save screenshot from {link.url} <{error}>',
                           stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member

    # submit data
    submit_selenium(timestamp, link, html, screenshot)

    # add link to queue
    extracted_links = cls._extract_links(link, html)
    save_requests(extracted_links, score=0, nx=True)
def freenet_bootstrap() -> None:
    """Bootstrap wrapper for Freenet.

    The function will bootstrap the Freenet proxy. It will retry for
    :data:`~darc.proxy.freenet.FREENET_RETRY` times in case of failure.

    Also, it will **NOT** re-bootstrap the proxy as is guaranteed by
    :data:`~darc.proxy.freenet._FREENET_BS_FLAG`.

    Warns:
        FreenetBootstrapFailed: If failed to bootstrap Freenet proxy.

    Raises:
        :exc:`UnsupportedPlatform`: If the system is not supported, i.e. not macOS or Linux.

    See Also:
        * :func:`darc.proxy.freenet._freenet_bootstrap`
        * :data:`darc.proxy.freenet.FREENET_RETRY`
        * :data:`darc.proxy.freenet._FREENET_BS_FLAG`

    """
    if _unsupported:
        raise UnsupportedPlatform(f'unsupported system: {platform.system()}')

    # don't re-bootstrap
    if _FREENET_BS_FLAG:
        return

    print(stem.util.term.format('-*- Freenet Bootstrap -*-',
                                stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
    for _ in range(FREENET_RETRY + 1):
        try:
            _freenet_bootstrap()
        except Exception as error:
            if DEBUG:
                message = '[Error bootstraping Freenet proxy]' + os.linesep + traceback.format_exc()
                print(render_error(message, stem.util.term.Color.RED),
                      end='', file=sys.stderr)  # pylint: disable=no-member
            warning = warnings.formatwarning(str(error), FreenetBootstrapFailed,
                                             __file__, 147, 'freenet_bootstrap()')
            print(render_error(warning, stem.util.term.Color.YELLOW),
                  end='', file=sys.stderr)  # pylint: disable=no-member
        else:
            break
    print(stem.util.term.format('-' * shutil.get_terminal_size().columns,
                                stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
def have_hostname(link: Link) -> typing.Tuple[bool, bool]:
    """Check if current link is a new host.

    Args:
        link: Link to check against.

    Returns:
        A tuple of two :obj:`bool` values representing if such link
        is a known host and needs force refetch respectively.

    See Also:
        * :func:`darc.db._have_hostname_db`
        * :func:`darc.db._have_hostname_redis`

    """
    # Redis backend unless the relational database flag is set
    if not FLAG_DB:
        return _have_hostname_redis(link)

    with database.connection_context():
        try:
            return _have_hostname_db(link)
        except Exception as error:
            warning = warnings.formatwarning(str(error), DatabaseOperaionFailed, __file__, 236,
                                             f'_have_hostname_db({link})')
            print(render_error(warning, stem.util.term.Color.YELLOW),
                  end='', file=sys.stderr)  # pylint: disable=no-member
    # database failure degrades to "unknown host, no refetch"
    return False, False
def _load_requests_redis() -> typing.List[Link]:
    """Load link from the :mod:`requests` database.

    The function reads the ``queue_requests`` database.

    Returns:
        List of loaded links from the :mod:`requests` database.

    Note:
        At runtime, the function will load links with maximum number
        at :data:`~darc.db.MAX_POOL` to limit the memory usage.

    """
    now = time.time()
    # when no cache window is configured, every queued link is eligible
    if TIME_CACHE is None:
        sec_delta = 0  # type: ignore
        max_score = now
    else:
        sec_delta = TIME_CACHE.total_seconds()
        # only links whose score (timestamp) is older than the cache window
        max_score = now - sec_delta

    try:
        with _redis_get_lock('lock_queue_requests', blocking_timeout=LOCK_TIMEOUT):  # type: ignore
            # the sorted set holds member names; the pickled payloads live
            # under separate keys fetched via GET -- TODO confirm schema
            temp_pool: typing.List[bytes] = [_redis_command('get', name) for name in _redis_command('zrangebyscore', 'queue_requests',  # pylint: disable=line-too-long
                                                                                                    min=0, max=max_score, start=0, num=MAX_POOL)]  # pylint: disable=line-too-long
            # missing payloads (expired keys) yield None and are filtered out
            link_pool = [pickle.loads(link) for link in filter(None, temp_pool)]  # nosec
            if TIME_CACHE is not None:
                # push the fetched links' scores into the future so other
                # workers will not pick them up within the cache window
                new_score = now + sec_delta
                _save_requests_redis(link_pool, score=new_score)  # force update records
    except redis_lock.LockError:
        # could not obtain the queue lock in time; return an empty pool
        warning = warnings.formatwarning(f'[REQUESTS] Failed to acquire Redis lock after {LOCK_TIMEOUT} second(s)',
                                         LockWarning, __file__, 949, "_redis_get_lock('lock_queue_requests')")
        print(render_error(warning, stem.util.term.Color.YELLOW),
              end='', file=sys.stderr)  # pylint: disable=no-member
        link_pool = list()
    return link_pool
def submit(api: str, domain: Domain, data: typing.Dict[str, typing.Any]):
    """Submit data.

    Posts ``data`` to ``api``, retrying up to
    :data:`~darc.submit.API_RETRY` additional times; when every attempt
    fails the payload is persisted locally instead.

    Args:
        api: API URL.
        domain (``'new_host'``, ``'requests'`` or ``'selenium'``): Domain of the submit data.
        data: Submit data.

    See Also:
        * :data:`darc.submit.API_RETRY`
        * :func:`darc.submit.save_submit`
        * :func:`darc.submit.submit_new_host`
        * :func:`darc.submit.submit_requests`
        * :func:`darc.submit.submit_selenium`

    """
    attempts = API_RETRY + 1
    with null_session() as session:
        for _ in range(attempts):
            try:
                response = session.post(api, json=data)
            except requests.RequestException as error:
                warning = warnings.formatwarning(
                    error, APIRequestFailed, __file__, 150,
                    f'[{domain.upper()}] response = requests.post(api, json=data)')
                print(render_error(warning, stem.util.term.Color.YELLOW),
                      end='', file=sys.stderr)  # pylint: disable=no-member
                continue
            if response.ok:
                return
    # all attempts exhausted -- fall back to local storage
    save_submit(domain, data)
def _db_operation(operation: typing.Callable[..., typing.T], *args, **kwargs) -> typing.T:  # type: ignore
    """Retry operation on database.

    The callable is retried indefinitely until it succeeds, sleeping
    :data:`~darc.db.RETRY_INTERVAL` second(s) between attempts (when such
    value is **NOT** :data:`None`).

    Args:
        operation: Callable / method to perform.
        *args: Arbitrary positional arguments.

    Keyword Args:
        **kwargs: Arbitrary keyword arguments.

    Returns:
        Any return value from a successful ``operation`` call.

    """
    _arg_msg = None

    while True:
        try:
            value = operation(*args, **kwargs)
        except Exception as error:
            if _arg_msg is None:
                _arg_msg = _gen_arg_msg(*args, **kwargs)

            # FIX: the original unconditionally accessed ``__self__``,
            # which raises AttributeError for plain functions / partials;
            # only bound methods carry a model instance
            instance = getattr(operation, '__self__', None)
            if instance is not None:
                model = instance.__class__.__name__
                context = f'{model}.{operation.__name__}({_arg_msg})'
            else:
                context = f'{operation.__name__}({_arg_msg})'

            warning = warnings.formatwarning(str(error), DatabaseOperaionFailed, __file__, 166, context)
            print(render_error(warning, stem.util.term.Color.YELLOW),
                  end='', file=sys.stderr)  # pylint: disable=no-member

            if RETRY_INTERVAL is not None:
                time.sleep(RETRY_INTERVAL)
            continue
        break
    return value
def _zeronet_bootstrap():
    """ZeroNet bootstrap.

    The bootstrap arguments are defined as
    :data:`~darc.proxy.zeronet._ZERONET_ARGS`.

    Raises:
        subprocess.CalledProcessError: If the return code of
            :data:`~darc.proxy.zeronet._ZERONET_PROC` is non-zero.

    See Also:
        * :func:`darc.proxy.zeronet.zeronet_bootstrap`
        * :data:`darc.proxy.zeronet.BS_WAIT`
        * :data:`darc.proxy.zeronet._ZERONET_BS_FLAG`
        * :data:`darc.proxy.zeronet._ZERONET_PROC`

    """
    global _ZERONET_BS_FLAG, _ZERONET_PROC  # pylint: disable=global-statement

    # launch Tor first
    tor_bootstrap()

    # launch ZeroNet process
    _ZERONET_PROC = subprocess.Popen(  # nosec
        _ZERONET_ARGS,
        stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
    )
    try:
        stdout, stderr = _ZERONET_PROC.communicate(timeout=BS_WAIT)
    except subprocess.TimeoutExpired as error:
        # the daemon keeps running past the bootstrap wait; use whatever
        # output was captured so far
        stdout, stderr = error.stdout, error.stderr

    if VERBOSE:
        if stdout is not None:
            print(render_error(stdout, stem.util.term.Color.BLUE))  # pylint: disable=no-member
        if stderr is not None:
            print(render_error(stderr, stem.util.term.Color.RED))  # pylint: disable=no-member

    returncode = _ZERONET_PROC.returncode
    if returncode is not None and returncode != 0:
        # FIX: report the *captured* output -- the original passed the pipe
        # file objects themselves (and the stderr pipe is always None here,
        # as stderr is redirected to stdout above)
        raise subprocess.CalledProcessError(returncode, _ZERONET_ARGS, stdout, stderr)

    # update flag
    _ZERONET_BS_FLAG = True
def tor_bootstrap():
    """Bootstrap wrapper for Tor.

    The function will bootstrap the Tor proxy. It will retry for
    :data:`~darc.proxy.tor.TOR_RETRY` times in case of failure.

    Also, it will **NOT** re-bootstrap the proxy as is guaranteed by
    :data:`~darc.proxy.tor._TOR_BS_FLAG`.

    Warns:
        TorBootstrapFailed: If failed to bootstrap Tor proxy.

    See Also:
        * :func:`darc.proxy.tor._tor_bootstrap`
        * :data:`darc.proxy.tor.TOR_RETRY`
        * :data:`darc.proxy.tor._TOR_BS_FLAG`

    """
    # don't re-bootstrap
    if _TOR_BS_FLAG:
        return

    print(stem.util.term.format('-*- Tor Bootstrap -*-',
                                stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
    for _ in range(TOR_RETRY + 1):
        try:
            _tor_bootstrap()
        except Exception as error:
            if DEBUG:
                message = '[Error bootstraping Tor proxy]' + os.linesep + traceback.format_exc()
                print(render_error(message, stem.util.term.Color.RED),
                      end='', file=sys.stderr)  # pylint: disable=no-member
            warning = warnings.formatwarning(error, TorBootstrapFailed,
                                             __file__, 170, 'tor_bootstrap()')
            print(render_error(warning, stem.util.term.Color.YELLOW),
                  end='', file=sys.stderr)  # pylint: disable=no-member
        else:
            break
    print(stem.util.term.format('-' * shutil.get_terminal_size().columns,
                                stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
def _freenet_bootstrap() -> None:
    """Freenet bootstrap.

    The bootstrap arguments are defined as
    :data:`~darc.proxy.freenet._FREENET_ARGS`.

    Raises:
        subprocess.CalledProcessError: If the return code of
            :data:`~darc.proxy.freenet._FREENET_PROC` is non-zero.

    See Also:
        * :func:`darc.proxy.freenet.freenet_bootstrap`
        * :data:`darc.proxy.freenet.BS_WAIT`
        * :data:`darc.proxy.freenet._FREENET_BS_FLAG`
        * :data:`darc.proxy.freenet._FREENET_PROC`

    """
    global _FREENET_BS_FLAG, _FREENET_PROC  # pylint: disable=global-statement

    # launch Freenet process
    _FREENET_PROC = subprocess.Popen(  # nosec
        _FREENET_ARGS,
        stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
    )
    try:
        stdout, stderr = _FREENET_PROC.communicate(timeout=BS_WAIT)
    except subprocess.TimeoutExpired as error:
        # the daemon keeps running past the bootstrap wait; use whatever
        # output was captured so far
        stdout, stderr = error.stdout, error.stderr

    if VERBOSE:
        if stdout is not None:
            print(render_error(stdout, stem.util.term.Color.BLUE))  # pylint: disable=no-member
        if stderr is not None:
            print(render_error(stderr, stem.util.term.Color.RED))  # pylint: disable=no-member

    returncode = _FREENET_PROC.returncode
    # FIX: ``returncode`` is None when the process outlived the bootstrap
    # wait (TimeoutExpired path); only a *finished* process with non-zero
    # status is a failure (consistent with ``_zeronet_bootstrap``)
    if returncode is not None and returncode != 0:
        # FIX: use the output captured by ``communicate()`` -- the pipe
        # objects are closed at this point, so ``.read()`` on them raised
        # ``ValueError: I/O operation on closed file``
        raise subprocess.CalledProcessError(returncode, _FREENET_ARGS, stdout, stderr)

    # update flag
    _FREENET_BS_FLAG = True
def load_selenium(check: bool = CHECK) -> typing.List[Link]:
    """Load link from the :mod:`selenium` database.

    Args:
        check: If perform checks on loaded links,
            default to :data:`~darc.const.CHECK`.

    Returns:
        List of loaded links from the :mod:`selenium` database.

    Note:
        At runtime, the function will load links with maximum number
        at :data:`~darc.db.MAX_POOL` to limit the memory usage.

    See Also:
        * :func:`darc.db._load_selenium_db`
        * :func:`darc.db._load_selenium_redis`

    """
    if not FLAG_DB:
        pool = _load_selenium_redis()
    else:
        with database.connection_context():
            try:
                pool = _load_selenium_db()
            except Exception as error:
                # database failure degrades to an empty pool
                warning = warnings.formatwarning(str(error), DatabaseOperaionFailed, __file__, 983,
                                                 '_load_selenium_db()')
                print(render_error(warning, stem.util.term.Color.YELLOW),
                      end='', file=sys.stderr)  # pylint: disable=no-member
                pool = list()

    if check:
        pool = _check(pool)

    if VERBOSE:
        print(stem.util.term.format('-*- [SELENIUM] LINK POOL -*-',
                                    stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        print(render_error(pprint.pformat(sorted(link.url for link in pool)),
                           stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        print(stem.util.term.format('-' * shutil.get_terminal_size().columns,
                                    stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
    return pool
def _redis_command(command: str, *args, **kwargs) -> typing.Any:
    """Wrapper function for Redis command.

    Args:
        command: Command name.
        *args: Arbitrary arguments for the Redis command.

    Keyword Args:
        **kwargs: Arbitrary keyword arguments for the Redis command.

    Return:
        Values returned from the Redis command.

    Warns:
        RedisCommandFailed: Warns at each round when the command failed.

    See Also:
        Between each retry, the function sleeps for :data:`~darc.db.REDIS_RETRY`
        second(s) if such value is **NOT** :data:`None`.

    """
    method = getattr(redis, command)
    arg_repr = None

    while True:
        try:
            return method(*args, **kwargs)
        except Exception as error:
            # build the argument description lazily, once
            if arg_repr is None:
                pieces = [repr(arg) for arg in args]
                pieces.extend(f'{key}={val!r}' for key, val in kwargs.items())
                arg_repr = textwrap.shorten(', '.join(pieces),
                                            shutil.get_terminal_size().columns)
            warning = warnings.formatwarning(
                error, RedisCommandFailed, __file__, 85,
                f'value = redis.{command}({arg_repr})')
            print(render_error(warning, stem.util.term.Color.YELLOW),
                  end='', file=sys.stderr)  # pylint: disable=no-member
            # retry forever, optionally pausing between rounds
            if REDIS_RETRY is not None:
                time.sleep(REDIS_RETRY)
def _load_selenium_redis() -> typing.List[Link]:
    """Load link from the :mod:`selenium` database.

    The function reads the ``queue_selenium`` database.

    Returns:
        List of loaded links from the :mod:`selenium` database.

    Note:
        At runtime, the function will load links with maximum number
        at :data:`~darc.db.MAX_POOL` to limit the memory usage.

    """
    # NOTE: the original docstring documented a ``check`` argument that
    # this function does not take; it has been removed
    now = time.time()
    if TIME_CACHE is None:
        sec_delta = 0
        max_score = now
    else:
        sec_delta = TIME_CACHE.total_seconds()
        # only links whose score (timestamp) is older than the cache window
        max_score = now - sec_delta

    try:
        with _redis_get_lock('lock_queue_selenium', blocking_timeout=LOCK_TIMEOUT):
            link_pool = [
                pickle.loads(link) for link in _redis_command('zrangebyscore', 'queue_selenium',  # nosec
                                                              min=0, max=max_score, start=0, num=MAX_POOL)
            ]
            if TIME_CACHE is not None:
                # push the fetched links' scores into the future so other
                # workers will not pick them up within the cache window
                new_score = now + sec_delta
                _save_selenium_redis(link_pool, score=new_score)  # force update records
    except redis_lock.LockError:
        # could not obtain the queue lock in time; return an empty pool
        warning = warnings.formatwarning(
            f'[SELENIUM] Failed to acquire Redis lock after {LOCK_TIMEOUT} second(s)',
            LockWarning, __file__, 299, "_redis_get_lock('lock_queue_selenium')")
        print(render_error(warning, stem.util.term.Color.YELLOW),
              end='', file=sys.stderr)  # pylint: disable=no-member
        link_pool = list()
    return link_pool
def renew_tor_session():
    """Renew Tor session.

    Lazily connects (and authenticates) the module-level Tor controller
    on first use, then signals ``NEWNYM`` to request a fresh circuit.
    Any failure is reported as a :exc:`TorRenewFailed` warning instead
    of propagating.
    """
    global _TOR_CTRL
    try:
        # Tor controller process
        if _TOR_CTRL is None:
            _TOR_CTRL = stem.control.Controller.from_port(port=int(TOR_CTRL))
            # NOTE(review): if ``authenticate`` fails here, ``_TOR_CTRL``
            # keeps the unauthenticated controller and later calls skip
            # re-authentication -- confirm whether it should be reset
            _TOR_CTRL.authenticate(TOR_PASS)
        _TOR_CTRL.signal(stem.Signal.NEWNYM)  # pylint: disable=no-member
    except Exception as error:
        warning = warnings.formatwarning(
            error, TorRenewFailed, __file__, 88,
            '_TOR_CTRL = stem.control.Controller.from_port(port=int(TOR_CTRL))'
        )
        print(render_error(warning, stem.util.term.Color.YELLOW),
              end='', file=sys.stderr)  # pylint: disable=no-member
def crawler(session: typing.Session, link: Link) -> typing.NoReturn:  # pylint: disable=unused-argument
    """Crawler hook for data URIs.

    The embedded data is persisted straight away; data URIs never yield
    an HTTP response, hence :exc:`LinkNoReturn` is always raised.

    Args:
        session (:class:`requests.Session`): Session object with proxy settings.
        link: Link object to be crawled.

    Raises:
        LinkNoReturn: This link has no return response.

    """
    try:
        save_data(link)
    except ValueError as error:
        message = f'[REQUESTS] Failed to save data URI from {link.url} <{error}>'
        print(render_error(message, stem.util.term.Color.RED),
              file=sys.stderr)  # pylint: disable=no-member
    raise LinkNoReturn
def save_requests(entries: typing.Union[Link, typing.List[Link]], single: bool = False,  # pylint: disable=inconsistent-return-statements
                  score: typing.Optional[float] = None, nx: bool = False, xx: bool = False) -> None:
    """Save link to the :mod:`requests` database.

    The function updates the ``queue_requests`` database.

    Args:
        entries: Links to be added to the :mod:`requests` database.
            It can be either a :obj:`list` of links, or a single
            link string (if ``single`` set as :data:`True`).
        single: Indicate if ``entries`` is a :obj:`list` of links
            or a single link string.
        score: Score to for the Redis sorted set.
        nx: Only create new elements and not to update scores
            for elements that already exist.
        xx: Only update scores of elements that already exist.
            New elements will not be added.

    Notes:
        The ``entries`` will be dumped through :mod:`pickle` so that
        :mod:`darc` do not need to parse them again.

        When ``entries`` is a list of :class:`~darc.link.Link` instances,
        we tries to perform *bulk* update to easy the memory consumption.
        The *bulk* size is defined by :data:`~darc.db.BULK_SIZE`.

    See Also:
        * :func:`darc.db._save_requests_db`
        * :func:`darc.db._save_requests_redis`

    """
    # Redis backend unless the relational database flag is set
    if not FLAG_DB:
        return _save_requests_redis(entries, single, score, nx, xx)

    with database.connection_context():
        try:
            return _save_requests_db(entries, single, score, nx, xx)
        except Exception as error:
            warning = warnings.formatwarning(str(error), DatabaseOperaionFailed, __file__, 505,
                                             '_save_requests_db(...)')
            print(render_error(warning, stem.util.term.Color.YELLOW),
                  end='', file=sys.stderr)  # pylint: disable=no-member
    # database failure: entries are dropped silently (warning already emitted)
    return None
def drop_selenium(link: Link) -> None:  # pylint: disable=inconsistent-return-statements
    """Remove link from the :mod:`selenium` database.

    Args:
        link: Link to be removed.

    See Also:
        * :func:`darc.db._drop_selenium_db`
        * :func:`darc.db._drop_selenium_redis`

    """
    # Redis backend unless the relational database flag is set
    if not FLAG_DB:
        return _drop_selenium_redis(link)

    with database.connection_context():
        try:
            return _drop_selenium_db(link)
        except Exception as error:
            warning = warnings.formatwarning(str(error), DatabaseOperaionFailed, __file__, 433,
                                             f'_drop_selenium_db({link})')
            print(render_error(warning, stem.util.term.Color.YELLOW),
                  end='', file=sys.stderr)  # pylint: disable=no-member
    # database failure: warning already emitted, nothing more to do
    return None
def load_requests(check: bool = CHECK) -> typing.List[Link]:
    """Load link from the :mod:`requests` database.

    Args:
        check: If perform checks on loaded links,
            default to :data:`~darc.const.CHECK`.

    Returns:
        List of loaded links from the :mod:`requests` database.

    Note:
        At runtime, the function will load links with maximum number
        at :data:`~darc.db.MAX_POOL` to limit the memory usage.

    See Also:
        * :func:`darc.db._load_requests_db`
        * :func:`darc.db._load_requests_redis`

    """
    if FLAG_DB:
        with database.connection_context():
            # FIX: mirror ``load_selenium`` -- a database failure now
            # degrades to an empty pool instead of propagating
            try:
                link_pool = _load_requests_db()
            except Exception as error:
                warning = warnings.formatwarning(str(error), DatabaseOperaionFailed, __file__, 955,
                                                 '_load_requests_db()')
                print(render_error(warning, stem.util.term.Color.YELLOW),
                      end='', file=sys.stderr)  # pylint: disable=no-member
                link_pool = list()
    else:
        link_pool = _load_requests_redis()

    if check:
        link_pool = _check(link_pool)

    if VERBOSE:
        print(
            stem.util.term.format('-*- [REQUESTS] LINK POOL -*-',
                                  stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        print(
            render_error(
                pprint.pformat(sorted(link.url for link in link_pool)),
                stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        print(
            stem.util.term.format('-' * shutil.get_terminal_size().columns,
                                  stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
    return link_pool
def _redis_command(command: str, *args, **kwargs) -> typing.Any:  # type: ignore
    """Wrapper function for Redis command.

    Args:
        command: Command name.
        *args: Arbitrary arguments for the Redis command.

    Keyword Args:
        **kwargs: Arbitrary keyword arguments for the Redis command.

    Return:
        Values returned from the Redis command.

    Warns:
        RedisCommandFailed: Warns at each round when the command failed.

    See Also:
        Between each retry, the function sleeps for :data:`~darc.db.RETRY_INTERVAL`
        second(s) if such value is **NOT** :data:`None`.

    """
    method = getattr(redis, command)
    arg_repr = None

    while True:
        try:
            return method(*args, **kwargs)
        except Exception as error:
            # build the argument description lazily, once
            if arg_repr is None:
                arg_repr = _gen_arg_msg(*args, **kwargs)
            warning = warnings.formatwarning(str(error), RedisCommandFailed, __file__, 131,
                                             f'value = redis.{command}({arg_repr})')
            print(render_error(warning, stem.util.term.Color.YELLOW),
                  end='', file=sys.stderr)  # pylint: disable=no-member
            # retry forever, optionally pausing between rounds
            if RETRY_INTERVAL is not None:
                time.sleep(RETRY_INTERVAL)
_FREENET_ARGS = list() else: _FREENET_ARGS = [os.path.join(FREENET_PATH, 'run.sh'), 'start'] _FREENET_ARGS.extend(FREENET_ARGS) if DEBUG: print( stem.util.term.format('-*- FREENET PROXY -*-', stem.util.term.Color.MAGENTA)) # pylint: disable=no-member if _unsupported: print( stem.util.term.format(f'unsupported system: {platform.system()}', stem.util.term.Color.RED)) # pylint: disable=no-member else: print( render_error(pprint.pformat(_FREENET_ARGS), stem.util.term.Color.MAGENTA)) # pylint: disable=no-member print( stem.util.term.format('-' * shutil.get_terminal_size().columns, stem.util.term.Color.MAGENTA)) # pylint: disable=no-member def _freenet_bootstrap() -> None: """Freenet bootstrap. The bootstrap arguments are defined as :data:`~darc.proxy.freenet._FREENET_ARGS`. Raises: subprocess.CalledProcessError: If the return code of :data:`~darc.proxy.freenet._FREENET_PROC` is non-zero. See Also: * :func:`darc.proxy.freenet.freenet_bootstrap`
def loader(link: Link):
    """Single :mod:`selenium` loader for a entry link.

    Args:
        Link: URL to be crawled by :mod:`selenium`.

    The function will first parse the URL using :func:`~darc.link.parse_link`
    and start loading the URL using :mod:`selenium` with Google Chrome.

    At this point, :mod:`darc` will call the customised hook function from
    :mod:`darc.sites` to load and return the original
    :class:`selenium.webdriver.Chrome` object.

    .. note::

       If :exc:`~darc.error.LinkNoReturn` is raised, the link will be
       removed from the :mod:`selenium` database through
       :func:`~darc.db.drop_selenium`.

    If successful, the rendered source HTML document will be saved, and a
    full-page screenshot will be taken and saved.

    .. note::

       When taking full-page screenshot, :func:`~darc.crawl.loader` will
       use :javascript:`document.body.scrollHeight` to get the total
       height of web page. If the page height is *less than* **1,000 pixels**,
       then :mod:`darc` will by default set the height as **1,000 pixels**.

       Later :mod:`darc` will tell :mod:`selenium` to resize the window (in
       *headless* mode) to **1,024 pixels** in width and **110%** of the
       page height in height, and take a *PNG* screenshot.

    If the submission API is provided, :func:`~darc.submit.submit_selenium`
    will be called and submit the document just loaded.

    Later, :func:`~darc.parse.extract_links` will be called then to
    extract all possible links from the HTML document and save such
    links into the :mod:`requests` database (c.f. :func:`~darc.db.save_requests`).

    .. seealso::

       * :data:`darc.const.SE_EMPTY`
       * :data:`darc.const.SE_WAIT`

    """
    print(f'[SELENIUM] Loading {link.url}')
    try:
        # timestamp
        timestamp = datetime.now()

        # retrieve source from Chrome
        with request_driver(link) as driver:
            try:
                # selenium driver hook
                driver = loader_hook(link, driver)
            except urllib3.exceptions.HTTPError as error:
                # transport-level failure; requeue for retry
                print(render_error(
                    f'[SELENIUM] Fail to load {link.url} <{error}>',
                    stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                save_selenium(link, single=True)
                return
            except selenium.common.exceptions.WebDriverException as error:
                # driver-level failure; requeue for retry
                print(render_error(
                    f'[SELENIUM] Fail to load {link.url} <{error}>',
                    stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                save_selenium(link, single=True)
                return
            except LinkNoReturn:
                # site hook declared this link yields nothing; drop it
                print(render_error(
                    f'[SELENIUM] Removing from database: {link.url}',
                    stem.util.term.Color.YELLOW), file=sys.stderr)  # pylint: disable=no-member
                drop_selenium(link)
                return

            # get HTML source
            html = driver.page_source

            if html == SE_EMPTY:
                # driver returned its "empty page" sentinel; requeue for retry
                print(render_error(f'[SELENIUM] Empty page from {link.url}',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                save_selenium(link, single=True)
                return

            # screenshot stays None if taking it fails; submission still happens
            screenshot = None
            try:
                # get maximum height
                height = driver.execute_script('return document.body.scrollHeight')
                # resize window (with some magic numbers)
                if height < 1000:
                    height = 1000
                driver.set_window_size(1024, math.ceil(height * 1.1))
                # take a full page screenshot
                screenshot = driver.get_screenshot_as_base64()
            except Exception as error:
                print(render_error(
                    f'[SELENIUM] Fail to save screenshot from {link.url} <{error}>',
                    stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member

            # submit data
            submit_selenium(timestamp, link, html, screenshot)

            # add link to queue
            save_requests(extract_links(link, html), score=0, nx=True)
    except Exception:
        # catch-all: log the traceback and requeue the link for retry
        error = f'[Error from {link.url}]' + os.linesep + traceback.format_exc() + '-' * shutil.get_terminal_size().columns  # pylint: disable=line-too-long
        print(render_error(error, stem.util.term.Color.CYAN), file=sys.stderr)  # pylint: disable=no-member
        save_selenium(link, single=True)
    print(f'[SELENIUM] Loaded {link.url}')
def fetch_hosts(link: Link):
    """Fetch ``hosts.txt``.

    The function reads the cached ``hosts.txt`` of the link's site if one
    exists, otherwise subscribes and saves it through an I2P session; the
    links read from the file are then queued for crawling.

    Args:
        link: Link object to fetch for its ``hosts.txt``.

    """
    hosts_path = have_hosts(link)
    if hosts_path is not None:
        print(
            stem.util.term.format(f'[HOSTS] Cached {link.url}',
                                  stem.util.term.Color.YELLOW))  # pylint: disable=no-member
        with open(hosts_path) as hosts_file:
            hosts_text = hosts_file.read()
    else:
        from darc.requests import i2p_session  # pylint: disable=import-outside-toplevel

        hosts_link = parse_link(urljoin(link.url, '/hosts.txt'))
        print(f'[HOSTS] Subscribing {hosts_link.url}')

        with i2p_session() as session:
            try:
                response = session.get(hosts_link.url)
            except requests.RequestException as error:
                print(render_error(
                    f'[HOSTS] Failed on {hosts_link.url} <{error}>',
                    stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                return

        if not response.ok:
            print(render_error(
                f'[HOSTS] Failed on {hosts_link.url} [{response.status_code}]',
                stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
            return

        ct_type = get_content_type(response)
        if ct_type not in ['text/text', 'text/plain']:
            # FIX: closing parenthesis was missing after {ct_type}
            print(render_error(
                f'[HOSTS] Unresolved content type on {hosts_link.url} ({ct_type})',
                stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
            return

        hosts_text = response.text
        save_hosts(hosts_link, hosts_text)
        print(f'[HOSTS] Subscribed {hosts_link.url}')

    from darc.db import save_requests  # pylint: disable=import-outside-toplevel

    # add link to queue
    save_requests(read_hosts(hosts_text))
def submit_new_host(time: typing.Datetime, link: Link, partial: bool = False):
    """Submit new host.

    When a new host is discovered, the :mod:`darc` crawler will submit the
    host information. Such includes ``robots.txt`` (if exists) and
    ``sitemap.xml`` (if any).

    Args:
        time (datetime.datetime): Timestamp of submission.
        link: Link object of submission.
        partial: If the data is not complete, i.e. failed when fetching
            ``robots.txt``, ``hosts.txt`` and/or sitemaps.

    If :data:`~darc.submit.API_NEW_HOST` is :data:`None`, the data for
    submission will directly be save through :func:`~darc.submit.save_submit`.

    The data submitted should have following format::

        {
            // partial flag - true / false
            "$PARTIAL$": ...,
            // metadata of URL
            "[metadata]": {
                // original URL - <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
                "url": ...,
                // proxy type - null / tor / i2p / zeronet / freenet
                "proxy": ...,
                // hostname / netloc, c.f. ``urllib.parse.urlparse``
                "host": ...,
                // base folder, relative path (to data root path ``PATH_DATA``) in containter - <proxy>/<scheme>/<host>
                "base": ...,
                // sha256 of URL as name for saved files (timestamp is in ISO format)
                //   JSON log as this one - <base>/<name>_<timestamp>.json
                //   HTML from requests - <base>/<name>_<timestamp>_raw.html
                //   HTML from selenium - <base>/<name>_<timestamp>.html
                //   generic data files - <base>/<name>_<timestamp>.dat
                "name": ...
            },
            // requested timestamp in ISO format as in name of saved file
            "Timestamp": ...,
            // original URL
            "URL": ...,
            // robots.txt from the host (if not exists, then ``null``)
            "Robots": {
                // path of the file, relative path (to data root path ``PATH_DATA``) in container
                //   - <proxy>/<scheme>/<host>/robots.txt
                "path": ...,
                // content of the file (**base64** encoded)
                "data": ...,
            },
            // sitemaps from the host (if none, then ``null``)
            "Sitemaps": [
                {
                    // path of the file, relative path (to data root path ``PATH_DATA``) in container
                    //   - <proxy>/<scheme>/<host>/sitemap_<name>.txt
                    "path": ...,
                    // content of the file (**base64** encoded)
                    "data": ...,
                },
                ...
            ],
            // hosts.txt from the host (if proxy type is ``i2p``; if not exists, then ``null``)
            "Hosts": {
                // path of the file, relative path (to data root path ``PATH_DATA``) in container
                //   - <proxy>/<scheme>/<host>/hosts.txt
                "path": ...,
                // content of the file (**base64** encoded)
                "data": ...,
            }
        }

    See Also:
        * :data:`darc.submit.API_NEW_HOST`
        * :func:`darc.submit.submit`
        * :func:`darc.submit.save_submit`
        * :func:`darc.submit.get_metadata`
        * :func:`darc.submit.get_robots`
        * :func:`darc.submit.get_sitemap`
        * :func:`darc.submit.get_hosts`

    """
    metadata = get_metadata(link)
    ts = time.isoformat()

    # gather per-host artefacts (each may be None when unavailable)
    robots = get_robots(link)
    sitemap = get_sitemap(link)
    hosts = get_hosts(link)

    if SAVE_DB:
        model, _ = HostnameModel.get_or_create(hostname=link.host, defaults=dict(
            proxy=HostnameModel.Proxy[link.proxy.upper()],
            discovery=time,
            last_seen=time,
            alive=False,
            since=time,
        ))

        if robots is not None:
            RobotsModel.create(
                host=model,
                timestamp=time,
                document=base64.b64decode(robots['data']).decode(),
            )
        if sitemap is not None:
            SitemapModel.create(
                host=model,
                timestamp=time,
                document=base64.b64decode(sitemap['data']).decode(),
            )
        if hosts is not None:
            HostsModel.create(
                host=model,
                timestamp=time,
                document=base64.b64decode(hosts['data']).decode(),
            )

    data = {
        '$PARTIAL$': partial,
        '[metadata]': metadata,
        'Timestamp': ts,
        # NOTE(review): the payload spec above says "original URL" but the
        # hostname is submitted here -- confirm which is intended
        'URL': link.host,
        'Robots': robots,
        'Sitemaps': sitemap,
        'Hosts': hosts,
    }

    if DEBUG:
        print(
            stem.util.term.format('-*- NEW HOST DATA -*-',
                                  stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        print(render_error(pprint.pformat(data), stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        print(
            stem.util.term.format('-' * shutil.get_terminal_size().columns,
                                  stem.util.term.Color.MAGENTA))  # pylint: disable=no-member

    if API_NEW_HOST is None:
        save_submit('new_host', data)
        return

    # submit data
    submit(API_NEW_HOST, 'new_host', data)
def submit_requests(time: typing.Datetime, link: Link,
                    response: typing.Response, session: typing.Session,
                    content: bytes, mime_type: str, html: bool = True) -> None:
    """Submit requests data.

    When crawling, we'll first fetch the URL using :mod:`requests`, to check
    its availability and to save its HTTP headers information. Such
    information will be submitted to the web UI.

    Args:
        time (datetime.datetime): Timestamp of submission.
        link: Link object of submission.
        response (requests.Response): Response object of submission.
        session (requests.Session): Session object of submission.
        content: Raw content from the response.
        mime_type: Content type.
        html: If current document is HTML or other files.

    If :data:`~darc.submit.API_REQUESTS` is :data:`None`, the data for
    submission will directly be save through
    :func:`~darc.submit.save_submit`.

    The data submitted should have following format::

        {
            // metadata of URL
            "[metadata]": {
                // original URL - <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
                "url": ...,
                // proxy type - null / tor / i2p / zeronet / freenet
                "proxy": ...,
                // hostname / netloc, c.f. ``urllib.parse.urlparse``
                "host": ...,
                // base folder, relative path (to data root path ``PATH_DATA``) in container - <proxy>/<scheme>/<host>
                "base": ...,
                // sha256 of URL as name for saved files (timestamp is in ISO format)
                // JSON log as this one - <base>/<name>_<timestamp>.json
                // HTML from requests - <base>/<name>_<timestamp>_raw.html
                // HTML from selenium - <base>/<name>_<timestamp>.html
                // generic data files - <base>/<name>_<timestamp>.dat
                "name": ...
            },
            // requested timestamp in ISO format as in name of saved file
            "Timestamp": ...,
            // original URL
            "URL": ...,
            // request method
            "Method": "GET",
            // response status code
            "Status-Code": ...,
            // response reason
            "Reason": ...,
            // response cookies (if any)
            "Cookies": {
                ...
            },
            // session cookies (if any)
            "Session": {
                ...
            },
            // request headers (if any)
            "Request": {
                ...
            },
            // response headers (if any)
            "Response": {
                ...
            },
            // Content type
            "Content-Type": ...,
            // requested file (if not exists, then ``null``)
            "Document": {
                // path of the file, relative path (to data root path ``PATH_DATA``) in container
                // - <proxy>/<scheme>/<host>/<name>_<timestamp>_raw.html
                // or if the document is of generic content type, i.e. not HTML
                // - <proxy>/<scheme>/<host>/<name>_<timestamp>.dat
                "path": ...,
                // content of the file (**base64** encoded)
                "data": ...,
            },
            // redirection history (if any)
            "History": [
                // same record data as the original response
                {"...": "..."}
            ]
        }

    See Also:
        * :data:`darc.submit.API_REQUESTS`
        * :func:`darc.submit.submit`
        * :func:`darc.submit.save_submit`
        * :func:`darc.submit.get_metadata`
        * :func:`darc.submit.get_raw`
        * :func:`darc.crawl.crawler`

    """
    metadata = get_metadata(link)
    ts = time.isoformat()

    # document path mirrors what darc.save uses when writing to disk
    if html:
        path = f'{link.base}/{link.name}_{ts}_raw.html'
    else:
        path = f'{link.base}/{link.name}_{ts}.dat'

    if SAVE_DB:
        url, _ = URLModel.get_or_create(hash=link.name, defaults=dict(
            url=link.url,
            hostname=HostnameModel.get(HostnameModel.hostname == link.host),
            proxy=URLModel.Proxy[link.proxy.upper()],
            discovery=time,
            last_seen=time,
            alive=False,
            since=time,
        ))
        model = RequestsModel.create(
            url=url,
            timestamp=time,
            method=response.request.method,
            document=content,
            mime_type=mime_type,
            is_html=html,
            status_code=response.status_code,
            reason=response.reason,
            cookies=response.cookies.get_dict(),
            # BUG FIX: session cookies must come from the *session* object;
            # previously this duplicated ``response.cookies`` and the
            # ``session`` parameter went unused here (the JSON payload below
            # already used ``session.cookies``)
            session=session.cookies.get_dict(),
            request=dict(response.request.headers),
            response=dict(response.headers),
        )

        # persist each hop of the redirection chain
        for index, history in enumerate(response.history):
            RequestsHistoryModel.create(
                index=index,
                model=model,
                url=history.url,
                timestamp=time,
                method=history.request.method,
                document=history.content,
                status_code=history.status_code,
                reason=history.reason,
                cookies=history.cookies.get_dict(),
                request=dict(history.request.headers),
                response=dict(history.headers),
            )

    data = {
        '[metadata]': metadata,
        'Timestamp': ts,
        'URL': link.url,
        'Method': response.request.method,
        'Status-Code': response.status_code,
        'Reason': response.reason,
        'Cookies': response.cookies.get_dict(),
        'Session': session.cookies.get_dict(),
        'Request': dict(response.request.headers),
        'Response': dict(response.headers),
        'Content-Type': mime_type,
        'Document': dict(
            path=os.path.relpath(path, PATH_DB),
            data=base64.b64encode(content).decode(),
        ),
        'History': [{
            'URL': history.url,
            'Method': history.request.method,
            'Status-Code': history.status_code,
            'Reason': history.reason,
            'Cookies': history.cookies.get_dict(),
            'Request': dict(history.request.headers),
            'Response': dict(history.headers),
            'Document': base64.b64encode(history.content).decode(),
        } for history in response.history],
    }

    if DEBUG:
        print(stem.util.term.format('-*- REQUESTS DATA -*-',
                                    stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        print(render_error(pprint.pformat(data), stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        print(stem.util.term.format('-' * shutil.get_terminal_size().columns,
                                    stem.util.term.Color.MAGENTA))  # pylint: disable=no-member

    # no API endpoint configured: fall back to local submission archive
    if API_REQUESTS is None:
        save_submit('requests', data)
        return

    # submit data
    submit(API_REQUESTS, 'requests', data)
def submit_selenium(time: typing.Datetime, link: Link,
                    html: str, screenshot: typing.Optional[str]) -> None:
    """Submit selenium data.

    After crawling with :mod:`requests`, we'll then render the URL using
    :mod:`selenium` with Google Chrome and its web driver, to provide a
    fully rendered web page. Such information will be submitted to the
    web UI.

    Args:
        time (datetime.datetime): Timestamp of submission.
        link: Link object of submission.
        html: HTML source of the web page.
        screenshot: *base64* encoded screenshot.

    If :data:`~darc.submit.API_SELENIUM` is :data:`None`, the data for
    submission will directly be save through
    :func:`~darc.submit.save_submit`.

    Note:
        This information is optional, only provided if the content type
        from :mod:`requests` is HTML, status code not between ``400`` and
        ``600``, and HTML data not empty.

    The data submitted should have following format::

        {
            // metadata of URL
            "[metadata]": {
                // original URL - <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
                "url": ...,
                // proxy type - null / tor / i2p / zeronet / freenet
                "proxy": ...,
                // hostname / netloc, c.f. ``urllib.parse.urlparse``
                "host": ...,
                // base folder, relative path (to data root path ``PATH_DATA``) in container - <proxy>/<scheme>/<host>
                "base": ...,
                // sha256 of URL as name for saved files (timestamp is in ISO format)
                // JSON log as this one - <base>/<name>_<timestamp>.json
                // HTML from requests - <base>/<name>_<timestamp>_raw.html
                // HTML from selenium - <base>/<name>_<timestamp>.html
                // generic data files - <base>/<name>_<timestamp>.dat
                "name": ...
            },
            // requested timestamp in ISO format as in name of saved file
            "Timestamp": ...,
            // original URL
            "URL": ...,
            // rendered HTML document (if not exists, then ``null``)
            "Document": {
                // path of the file, relative path (to data root path ``PATH_DATA``) in container
                // - <proxy>/<scheme>/<host>/<name>_<timestamp>.html
                "path": ...,
                // content of the file (**base64** encoded)
                "data": ...,
            },
            // web page screenshot (if not exists, then ``null``)
            "Screenshot": {
                // path of the file, relative path (to data root path ``PATH_DATA``) in container
                // - <proxy>/<scheme>/<host>/<name>_<timestamp>.png
                "path": ...,
                // content of the file (**base64** encoded)
                "data": ...,
            }
        }

    See Also:
        * :data:`darc.submit.API_SELENIUM`
        * :func:`darc.submit.submit`
        * :func:`darc.submit.save_submit`
        * :func:`darc.submit.get_metadata`
        * :func:`darc.submit.get_html`
        * :func:`darc.submit.get_screenshot`
        * :func:`darc.crawl.loader`

    """
    metadata = get_metadata(link)
    ts = time.isoformat()

    # screenshot is optional; keep the payload field ``null`` when absent
    if screenshot is None:
        ss = None
    else:
        ss = dict(
            path=os.path.relpath(f'{link.base}/{link.name}_{ts}.png', PATH_DB),
            data=screenshot,
        )

    if SAVE_DB:
        SeleniumModel.create(
            url=URLModel.get(URLModel.hash == link.name),
            timestamp=time,
            document=html,
            # BUG FIX: ``screenshot`` may be ``None`` (see signature);
            # unconditionally decoding raised ``TypeError`` before
            screenshot=None if screenshot is None else base64.b64decode(screenshot),
        )

    data = {
        '[metadata]': metadata,
        'Timestamp': ts,
        'URL': link.url,
        'Document': dict(
            path=os.path.relpath(f'{link.base}/{link.name}_{ts}.html', PATH_DB),
            data=base64.b64encode(html.encode()).decode(),
        ),
        'Screenshot': ss,
    }

    if DEBUG:
        print(stem.util.term.format('-*- SELENIUM DATA -*-',
                                    stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        print(render_error(pprint.pformat(data), stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        print(stem.util.term.format('-' * shutil.get_terminal_size().columns,
                                    stem.util.term.Color.MAGENTA))  # pylint: disable=no-member

    # no API endpoint configured: fall back to local submission archive
    if API_SELENIUM is None:
        save_submit('selenium', data)
        return

    # submit data
    # BUG FIX: previously submitted to API_REQUESTS; selenium data goes to
    # the selenium endpoint, matching save_submit('selenium', ...) above
    submit(API_SELENIUM, 'selenium', data)
def main(argv: typing.Optional[typing.List[str]] = None) -> int:
    """Entrypoint.

    Parses command line arguments, records the daemon PID, initialises the
    task-queue backend (Redis or database tables), seeds the link database
    from CLI arguments and/or link files, then hands off to :func:`process`.

    Args:
        argv: Optional command line arguments.

    Returns:
        Exit code.

    """
    parser = get_parser()
    args = parser.parse_args(argv)

    # record our PID so external scripts (healthchecks, signal senders) can find us
    pid = os.getpid()
    with open(PATH_ID, 'w') as file:
        print(pid, file=file)

    # wait for Redis
    if _WAIT_REDIS:
        if not FLAG_DB:
            # also publish the PID through Redis when Redis is the queue backend
            _redis_command('set', 'darc', pid)

    if FLAG_DB:
        # retry until the task-queue tables are created successfully;
        # failures are surfaced as warnings and the attempt is repeated
        while True:
            try:
                with DB:
                    _db_operation(DB.create_tables, [
                        HostnameQueueModel, RequestsQueueModel, SeleniumQueueModel,
                    ])
            except Exception as error:
                # NOTE: 'DatabaseOperaionFailed' typo is in the upstream class name
                warning = warnings.formatwarning(error, DatabaseOperaionFailed, __file__, 102,  # type: ignore[arg-type]
                                                 'DB.create_tables([HostnameQueueModel, ...])')
                print(render_error(warning, stem.util.term.Color.YELLOW), end='', file=sys.stderr)  # pylint: disable=no-member
                continue
            break

    if SAVE_DB:
        # same retry loop for the web-backend (data) tables
        while True:
            try:
                with DB_WEB:
                    _db_operation(DB_WEB.create_tables, [
                        HostnameModel, URLModel,
                        RobotsModel, SitemapModel, HostsModel,
                        RequestsModel, RequestsHistoryModel, SeleniumModel,
                    ])
            except Exception as error:
                warning = warnings.formatwarning(error, DatabaseOperaionFailed, __file__, 117,  # type: ignore[arg-type]
                                                 'DB.create_tables([HostnameModel, ...])')
                print(render_error(warning, stem.util.term.Color.YELLOW), end='', file=sys.stderr)  # pylint: disable=no-member
                continue
            break

    if DEBUG:
        print(stem.util.term.format('-*- Initialisation -*-',
                                    stem.util.term.Color.MAGENTA))  # pylint: disable=no-member

    # nuke the db
    if not FLAG_DB:
        _redis_command('delete', 'queue_hostname')
        _redis_command('delete', 'queue_requests')
        _redis_command('delete', 'queue_selenium')

    # collect seed links from positional CLI arguments (blank entries dropped)
    link_list = list()
    for link in filter(None, map(lambda s: s.strip(), args.link)):  # type: ignore[name-defined,var-annotated]
        if DEBUG:
            print(stem.util.term.format(link, stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        link_list.append(link)

    # collect seed links from link files (one URL per line, '#' comments skipped)
    if args.file is not None:
        for path in args.file:
            with open(path) as file:
                for line in filter(None, map(lambda s: s.strip(), file)):
                    if line.startswith('#'):
                        continue
                    if DEBUG:
                        print(stem.util.term.format(line, stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
                    link_list.append(line)

    # write to database
    link_pool = [parse_link(link) for link in link_list]
    save_requests(link_pool, score=0, nx=True)

    if DEBUG:
        print(stem.util.term.format('-' * shutil.get_terminal_size().columns,
                                    stem.util.term.Color.MAGENTA))  # pylint: disable=no-member

    # init link file (CSV header only written once)
    if not os.path.isfile(PATH_LN):
        with open(PATH_LN, 'w') as file:
            print('proxy,scheme,host,hash,link', file=file)

    # run the main worker loop; any exception (including KeyboardInterrupt)
    # is printed, then cleanup runs unconditionally via _exit()
    try:
        process(args.type)
    except BaseException:
        traceback.print_exc()
    _exit()

    return 0
_TOR_CTRL = None # Tor daemon process _TOR_PROC = None # Tor bootstrap config _TOR_CONFIG = { 'SocksPort': TOR_PORT, 'ControlPort': TOR_CTRL, } _TOR_CONFIG.update(TOR_CFG) if DEBUG: print( stem.util.term.format('-*- TOR PROXY -*-', stem.util.term.Color.MAGENTA)) # pylint: disable=no-member print( render_error(pprint.pformat(_TOR_CONFIG), stem.util.term.Color.MAGENTA)) # pylint: disable=no-member print( stem.util.term.format('-' * shutil.get_terminal_size().columns, stem.util.term.Color.MAGENTA)) # pylint: disable=no-member def renew_tor_session(): """Renew Tor session.""" global _TOR_CTRL try: # Tor controller process if _TOR_CTRL is None: _TOR_CTRL = stem.control.Controller.from_port(port=int(TOR_CTRL)) _TOR_CTRL.authenticate(TOR_PASS) _TOR_CTRL.signal(stem.Signal.NEWNYM) # pylint: disable=no-member
os.makedirs(PATH_MISC, exist_ok=True) # link file mapping PATH_LN = os.path.join(PATH_DB, 'link.csv') # PID file PATH_ID = os.path.join(PATH_DB, 'darc.pid') # extract link pattern _LINK_WHITE_LIST = json.loads(os.getenv('LINK_WHITE_LIST', '[]')) if DEBUG: print( stem.util.term.format('-*- LINK WHITE LIST -*-', stem.util.term.Color.MAGENTA)) # pylint: disable=no-member print( render_error(pprint.pformat(_LINK_WHITE_LIST), stem.util.term.Color.MAGENTA)) # pylint: disable=no-member print( stem.util.term.format('-' * shutil.get_terminal_size().columns, stem.util.term.Color.MAGENTA)) # pylint: disable=no-member LINK_WHITE_LIST = [ re.compile(link, re.IGNORECASE) for link in _LINK_WHITE_LIST ] # link black list _LINK_BLACK_LIST = json.loads(os.getenv('LINK_BLACK_LIST', '[]')) if DEBUG: print( stem.util.term.format('-*- LINK BLACK LIST -*-', stem.util.term.Color.MAGENTA)) # pylint: disable=no-member print( render_error(pprint.pformat(_LINK_BLACK_LIST),
def fetch_sitemap(link: Link, force: bool = False) -> None:
    """Fetch sitemap.

    The function will first fetch the ``robots.txt``, then
    fetch the sitemaps accordingly. Discovered sitemap URLs are
    saved to cache and the links they contain are queued for
    crawling; nothing is returned.

    Args:
        link: Link object to fetch for its sitemaps.
        force: Force refetch its sitemaps.

    See Also:
        * :func:`darc.proxy.null.read_robots`
        * :func:`darc.proxy.null.read_sitemap`
        * :func:`darc.parse.get_sitemap`

    """
    if force:
        print(stem.util.term.format(f'[ROBOTS] Force refetch {link.url}',
                                    stem.util.term.Color.YELLOW))  # pylint: disable=no-member

    # use the cached robots.txt unless a refetch is forced
    robots_path = None if force else have_robots(link)
    if robots_path is not None:
        print(stem.util.term.format(f'[ROBOTS] Cached {link.url}',
                                    stem.util.term.Color.YELLOW))  # pylint: disable=no-member
        with open(robots_path) as file:
            robots_text = file.read()
    else:
        robots_link = parse_link(urljoin(link.url, '/robots.txt'))
        print(f'[ROBOTS] Checking {robots_link.url}')

        with request_session(robots_link) as session:
            try:
                response = session.get(robots_link.url)
            except requests.RequestException as error:
                # network failure: give up on this host entirely
                print(render_error(f'[ROBOTS] Failed on {robots_link.url} <{error}>',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                return

        if response.ok:
            ct_type = get_content_type(response)
            # only accept plain-text robots.txt; anything else is treated as absent
            if ct_type not in ['text/text', 'text/plain']:
                print(render_error(f'[ROBOTS] Unresolved content type on {robots_link.url} ({ct_type})',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                robots_text = ''
            else:
                robots_text = response.text
                save_robots(robots_link, robots_text)
                print(f'[ROBOTS] Checked {robots_link.url}')
        else:
            print(render_error(f'[ROBOTS] Failed on {robots_link.url} [{response.status_code}]',
                               stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
            robots_text = ''

    if force:
        print(stem.util.term.format(f'[SITEMAP] Force refetch {link.url}',
                                    stem.util.term.Color.YELLOW))  # pylint: disable=no-member

    # sitemap URLs advertised by robots.txt; the list is deliberately
    # extended *while iterating* so that sitemap-index files (sitemaps
    # referencing further sitemaps) are processed breadth-first
    sitemaps = read_robots(link, robots_text, host=link.host)
    for sitemap_link in sitemaps:
        sitemap_path = None if force else have_sitemap(sitemap_link)
        if sitemap_path is not None:
            print(stem.util.term.format(f'[SITEMAP] Cached {sitemap_link.url}',
                                        stem.util.term.Color.YELLOW))  # pylint: disable=no-member
            with open(sitemap_path) as file:
                sitemap_text = file.read()
        else:
            print(f'[SITEMAP] Fetching {sitemap_link.url}')

            with request_session(sitemap_link) as session:
                try:
                    response = session.get(sitemap_link.url)
                except requests.RequestException as error:
                    print(render_error(f'[SITEMAP] Failed on {sitemap_link.url} <{error}>',
                                       stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                    continue

            if not response.ok:
                print(render_error(f'[SITEMAP] Failed on {sitemap_link.url} [{response.status_code}]',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                continue

            # check content type
            ct_type = get_content_type(response)
            if ct_type == 'application/gzip':
                # gzipped sitemap; fall back to the raw text if the
                # decompressed payload is not valid unicode
                try:
                    sitemap_text = gzip.decompress(response.content).decode()
                except UnicodeDecodeError:
                    sitemap_text = response.text
            elif ct_type in ['text/xml', 'text/html']:
                sitemap_text = response.text
                save_sitemap(sitemap_link, sitemap_text)
            else:
                print(render_error(f'[SITEMAP] Unresolved content type on {sitemap_link.url} ({ct_type})',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                continue

            print(f'[SITEMAP] Fetched {sitemap_link.url}')

        # get more sitemaps (appends to the list being iterated — see note above)
        sitemaps.extend(get_sitemap(sitemap_link, sitemap_text, host=link.host))

        # add link to queue
        save_requests(read_sitemap(link, sitemap_text))
def crawler(link: Link):
    """Single :mod:`requests` crawler for an entry link.

    Args:
        link: URL to be crawled by :mod:`requests`.

    The function will first parse the URL using :func:`~darc.link.parse_link`,
    and check if need to crawl the URL (c.f. :data:`~darc.const.PROXY_WHITE_LIST`,
    :data:`~darc.const.PROXY_BLACK_LIST`, :data:`~darc.const.LINK_WHITE_LIST` and
    :data:`~darc.const.LINK_BLACK_LIST`); if true, then crawl the URL with
    :mod:`requests`.

    If the URL is from a brand new host, :mod:`darc` will first try to fetch
    and save ``robots.txt`` and sitemaps of the host (c.f.
    :func:`~darc.proxy.null.save_robots` and :func:`~darc.proxy.null.save_sitemap`),
    and extract then save the links from sitemaps (c.f.
    :func:`~darc.proxy.null.read_sitemap`) into link database for future crawling
    (c.f. :func:`~darc.db.save_requests`).

    .. note::

        A host is new if :func:`~darc.db.have_hostname` returns :data:`True`.

        If :func:`darc.proxy.null.fetch_sitemap` and/or
        :func:`darc.proxy.i2p.fetch_hosts` failed when fetching such documents,
        the host will be removed from the hostname database through
        :func:`~darc.db.drop_hostname`, and considered as new when next
        encounter.

    Also, if the submission API is provided, :func:`~darc.submit.submit_new_host`
    will be called and submit the documents just fetched.

    If ``robots.txt`` presented, and :data:`~darc.const.FORCE` is :data:`False`,
    :mod:`darc` will check if allowed to crawl the URL.

    .. note::

        The root path (e.g. ``/`` in https://www.example.com/) will always
        be crawled ignoring ``robots.txt``.

    At this point, :mod:`darc` will call the customised hook function from
    :mod:`darc.sites` to crawl and get the final response object. :mod:`darc`
    will save the session cookies and header information, using
    :func:`~darc.save.save_headers`.

    .. note::

        If :exc:`requests.exceptions.InvalidSchema` is raised, the link
        will be saved by :func:`~darc.proxy.null.save_invalid`. Further
        processing is dropped, and the link will be removed from the
        :mod:`requests` database through :func:`~darc.db.drop_requests`.

        If :exc:`~darc.error.LinkNoReturn` is raised, the link will be
        removed from the :mod:`requests` database through
        :func:`~darc.db.drop_requests`.

    If the content type of response document is not ignored (c.f.
    :data:`~darc.const.MIME_WHITE_LIST` and :data:`~darc.const.MIME_BLACK_LIST`),
    :func:`~darc.submit.submit_requests` will be called and submit the document
    just fetched.

    If the response document is HTML (``text/html`` and ``application/xhtml+xml``),
    :func:`~darc.parse.extract_links` will be called then to extract all possible
    links from the HTML document and save such links into the database
    (c.f. :func:`~darc.db.save_requests`).

    And if the response status code is between ``400`` and ``600``, the URL
    will be saved back to the link database (c.f. :func:`~darc.db.save_requests`).
    If **NOT**, the URL will be saved into :mod:`selenium` link database to
    proceed next steps (c.f. :func:`~darc.db.save_selenium`).

    """
    print(f'[REQUESTS] Requesting {link.url}')
    try:
        # proxy-type filter: drop the link permanently if its proxy is ignored
        if match_proxy(link.proxy):
            print(render_error(f'[REQUESTS] Ignored proxy type from {link.url} ({link.proxy})',
                               stem.util.term.Color.YELLOW), file=sys.stderr)  # pylint: disable=no-member
            drop_requests(link)
            return

        # hostname filter: drop the link permanently if its host is ignored
        # NOTE(review): the message interpolates link.proxy although the
        # filter matched link.host — confirm this is intended
        if match_host(link.host):
            print(render_error(f'[REQUESTS] Ignored hostname from {link.url} ({link.proxy})',
                               stem.util.term.Color.YELLOW), file=sys.stderr)  # pylint: disable=no-member
            drop_requests(link)
            return

        # timestamp
        timestamp = datetime.now()

        # if it's a new host
        if not have_hostname(link):
            partial = False

            if link.proxy not in ('zeronet', 'freenet'):
                # fetch sitemap.xml (best-effort: failure only marks the host partial)
                try:
                    fetch_sitemap(link)
                except Exception:
                    error = f'[Error fetching sitemap of {link.url}]' + os.linesep + traceback.format_exc() + '-' * shutil.get_terminal_size().columns  # pylint: disable=line-too-long
                    print(render_error(error, stem.util.term.Color.CYAN), file=sys.stderr)  # pylint: disable=no-member
                    partial = True

            if link.proxy == 'i2p':
                # fetch hosts.txt (I2P address-book subscription, also best-effort)
                try:
                    fetch_hosts(link)
                except Exception:
                    error = f'[Error subscribing hosts from {link.url}]' + os.linesep + traceback.format_exc() + '-' * shutil.get_terminal_size().columns  # pylint: disable=line-too-long
                    print(render_error(error, stem.util.term.Color.CYAN), file=sys.stderr)  # pylint: disable=no-member
                    partial = True

            # submit data / drop hostname from db
            # (a partial host is forgotten so it is retried as new next time)
            if partial:
                drop_hostname(link)
            submit_new_host(timestamp, link, partial=partial)

        # robots.txt check (skipped when FORCE is set); the link stays in the
        # queue — note there is no drop_requests here
        if not FORCE and not check_robots(link):
            print(render_error(f'[REQUESTS] Robots disallowed link from {link.url}',
                               stem.util.term.Color.YELLOW), file=sys.stderr)  # pylint: disable=no-member
            return

        with request_session(link) as session:
            try:
                # requests session hook (site-specific crawler from darc.sites)
                response = crawler_hook(link, session)
            except requests.exceptions.InvalidSchema as error:
                # unsupported URL scheme: record it and drop permanently
                print(render_error(f'[REQUESTS] Failed on {link.url} <{error}>',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                save_invalid(link)
                drop_requests(link)
                return
            except requests.RequestException as error:
                # transient network failure: requeue for a later retry
                print(render_error(f'[REQUESTS] Failed on {link.url} <{error}>',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                save_requests(link, single=True)
                return
            except LinkNoReturn:
                # site hook explicitly refused the link: drop permanently
                print(render_error(f'[REQUESTS] Removing from database: {link.url}',
                                   stem.util.term.Color.YELLOW), file=sys.stderr)  # pylint: disable=no-member
                drop_requests(link)
                return

            # save headers
            save_headers(timestamp, link, response, session)

            # check content type
            ct_type = get_content_type(response)
            if ct_type not in ['text/html', 'application/xhtml+xml']:
                print(render_error(f'[REQUESTS] Generic content type from {link.url} ({ct_type})',
                                   stem.util.term.Color.YELLOW), file=sys.stderr)  # pylint: disable=no-member

                # probably hosts.txt (I2P address book served as plain text)
                if link.proxy == 'i2p' and ct_type in ['text/plain', 'text/text']:
                    text = response.text
                    save_requests(read_hosts(text))

                if match_mime(ct_type):
                    drop_requests(link)
                    return

                # submit data (generic, non-HTML document)
                data = response.content
                submit_requests(timestamp, link, response, session, data,
                                mime_type=ct_type, html=False)
                return

            html = response.content
            if not html:
                # empty body: requeue for retry
                print(render_error(f'[REQUESTS] Empty response from {link.url}',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                save_requests(link, single=True)
                return

            # submit data
            submit_requests(timestamp, link, response, session, html,
                            mime_type=ct_type, html=True)

            # add link to queue (links extracted from the HTML document)
            save_requests(extract_links(link, html), score=0, nx=True)

            if not response.ok:
                # error status: keep the link in the requests queue for retry
                print(render_error(f'[REQUESTS] Failed on {link.url} [{response.status_code}]',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                save_requests(link, single=True)
                return

            # add link to queue (hand over to the selenium loader stage)
            save_selenium(link, single=True, score=0, nx=True)
    except Exception:
        # catch-all: log the traceback and requeue so a crash never loses the link
        error = f'[Error from {link.url}]' + os.linesep + traceback.format_exc() + '-' * shutil.get_terminal_size().columns  # pylint: disable=line-too-long
        print(render_error(error, stem.util.term.Color.CYAN), file=sys.stderr)  # pylint: disable=no-member
        save_requests(link, single=True)
    print(f'[REQUESTS] Requested {link.url}')