def have_hostname(link: Link) -> typing.Tuple[bool, bool]:
    """Tell whether the host of ``link`` has been seen before.

    The lookup is served by the database backend when ``FLAG_DB`` is set
    and by the Redis backend otherwise.

    Args:
        link: Link whose hostname is looked up.

    Returns:
        A pair of :obj:`bool` values: whether the link belongs to a known
        host, and whether it needs a forced refetch. If the database
        lookup raises, a warning is printed to ``stderr`` and
        ``(False, False)`` is returned.

    See Also:
        * :func:`darc.db._have_hostname_db`
        * :func:`darc.db._have_hostname_redis`

    """
    if not FLAG_DB:
        return _have_hostname_redis(link)

    with database.connection_context():
        try:
            status = _have_hostname_db(link)
        except Exception as error:
            # Best effort: report the failure and treat the host as unknown.
            message = warnings.formatwarning(str(error), DatabaseOperaionFailed, __file__, 236,
                                             f'_have_hostname_db({link})')
            print(render_error(message, stem.util.term.Color.YELLOW), end='', file=sys.stderr)  # pylint: disable=no-member
            status = (False, False)
        return status
def have_hostname(link: 'Link') -> 'Tuple[bool, bool]':
    """Tell whether the host of ``link`` has been seen before.

    The lookup is served by the database backend when ``FLAG_DB`` is set
    and by the Redis backend otherwise.

    Args:
        link: Link whose hostname is looked up.

    Returns:
        A pair of :obj:`bool` values: whether the link belongs to a known
        host, and whether it needs a forced refetch. If the database
        lookup raises, the failure is logged as a warning and
        ``(False, False)`` is returned.

    See Also:
        * :func:`darc.db._have_hostname_db`
        * :func:`darc.db._have_hostname_redis`

    """
    if not FLAG_DB:
        return _have_hostname_redis(link)

    with database.connection_context():
        try:
            status = _have_hostname_db(link)
        except Exception:
            # Best effort: log the failure and treat the host as unknown.
            logger.pexc(LOG_WARNING, category=DatabaseOperaionFailed,
                        line=f'_have_hostname_db({link.url})')
            status = (False, False)
        return status
def drop_selenium(link: Link) -> None:
    """Remove link from the :mod:`selenium` database.

    The removal is delegated to the database backend when ``FLAG_DB`` is
    set (inside a ``database.connection_context()`` block) and to the
    Redis backend otherwise. Exceptions raised by either backend
    propagate to the caller.

    Args:
        link: Link to be removed.

    See Also:
        * :func:`darc.db._drop_selenium_db`
        * :func:`darc.db._drop_selenium_redis`

    """
    if not FLAG_DB:
        return _drop_selenium_redis(link)

    with database.connection_context():
        return _drop_selenium_db(link)
def drop_hostname(link: Link) -> None:
    """Remove link from the hostname database.

    The removal is delegated to the database backend when ``FLAG_DB`` is
    set (inside a ``database.connection_context()`` block) and to the
    Redis backend otherwise. Exceptions raised by either backend
    propagate to the caller.

    Args:
        link: Link to be removed.

    See Also:
        * :func:`darc.db._drop_hostname_db`
        * :func:`darc.db._drop_hostname_redis`

    """
    if not FLAG_DB:
        return _drop_hostname_redis(link)

    with database.connection_context():
        return _drop_hostname_db(link)
def save_requests(entries: 'Union[Link, List[Link]]', single: bool = False,
                  score: 'Optional[float]' = None, nx: bool = False, xx: bool = False) -> None:
    """Store links in the :mod:`requests` database.

    The function updates the ``queue_requests`` database, using the
    database backend when ``FLAG_DB`` is set and the Redis backend
    otherwise.

    Args:
        entries: Links to be added to the :mod:`requests` database.
            Either a :obj:`list` of links or, when ``single`` is
            :data:`True`, one single link.
        single: Whether ``entries`` is one single link rather than a
            :obj:`list` of links.
        score: Score for the Redis sorted set.
        nx: Only create new elements; do not update scores of elements
            that already exist.
        xx: Only update scores of elements that already exist; new
            elements will not be added.

    Notes:
        The ``entries`` will be dumped through :mod:`pickle` so that
        :mod:`darc` does not need to parse them again.

        When ``entries`` is a list of :class:`~darc.link.Link` instances,
        a *bulk* update is attempted to ease memory consumption. The
        *bulk* size is defined by :data:`~darc.db.BULK_SIZE`.

        If the database backend raises, the failure is logged as a
        warning and :data:`None` is returned.

    See Also:
        * :func:`darc.db._save_requests_db`
        * :func:`darc.db._save_requests_redis`

    """
    if not FLAG_DB:
        return _save_requests_redis(entries, single, score, nx, xx)

    with database.connection_context():
        try:
            return _save_requests_db(entries, single, score, nx, xx)  # type: ignore[call-overload]
        except Exception:
            # Describe the failed call in the log record instead of raising.
            call_repr = _gen_arg_msg(entries, single, score, nx, xx)
            logger.pexc(LOG_WARNING, category=DatabaseOperaionFailed,
                        line=f'_save_requests_db({call_repr})')
            return None
def have_hostname(link: Link) -> bool:
    """Tell whether ``link`` belongs to a new host.

    The lookup is served by the database backend when ``FLAG_DB`` is set
    (inside a ``database.connection_context()`` block) and by the Redis
    backend otherwise.

    Args:
        link: Link whose hostname is looked up.

    Returns:
        Whether such link is a new host, as reported by the selected
        backend.

    See Also:
        * :func:`darc.db._have_hostname_db`
        * :func:`darc.db._have_hostname_redis`

    """
    if not FLAG_DB:
        return _have_hostname_redis(link)

    with database.connection_context():
        return _have_hostname_db(link)
def load_selenium(check: bool = CHECK) -> typing.List[Link]:
    """Fetch links from the :mod:`selenium` database.

    Links come from the database backend when ``FLAG_DB`` is set and
    from the Redis backend otherwise. If the database backend raises, a
    warning is printed to ``stderr`` and an empty pool is used instead.

    Args:
        check: Whether to run checks on the loaded links; defaults to
            :data:`~darc.const.CHECK`.

    Returns:
        List of loaded links from the :mod:`selenium` database.

    Note:
        At runtime, the function will load at most
        :data:`~darc.db.MAX_POOL` links to limit the memory usage.

    See Also:
        * :func:`darc.db._load_selenium_db`
        * :func:`darc.db._load_selenium_redis`

    """
    if FLAG_DB:
        with database.connection_context():
            try:
                pool = _load_selenium_db()
            except Exception as error:
                message = warnings.formatwarning(str(error), DatabaseOperaionFailed, __file__, 983,
                                                 '_load_selenium_db()')
                print(render_error(message, stem.util.term.Color.YELLOW), end='', file=sys.stderr)  # pylint: disable=no-member
                pool = []
    else:
        pool = _load_selenium_redis()

    if check:
        pool = _check(pool)

    if VERBOSE:
        # Dump the sorted URLs of the pool between a banner and a rule.
        magenta = stem.util.term.Color.MAGENTA  # pylint: disable=no-member
        print(stem.util.term.format('-*- [SELENIUM] LINK POOL -*-', magenta))  # pylint: disable=no-member
        urls = sorted(entry.url for entry in pool)
        print(render_error(pprint.pformat(urls), magenta))
        print(stem.util.term.format('-' * shutil.get_terminal_size().columns, magenta))  # pylint: disable=no-member
    return pool
def save_requests(entries: typing.Union[Link, typing.List[Link]], single: bool = False,  # pylint: disable=inconsistent-return-statements
                  score: typing.Optional[float] = None, nx: bool = False, xx: bool = False) -> None:
    """Store links in the :mod:`requests` database.

    The function updates the ``queue_requests`` database, using the
    database backend when ``FLAG_DB`` is set and the Redis backend
    otherwise.

    Args:
        entries: Links to be added to the :mod:`requests` database.
            Either a :obj:`list` of links or, when ``single`` is
            :data:`True`, one single link.
        single: Whether ``entries`` is one single link rather than a
            :obj:`list` of links.
        score: Score for the Redis sorted set.
        nx: Only create new elements; do not update scores of elements
            that already exist.
        xx: Only update scores of elements that already exist; new
            elements will not be added.

    Notes:
        The ``entries`` will be dumped through :mod:`pickle` so that
        :mod:`darc` does not need to parse them again.

        When ``entries`` is a list of :class:`~darc.link.Link` instances,
        a *bulk* update is attempted to ease memory consumption. The
        *bulk* size is defined by :data:`~darc.db.BULK_SIZE`.

        If the database backend raises, a warning is printed to
        ``stderr`` and :data:`None` is returned.

    See Also:
        * :func:`darc.db._save_requests_db`
        * :func:`darc.db._save_requests_redis`

    """
    if not FLAG_DB:
        return _save_requests_redis(entries, single, score, nx, xx)

    with database.connection_context():
        try:
            return _save_requests_db(entries, single, score, nx, xx)
        except Exception as error:
            # Best effort: report the failure on stderr instead of raising.
            message = warnings.formatwarning(str(error), DatabaseOperaionFailed, __file__, 505,
                                             '_save_requests_db(...)')
            print(render_error(message, stem.util.term.Color.YELLOW), end='', file=sys.stderr)  # pylint: disable=no-member
            return None
def drop_selenium(link: Link) -> None:  # pylint: disable=inconsistent-return-statements
    """Delete ``link`` from the :mod:`selenium` database.

    The removal is delegated to the database backend when ``FLAG_DB`` is
    set and to the Redis backend otherwise. If the database backend
    raises, a warning is printed to ``stderr`` and :data:`None` is
    returned.

    Args:
        link: Link to be removed.

    See Also:
        * :func:`darc.db._drop_selenium_db`
        * :func:`darc.db._drop_selenium_redis`

    """
    if not FLAG_DB:
        return _drop_selenium_redis(link)

    with database.connection_context():
        try:
            return _drop_selenium_db(link)
        except Exception as error:
            # Best effort: report the failure on stderr instead of raising.
            message = warnings.formatwarning(str(error), DatabaseOperaionFailed, __file__, 433,
                                             f'_drop_selenium_db({link})')
            print(render_error(message, stem.util.term.Color.YELLOW), end='', file=sys.stderr)  # pylint: disable=no-member
            return None
def drop_selenium(link: 'Link') -> None:  # pylint: disable=inconsistent-return-statements
    """Delete ``link`` from the :mod:`selenium` database.

    The removal is delegated to the database backend when ``FLAG_DB`` is
    set and to the Redis backend otherwise. If the database backend
    raises, the failure is logged as a warning and :data:`None` is
    returned.

    Args:
        link: Link to be removed.

    See Also:
        * :func:`darc.db._drop_selenium_db`
        * :func:`darc.db._drop_selenium_redis`

    """
    if not FLAG_DB:
        return _drop_selenium_redis(link)

    with database.connection_context():
        try:
            outcome = _drop_selenium_db(link)
        except Exception:
            # Best effort: log the failure instead of raising.
            logger.pexc(LOG_WARNING, category=DatabaseOperaionFailed,
                        line=f'_drop_selenium_db({link.url})')
            outcome = None
        return outcome
def drop_hostname(link: 'Link') -> None:
    """Delete ``link`` from the hostname database.

    The removal is delegated to the database backend when ``FLAG_DB`` is
    set and to the Redis backend otherwise. If the database backend
    raises, the failure is logged as a warning and :data:`None` is
    returned.

    Args:
        link: Link to be removed.

    See Also:
        * :func:`darc.db._drop_hostname_db`
        * :func:`darc.db._drop_hostname_redis`

    """
    if not FLAG_DB:
        return _drop_hostname_redis(link)

    with database.connection_context():
        try:
            outcome = _drop_hostname_db(link)
        except Exception:
            # Best effort: log the failure instead of raising.
            logger.pexc(LOG_WARNING, category=DatabaseOperaionFailed,
                        line=f'_drop_hostname_db({link.url})')
            outcome = None
        return outcome
def load_requests(check: bool = CHECK) -> typing.List[Link]:
    """Fetch links from the :mod:`requests` database.

    Links come from the database backend when ``FLAG_DB`` is set (inside
    a ``database.connection_context()`` block) and from the Redis
    backend otherwise.

    Args:
        check: Whether to run checks on the loaded links; defaults to
            :data:`~darc.const.CHECK`.

    Returns:
        List of loaded links from the :mod:`requests` database.

    Note:
        At runtime, the function will load at most
        :data:`~darc.db.MAX_POOL` links to limit the memory usage.

    See Also:
        * :func:`darc.db._load_requests_db`
        * :func:`darc.db._load_requests_redis`

    """
    if FLAG_DB:
        with database.connection_context():
            pool = _load_requests_db()
    else:
        pool = _load_requests_redis()

    if check:
        pool = _check(pool)

    if VERBOSE:
        # Dump the sorted URLs of the pool between a banner and a rule.
        magenta = stem.util.term.Color.MAGENTA  # pylint: disable=no-member
        print(stem.util.term.format('-*- [REQUESTS] LINK POOL -*-', magenta))  # pylint: disable=no-member
        urls = sorted(entry.url for entry in pool)
        print(render_error(pprint.pformat(urls), magenta))
        print(stem.util.term.format('-' * shutil.get_terminal_size().columns, magenta))  # pylint: disable=no-member
    return pool
def save_requests(entries: typing.Union[Link, typing.List[Link]], single: bool = False,
                  score: typing.Optional[float] = None, nx: bool = False, xx: bool = False) -> None:
    """Save link to the :mod:`requests` database.

    The function updates the ``queue_requests`` database, using the
    database backend when ``FLAG_DB`` is set (inside a
    ``database.connection_context()`` block) and the Redis backend
    otherwise. Exceptions raised by either backend propagate to the
    caller.

    Args:
        entries: Links to be added to the :mod:`requests` database.
            Either a :obj:`list` of links or, when ``single`` is
            :data:`True`, one single link.
        single: Whether ``entries`` is one single link rather than a
            :obj:`list` of links.
        score: Score for the Redis sorted set.
        nx: Only create new elements; do not update scores of elements
            that already exist.
        xx: Only update scores of elements that already exist; new
            elements will not be added.

    Notes:
        The ``entries`` will be dumped through :mod:`pickle` so that
        :mod:`darc` does not need to parse them again.

        When ``entries`` is a list of :class:`~darc.link.Link` instances,
        a *bulk* update is attempted to ease memory consumption. The
        *bulk* size is defined by :data:`~darc.db.BULK_SIZE`.

    See Also:
        * :func:`darc.db._save_requests_db`
        * :func:`darc.db._save_requests_redis`

    """
    if not FLAG_DB:
        return _save_requests_redis(entries, single, score, nx, xx)

    with database.connection_context():
        return _save_requests_db(entries, single, score, nx, xx)
def load_requests(check: bool = CHECK) -> 'List[Link]':
    """Fetch links from the :mod:`requests` database.

    Links come from the database backend when ``FLAG_DB`` is set and
    from the Redis backend otherwise. If the database backend raises,
    the failure is logged as a warning and an empty pool is used
    instead.

    Args:
        check: Whether to run checks on the loaded links; defaults to
            :data:`~darc.const.CHECK`.

    Returns:
        List of loaded links from the :mod:`requests` database.

    Note:
        At runtime, the function will load at most
        :data:`~darc.db.MAX_POOL` links to limit the memory usage.

    See Also:
        * :func:`darc.db._load_requests_db`
        * :func:`darc.db._load_requests_redis`

    """
    if FLAG_DB:
        with database.connection_context():
            try:
                pool = _load_requests_db()
            except Exception:
                # Best effort: log the failure and continue with no links.
                logger.pexc(LOG_WARNING, category=DatabaseOperaionFailed,
                            line='_load_requests_db()')
                pool = []
    else:
        pool = _load_requests_redis()

    if check:
        pool = _check(pool)

    # Log the sorted URLs of the resulting pool at verbose level.
    logger.plog(
        LOG_VERBOSE,
        '-*- [REQUESTS] LINK POOL -*-',
        object=sorted(entry.url for entry in pool),
    )
    return pool