def read_hosts(text: str, check: bool = CHECK) -> typing.List[Link]:
    """Extract links from the content of a ``hosts.txt`` file.

    Each line is stripped of surrounding whitespace; blank lines and lines
    starting with ``#`` are ignored. The part of a line before its first
    ``=`` is taken as the host name and kept only when it fully matches
    ``I2P_REGEX``.

    Args:
        text: Content of ``hosts.txt``.
        check: If perform checks on extracted links,
            default to :data:`~darc.const.CHECK`.

    Returns:
        List of links extracted.

    """
    extracted = []
    for raw_line in text.splitlines():
        entry = raw_line.strip()
        # nothing to extract from blank lines and comments
        if not entry or entry.startswith('#'):
            continue

        # keep only what precedes the first ``=``
        hostname, _, _ = entry.partition('=')
        if I2P_REGEX.fullmatch(hostname) is not None:
            extracted.append(parse_link(f'http://{hostname}'))

    return _check(extracted) if check else extracted
def read_sitemap(link: 'darc_link.Link', text: str, check: bool = CHECK) -> 'List[darc_link.Link]':
    """Extract links from the content of a sitemap.

    Every ``urlset > url > loc`` element of the document is resolved
    against the URL of ``link`` and parsed into a link object that keeps
    the host of ``link`` and references ``link`` as its back reference.

    Args:
        link: Original link to the sitemap.
        text: Content of the sitemap.
        check: If perform checks on extracted links,
            default to :data:`~darc.const.CHECK`.

    Returns:
        List of links extracted.

    See Also:
        * :func:`darc.parse._check`
        * :func:`darc.parse._check_ng`

    """
    document = bs4.BeautifulSoup(text, 'html5lib')

    # https://www.sitemaps.org/protocol.html
    found = []
    for loc_tag in document.select('urlset > url > loc'):
        absolute_url = urljoin(link.url, loc_tag.text)
        found.append(parse_link(absolute_url, host=link.host, backref=link))

    # check content / proxy type
    if not check:
        return found
    return _check(found)
def read_hosts(link: 'darc_link.Link', text: str, check: bool = CHECK) -> 'List[darc_link.Link]':
    """Extract links from the content of a ``hosts.txt`` file.

    Blank lines and lines starting with ``#`` are skipped. For the others,
    the text before the first ``=`` is used as host name and is kept only
    when it fully matches ``I2P_REGEX``. Every extracted link references
    ``link`` as its back reference.

    Args:
        link: Link object to fetch for its ``hosts.txt``.
        text: Content of ``hosts.txt``.
        check: If perform checks on extracted links,
            default to :data:`~darc.const.CHECK`.

    Returns:
        List of links extracted.

    """
    # lazy pipeline: each line is stripped, filtered and split on demand
    stripped_rows = (row.strip() for row in text.splitlines())
    host_names = (
        row.split('=', maxsplit=1)[0]
        for row in stripped_rows
        if row and not row.startswith('#')
    )

    link_list = [
        parse_link(f'http://{name}', backref=link)
        for name in host_names
        if I2P_REGEX.fullmatch(name) is not None
    ]

    if not check:
        return link_list
    return _check(link_list)
def load_selenium(check: bool = CHECK) -> typing.List[Link]:
    """Load links from the :mod:`selenium` database.

    When ``FLAG_DB`` is set, links are read through
    ``_load_selenium_db()`` inside a database connection context; any
    exception raised there is reported on standard error as a warning and
    an empty list is used instead. Otherwise links are read through
    ``_load_selenium_redis()``. When ``VERBOSE`` is set, the sorted URLs
    of the resulting pool are printed.

    Args:
        check: If perform checks on loaded links,
            default to :data:`~darc.const.CHECK`.

    Returns:
        List of loaded links from the :mod:`selenium` database.

    Note:
        At runtime, the function will load links with maximum number
        at :data:`~darc.db.MAX_POOL` to limit the memory usage.

    See Also:
        * :func:`darc.db._load_selenium_db`
        * :func:`darc.db._load_selenium_redis`

    """
    if not FLAG_DB:
        pool = _load_selenium_redis()
    else:
        with database.connection_context():
            try:
                pool = _load_selenium_db()
            except Exception as error:
                # best effort: report the failure and carry on with no links
                message = warnings.formatwarning(str(error), DatabaseOperaionFailed, __file__, 983,
                                                 '_load_selenium_db()')
                print(render_error(message, stem.util.term.Color.YELLOW),  # pylint: disable=no-member
                      end='', file=sys.stderr)
                pool = []

    if check:
        pool = _check(pool)

    if VERBOSE:
        magenta = stem.util.term.Color.MAGENTA  # pylint: disable=no-member

        print(stem.util.term.format('-*- [SELENIUM] LINK POOL -*-', magenta))

        url_listing = pprint.pformat(sorted(entry.url for entry in pool))
        print(render_error(url_listing, magenta))

        separator = '-' * shutil.get_terminal_size().columns
        print(stem.util.term.format(separator, magenta))

    return pool
def load_requests(check: bool = CHECK) -> typing.List[Link]:
    """Load links from the :mod:`requests` database.

    When ``FLAG_DB`` is set, links are read through
    ``_load_requests_db()`` inside a database connection context;
    otherwise they are read through ``_load_requests_redis()``. When
    ``VERBOSE`` is set, the sorted URLs of the resulting pool are printed.

    Args:
        check: If perform checks on loaded links,
            default to :data:`~darc.const.CHECK`.

    Returns:
        List of loaded links from the :mod:`requests` database.

    Note:
        At runtime, the function will load links with maximum number
        at :data:`~darc.db.MAX_POOL` to limit the memory usage.

    See Also:
        * :func:`darc.db._load_requests_db`
        * :func:`darc.db._load_requests_redis`

    """
    if not FLAG_DB:
        pool = _load_requests_redis()
    else:
        with database.connection_context():
            pool = _load_requests_db()

    if check:
        pool = _check(pool)

    if VERBOSE:
        magenta = stem.util.term.Color.MAGENTA  # pylint: disable=no-member

        print(stem.util.term.format('-*- [REQUESTS] LINK POOL -*-', magenta))

        url_listing = pprint.pformat(sorted(entry.url for entry in pool))
        print(render_error(url_listing, magenta))

        separator = '-' * shutil.get_terminal_size().columns
        print(stem.util.term.format(separator, magenta))

    return pool
def load_requests(check: bool = CHECK) -> 'List[Link]':
    """Load links from the :mod:`requests` database.

    When ``FLAG_DB`` is set, links are read through
    ``_load_requests_db()`` inside a database connection context; any
    exception raised there is logged as a warning and an empty list is
    used instead. Otherwise links are read through
    ``_load_requests_redis()``. The sorted URLs of the resulting pool are
    passed to the logger at ``LOG_VERBOSE`` level.

    Args:
        check: If perform checks on loaded links,
            default to :data:`~darc.const.CHECK`.

    Returns:
        List of loaded links from the :mod:`requests` database.

    Note:
        At runtime, the function will load links with maximum number
        at :data:`~darc.db.MAX_POOL` to limit the memory usage.

    See Also:
        * :func:`darc.db._load_requests_db`
        * :func:`darc.db._load_requests_redis`

    """
    if not FLAG_DB:
        pool = _load_requests_redis()
    else:
        with database.connection_context():
            try:
                pool = _load_requests_db()
            except Exception:
                # best effort: log the failure and carry on with no links
                logger.pexc(LOG_WARNING, category=DatabaseOperaionFailed, line='_load_requests_db()')
                pool = []

    if check:
        pool = _check(pool)

    sorted_urls = sorted(entry.url for entry in pool)
    logger.plog(LOG_VERBOSE, '-*- [REQUESTS] LINK POOL -*-', object=sorted_urls)
    return pool
def _extract_links(cls, link: Link, html: typing.Union[str, bytes], check: bool = CHECK) -> typing.List[Link]:
    """Extract links from HTML document.

    The raw results of ``cls.extract_links(link, html)`` are each passed
    through ``parse_link`` before the optional check is applied.

    Args:
        link: Original link of the HTML document.
        html: Content of the HTML document.
        check: If perform checks on extracted links,
            default to :data:`~darc.const.CHECK`.

    Returns:
        List of extracted links.

    """
    raw_links = cls.extract_links(link, html)
    parsed_links = list(map(parse_link, raw_links))

    # check content / proxy type
    return _check(parsed_links) if check else parsed_links