def _load_selenium_db() -> typing.List[Link]:
    """Fetch pending links from the :mod:`selenium` queue table.

    Rows are selected from
    :class:`~darc.model.tasks.selenium.SeleniumQueueModel` in ascending
    ``timestamp`` order, keeping only rows whose ``timestamp`` is not later
    than the cut-off (``now`` when :data:`TIME_CACHE` is :data:`None`,
    otherwise ``now - TIME_CACHE``).

    Returns:
        List of loaded links from the :mod:`selenium` database.

    Note:
        No more than :data:`~darc.db.MAX_POOL` links are loaded per call,
        which bounds the memory usage.

    """
    now = datetime.datetime.now()
    if TIME_CACHE is not None:
        sec_delta = TIME_CACHE
        max_score = now - sec_delta
    else:
        sec_delta = 0
        max_score = now

    with database.atomic():
        due = SeleniumQueueModel.timestamp <= max_score
        query: typing.List[SeleniumQueueModel] = (
            SeleniumQueueModel
            .select(SeleniumQueueModel.link)
            .where(due)
            .order_by(SeleniumQueueModel.timestamp)
            .limit(MAX_POOL)
        )

        link_pool = []
        for record in query:
            link_pool.append(record.link)

        if TIME_CACHE is not None:
            # force update records: re-save the loaded links with a later score
            _save_selenium_db(link_pool, score=now + sec_delta)
    return link_pool
def _load_requests_db() -> 'List[Link]':
    """Fetch pending links from the :mod:`requests` queue table.

    Rows are selected from
    :class:`~darc.model.tasks.requests.RequestsQueueModel` in ascending
    ``timestamp`` order, keeping only rows whose ``timestamp`` is not later
    than ``now - TIME_CACHE`` (a zero offset is used when
    :data:`TIME_CACHE` is :data:`None`).

    Returns:
        List of loaded links from the :mod:`requests` database.

    Note:
        No more than :data:`~darc.db.MAX_POOL` links are loaded per call,
        which bounds the memory usage.

    """
    now = datetime.now()
    sec_delta = timedelta(seconds=0) if TIME_CACHE is None else TIME_CACHE
    # subtracting a zero delta leaves the cut-off equal to ``now``
    max_score = now - sec_delta

    with database.atomic():
        statement = (
            RequestsQueueModel
            .select(RequestsQueueModel.link)
            .where(RequestsQueueModel.timestamp <= max_score)
            .order_by(RequestsQueueModel.timestamp)
            .limit(MAX_POOL)
        )
        query = _db_operation(statement.execute)  # type: List[RequestsQueueModel]
        link_pool = [record.link for record in query]

        if TIME_CACHE is not None:
            # force update records: re-save the loaded links with a later score
            _save_requests_db(link_pool, score=(now + sec_delta).timestamp())
    return link_pool
def have_hostname(link: Link) -> typing.Tuple[bool, bool]:
    """Look the link's host up in the hostname database.

    Dispatches to the SQL backend when :data:`FLAG_DB` is set and to the
    Redis backend otherwise. A failure of the SQL backend is reported on
    standard error as a warning and answered with ``(False, False)``.

    Args:
        link: Link to check against.

    Returns:
        A tuple of two :obj:`bool` values representing if such link is a
        known host and needs force refetch respectively.

    See Also:
        * :func:`darc.db._have_hostname_db`
        * :func:`darc.db._have_hostname_redis`

    """
    if not FLAG_DB:
        return _have_hostname_redis(link)

    with database.connection_context():
        try:
            answer = _have_hostname_db(link)
        except Exception as error:
            message = warnings.formatwarning(str(error), DatabaseOperaionFailed, __file__, 236,
                                             f'_have_hostname_db({link})')
            print(render_error(message, stem.util.term.Color.YELLOW),
                  end='', file=sys.stderr)  # pylint: disable=no-member
            answer = (False, False)
    return answer
def have_hostname(link: 'Link') -> 'Tuple[bool, bool]':
    """Look the link's host up in the hostname database.

    Dispatches to the SQL backend when :data:`FLAG_DB` is set and to the
    Redis backend otherwise. A failure of the SQL backend is logged as a
    warning and answered with ``(False, False)``.

    Args:
        link: Link to check against.

    Returns:
        A tuple of two :obj:`bool` values representing if such link is a
        known host and needs force refetch respectively.

    See Also:
        * :func:`darc.db._have_hostname_db`
        * :func:`darc.db._have_hostname_redis`

    """
    if not FLAG_DB:
        return _have_hostname_redis(link)

    with database.connection_context():
        try:
            answer = _have_hostname_db(link)
        except Exception:
            logger.pexc(LOG_WARNING, category=DatabaseOperaionFailed,
                        line=f'_have_hostname_db({link.url})')
            answer = (False, False)
    return answer
def drop_hostname(link: Link):
    """Delete the link from the hostname database.

    The SQL backend is used (inside a connection context) when
    :data:`FLAG_DB` is set; otherwise the Redis backend is used.

    Args:
        link: Link to be removed.

    See Also:
        * :func:`darc.db._drop_hostname_db`
        * :func:`darc.db._drop_hostname_redis`

    """
    if not FLAG_DB:
        return _drop_hostname_redis(link)

    with database.connection_context():
        outcome = _drop_hostname_db(link)
    return outcome
def drop_selenium(link: Link):
    """Delete the link from the :mod:`selenium` database.

    The SQL backend is used (inside a connection context) when
    :data:`FLAG_DB` is set; otherwise the Redis backend is used.

    Args:
        link: Link to be removed.

    See Also:
        * :func:`darc.db._drop_selenium_db`
        * :func:`darc.db._drop_selenium_redis`

    """
    if not FLAG_DB:
        return _drop_selenium_redis(link)

    with database.connection_context():
        outcome = _drop_selenium_db(link)
    return outcome
def save_requests(entries: 'Union[Link, List[Link]]', single: bool = False,
                  score: 'Optional[float]' = None, nx: bool = False, xx: bool = False) -> None:
    """Store link(s) in the :mod:`requests` database.

    The function updates the ``queue_requests`` database, through the SQL
    backend when :data:`FLAG_DB` is set and through Redis otherwise. A
    failure of the SQL backend is logged as a warning and swallowed.

    Args:
        entries: Links to be added to the :mod:`requests` database.
            It can be either a :obj:`list` of links, or a single
            link (if ``single`` set as :data:`True`).
        single: Indicate if ``entries`` is a :obj:`list` of links
            or a single link.
        score: Score for the Redis sorted set.
        nx: Only create new elements and do not update scores of
            elements that already exist.
        xx: Only update scores of elements that already exist.
            New elements will not be added.

    Notes:
        The ``entries`` will be dumped through :mod:`pickle` so that
        :mod:`darc` does not need to parse them again.

        When ``entries`` is a list of :class:`~darc.link.Link` instances,
        a *bulk* update is attempted to ease the memory consumption. The
        *bulk* size is defined by :data:`~darc.db.BULK_SIZE`.

    See Also:
        * :func:`darc.db._save_requests_db`
        * :func:`darc.db._save_requests_redis`

    """
    if not FLAG_DB:
        return _save_requests_redis(entries, single, score, nx, xx)

    with database.connection_context():
        try:
            outcome = _save_requests_db(entries, single, score, nx, xx)  # type: ignore[call-overload]
        except Exception:
            _arg_msg = _gen_arg_msg(entries, single, score, nx, xx)
            logger.pexc(LOG_WARNING, category=DatabaseOperaionFailed,
                        line=f'_save_requests_db({_arg_msg})')
            outcome = None
    return outcome
def have_hostname(link: Link) -> bool:
    """Look the link's host up in the hostname database.

    The SQL backend is used (inside a connection context) when
    :data:`FLAG_DB` is set; otherwise the Redis backend is used.

    Args:
        link: Link to check against.

    Returns:
        If such link is a new host.

    See Also:
        * :func:`darc.db._have_hostname_db`
        * :func:`darc.db._have_hostname_redis`

    """
    if not FLAG_DB:
        return _have_hostname_redis(link)

    with database.connection_context():
        answer = _have_hostname_db(link)
    return answer
def load_selenium(check: bool = CHECK) -> typing.List[Link]:
    """Fetch pending links from the :mod:`selenium` database.

    The SQL backend is used when :data:`FLAG_DB` is set and Redis otherwise.
    A failure of the SQL backend is reported on standard error as a warning
    and yields an empty pool.

    Args:
        check: If perform checks on loaded links,
            default to :data:`~darc.const.CHECK`.

    Returns:
        List of loaded links from the :mod:`selenium` database.

    Note:
        At runtime, the function will load links with maximum number
        at :data:`~darc.db.MAX_POOL` to limit the memory usage.

    See Also:
        * :func:`darc.db._load_selenium_db`
        * :func:`darc.db._load_selenium_redis`

    """
    if not FLAG_DB:
        link_pool = _load_selenium_redis()
    else:
        with database.connection_context():
            try:
                link_pool = _load_selenium_db()
            except Exception as error:
                message = warnings.formatwarning(str(error), DatabaseOperaionFailed, __file__, 983,
                                                 '_load_selenium_db()')
                print(render_error(message, stem.util.term.Color.YELLOW),
                      end='', file=sys.stderr)  # pylint: disable=no-member
                link_pool = []

    if check:
        link_pool = _check(link_pool)

    if VERBOSE:
        # dump the loaded pool between a banner and a terminal-wide rule
        banner = stem.util.term.format('-*- [SELENIUM] LINK POOL -*-',
                                       stem.util.term.Color.MAGENTA)  # pylint: disable=no-member
        print(banner)

        urls = sorted(link.url for link in link_pool)
        print(render_error(pprint.pformat(urls), stem.util.term.Color.MAGENTA))  # pylint: disable=no-member

        rule = '-' * shutil.get_terminal_size().columns
        print(stem.util.term.format(rule, stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
    return link_pool
def save_requests(entries: typing.Union[Link, typing.List[Link]], single: bool = False,
                  score: typing.Optional[float] = None, nx: bool = False, xx: bool = False) -> None:
    """Store link(s) in the :mod:`requests` database.

    The function updates the ``queue_requests`` database, through the SQL
    backend when :data:`FLAG_DB` is set and through Redis otherwise. A
    failure of the SQL backend is reported on standard error as a warning
    and swallowed.

    Args:
        entries: Links to be added to the :mod:`requests` database.
            It can be either a :obj:`list` of links, or a single
            link (if ``single`` set as :data:`True`).
        single: Indicate if ``entries`` is a :obj:`list` of links
            or a single link.
        score: Score for the Redis sorted set.
        nx: Only create new elements and do not update scores of
            elements that already exist.
        xx: Only update scores of elements that already exist.
            New elements will not be added.

    Notes:
        The ``entries`` will be dumped through :mod:`pickle` so that
        :mod:`darc` does not need to parse them again.

        When ``entries`` is a list of :class:`~darc.link.Link` instances,
        a *bulk* update is attempted to ease the memory consumption. The
        *bulk* size is defined by :data:`~darc.db.BULK_SIZE`.

    See Also:
        * :func:`darc.db._save_requests_db`
        * :func:`darc.db._save_requests_redis`

    """
    if not FLAG_DB:
        return _save_requests_redis(entries, single, score, nx, xx)

    with database.connection_context():
        try:
            outcome = _save_requests_db(entries, single, score, nx, xx)
        except Exception as error:
            message = warnings.formatwarning(str(error), DatabaseOperaionFailed, __file__, 505,
                                             '_save_requests_db(...)')
            print(render_error(message, stem.util.term.Color.YELLOW),
                  end='', file=sys.stderr)  # pylint: disable=no-member
            outcome = None
    return outcome
def drop_selenium(link: 'Link') -> None:
    """Delete the link from the :mod:`selenium` database.

    The SQL backend is used when :data:`FLAG_DB` is set and Redis otherwise.
    A failure of the SQL backend is logged as a warning and swallowed.

    Args:
        link: Link to be removed.

    See Also:
        * :func:`darc.db._drop_selenium_db`
        * :func:`darc.db._drop_selenium_redis`

    """
    if not FLAG_DB:
        return _drop_selenium_redis(link)

    with database.connection_context():
        try:
            outcome = _drop_selenium_db(link)
        except Exception:
            logger.pexc(LOG_WARNING, category=DatabaseOperaionFailed,
                        line=f'_drop_selenium_db({link.url})')
            outcome = None
    return outcome
def drop_selenium(link: Link) -> None:
    """Delete the link from the :mod:`selenium` database.

    The SQL backend is used when :data:`FLAG_DB` is set and Redis otherwise.
    A failure of the SQL backend is reported on standard error as a warning
    and swallowed.

    Args:
        link: Link to be removed.

    See Also:
        * :func:`darc.db._drop_selenium_db`
        * :func:`darc.db._drop_selenium_redis`

    """
    if not FLAG_DB:
        return _drop_selenium_redis(link)

    with database.connection_context():
        try:
            outcome = _drop_selenium_db(link)
        except Exception as error:
            message = warnings.formatwarning(str(error), DatabaseOperaionFailed, __file__, 433,
                                             f'_drop_selenium_db({link})')
            print(render_error(message, stem.util.term.Color.YELLOW),
                  end='', file=sys.stderr)  # pylint: disable=no-member
            outcome = None
    return outcome
def drop_hostname(link: 'Link') -> None:
    """Delete the link from the hostname database.

    The SQL backend is used when :data:`FLAG_DB` is set and Redis otherwise.
    A failure of the SQL backend is logged as a warning and swallowed.

    Args:
        link: Link to be removed.

    See Also:
        * :func:`darc.db._drop_hostname_db`
        * :func:`darc.db._drop_hostname_redis`

    """
    if not FLAG_DB:
        return _drop_hostname_redis(link)

    with database.connection_context():
        try:
            outcome = _drop_hostname_db(link)
        except Exception:
            logger.pexc(LOG_WARNING, category=DatabaseOperaionFailed,
                        line=f'_drop_hostname_db({link.url})')
            outcome = None
    return outcome
def load_requests(check: bool = CHECK) -> typing.List[Link]:
    """Fetch pending links from the :mod:`requests` database.

    The SQL backend is used (inside a connection context) when
    :data:`FLAG_DB` is set; otherwise the Redis backend is used.

    Args:
        check: If perform checks on loaded links,
            default to :data:`~darc.const.CHECK`.

    Returns:
        List of loaded links from the :mod:`requests` database.

    Note:
        At runtime, the function will load links with maximum number
        at :data:`~darc.db.MAX_POOL` to limit the memory usage.

    See Also:
        * :func:`darc.db._load_requests_db`
        * :func:`darc.db._load_requests_redis`

    """
    if not FLAG_DB:
        link_pool = _load_requests_redis()
    else:
        with database.connection_context():
            link_pool = _load_requests_db()

    if check:
        link_pool = _check(link_pool)

    if VERBOSE:
        # dump the loaded pool between a banner and a terminal-wide rule
        banner = stem.util.term.format('-*- [REQUESTS] LINK POOL -*-',
                                       stem.util.term.Color.MAGENTA)  # pylint: disable=no-member
        print(banner)

        urls = sorted(link.url for link in link_pool)
        print(render_error(pprint.pformat(urls), stem.util.term.Color.MAGENTA))  # pylint: disable=no-member

        rule = '-' * shutil.get_terminal_size().columns
        print(stem.util.term.format(rule, stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
    return link_pool
def save_requests(entries: typing.List[Link], single: bool = False,
                  score=None, nx=False, xx=False):
    """Store link(s) in the :mod:`requests` database.

    The function updates the ``queue_requests`` database, through the SQL
    backend (inside a connection context) when :data:`FLAG_DB` is set and
    through Redis otherwise.

    Args:
        entries: Links to be added to the :mod:`requests` database.
            It can be either a :obj:`list` of links, or a single
            link (if ``single`` set as :data:`True`).
        single: Indicate if ``entries`` is a :obj:`list` of links
            or a single link.
        score: Score for the Redis sorted set.
        nx: Only create new elements and do not update scores of
            elements that already exist.
        xx: Only update scores of elements that already exist.
            New elements will not be added.

    Notes:
        The ``entries`` will be dumped through :mod:`pickle` so that
        :mod:`darc` does not need to parse them again.

        When ``entries`` is a list of :class:`~darc.link.Link` instances,
        a *bulk* update is attempted to ease the memory consumption. The
        *bulk* size is defined by :data:`~darc.db.BULK_SIZE`.

    See Also:
        * :func:`darc.db._save_requests_db`
        * :func:`darc.db._save_requests_redis`

    """
    if not FLAG_DB:
        return _save_requests_redis(entries, single, score, nx, xx)

    with database.connection_context():
        outcome = _save_requests_db(entries, single, score, nx, xx)
    return outcome
def load_requests(check: bool = CHECK) -> 'List[Link]':
    """Fetch pending links from the :mod:`requests` database.

    The SQL backend is used when :data:`FLAG_DB` is set and Redis otherwise.
    A failure of the SQL backend is logged as a warning and yields an empty
    pool. The resulting pool is always sent to the verbose log.

    Args:
        check: If perform checks on loaded links,
            default to :data:`~darc.const.CHECK`.

    Returns:
        List of loaded links from the :mod:`requests` database.

    Note:
        At runtime, the function will load links with maximum number
        at :data:`~darc.db.MAX_POOL` to limit the memory usage.

    See Also:
        * :func:`darc.db._load_requests_db`
        * :func:`darc.db._load_requests_redis`

    """
    if not FLAG_DB:
        link_pool = _load_requests_redis()
    else:
        with database.connection_context():
            try:
                link_pool = _load_requests_db()
            except Exception:
                logger.pexc(LOG_WARNING, category=DatabaseOperaionFailed,
                            line='_load_requests_db()')
                link_pool = []

    if check:
        link_pool = _check(link_pool)

    urls = sorted(link.url for link in link_pool)
    logger.plog(LOG_VERBOSE, '-*- [REQUESTS] LINK POOL -*-', object=urls)
    return link_pool
def _save_requests_db(entries: typing.Union[Link, typing.List[Link]], single: bool = False,
                      score: typing.Optional[float] = None, nx: bool = False, xx: bool = False) -> None:
    """Save link to the :mod:`requests` database.

    The function updates the
    :class:`~darc.model.tasks.requests.RequestsQueueModel` table. Nothing
    is written when ``entries`` is empty.

    Args:
        entries: Links to be added to the :mod:`requests` database.
            It can be either a :obj:`list` of links, or a single
            link (if ``single`` set as :data:`True`).
        single: Indicate if ``entries`` is a :obj:`list` of links
            or a single link.
        score: Value stored in the ``timestamp`` column of the affected
            records; the current local time is used when :data:`None`.
        nx: Only create new elements and do not update scores of
            elements that already exist.
        xx: Only update scores of elements that already exist.
            New elements will not be added.

    """
    if not entries:
        return
    if score is None:
        score = datetime.datetime.now()  # type: ignore

    if not single:
        if typing.TYPE_CHECKING:
            entries = typing.cast(typing.List[Link], entries)

        if nx:
            with database.atomic():
                insert_many = [{
                    'text': link.url,
                    'hash': link.name,
                    'link': link,
                    'timestamp': score,
                } for link in entries]
                for batch in peewee.chunked(insert_many, BULK_SIZE):
                    # insert the current chunk only -- passing the full list
                    # here would re-send every row once per chunk
                    _db_operation(RequestsQueueModel
                                  .insert_many(batch)
                                  .on_conflict_ignore()
                                  .execute)
            return

        if xx:
            # only touch rows that already exist, matched by their URL text
            entries_text = [link.url for link in entries]
            _db_operation(RequestsQueueModel
                          .update(timestamp=score)
                          .where(typing.cast(peewee.TextField, RequestsQueueModel.text).in_(entries_text))
                          .execute)
            return

        with database.atomic():
            replace_many = [{
                'text': link.url,
                'hash': link.name,
                'link': link,
                'timestamp': score,
            } for link in entries]
            for batch in peewee.chunked(replace_many, BULK_SIZE):
                _db_operation(RequestsQueueModel.replace_many(batch).execute)
        return

    if typing.TYPE_CHECKING:
        entries = typing.cast(Link, entries)

    if nx:
        _db_operation(RequestsQueueModel.get_or_create,
                      text=entries.url,
                      defaults={
                          'hash': entries.name,
                          'link': entries,
                          'timestamp': score,
                      })
        return

    if xx:
        # a missing record is not an error here: there is nothing to update
        with contextlib.suppress(peewee.DoesNotExist):
            model = _db_operation(RequestsQueueModel.get, RequestsQueueModel.text == entries.url)
            model.timestamp = score
            _db_operation(model.save)
        return

    _db_operation(RequestsQueueModel.replace(
        text=entries.url,
        hash=entries.name,
        link=entries,
        timestamp=score
    ).execute)
def _save_selenium_db(entries: typing.List[Link], single: bool = False,
                      score=None, nx=False, xx=False):
    """Save link to the :mod:`selenium` database.

    The function updates the
    :class:`~darc.model.tasks.selenium.SeleniumQueueModel` table. Nothing
    is written when ``entries`` is empty.

    Args:
        entries: Links to be added to the :mod:`selenium` database.
            It can be either a :obj:`list` of links, or a single
            link (if ``single`` set as :data:`True`).
        single: Indicate if ``entries`` is a :obj:`list` of links
            or a single link.
        score: Value stored in the ``timestamp`` column of the affected
            records; the current local time is used when :data:`None`.
        nx: Only create new elements and do not update scores of
            elements that already exist.
        xx: Only update scores of elements that already exist.
            New elements will not be added.

    """
    if not entries:
        return
    if score is None:
        score = datetime.datetime.now()

    if not single:
        if nx:
            with database.atomic():
                insert_many = [{
                    'text': link.url,
                    'hash': link.name,
                    'link': link,
                    'timestamp': score,
                } for link in entries]
                for batch in peewee.chunked(insert_many, BULK_SIZE):
                    # insert the current chunk only -- passing the full list
                    # here would re-send every row once per chunk
                    (SeleniumQueueModel
                     .insert_many(batch)
                     .on_conflict_ignore()
                     .execute())
            return

        if xx:
            # only touch rows that already exist, matched by their URL text
            entries_text = [link.url for link in entries]
            (SeleniumQueueModel
             .update(timestamp=score)
             .where(SeleniumQueueModel.text.in_(entries_text))
             .execute())
            return

        with database.atomic():
            replace_many = [{
                'text': link.url,
                'hash': link.name,
                'link': link,
                'timestamp': score,
            } for link in entries]
            for batch in peewee.chunked(replace_many, BULK_SIZE):
                SeleniumQueueModel.replace_many(batch).execute()
        return

    if nx:
        SeleniumQueueModel.get_or_create(text=entries.url,
                                         defaults={
                                             'hash': entries.name,
                                             'link': entries,
                                             'timestamp': score,
                                         })
        return

    if xx:
        # a missing record is not an error here: there is nothing to update
        with contextlib.suppress(peewee.DoesNotExist):
            model = SeleniumQueueModel.get(SeleniumQueueModel.text == entries.url)
            model.timestamp = score
            model.save()
        return

    SeleniumQueueModel.replace(text=entries.url,
                               hash=entries.name,
                               link=entries,
                               timestamp=score).execute()
def _save_selenium_db(entries: 'Union[Link, List[Link]]', single: bool = False,
                      score: 'Optional[float]' = None, nx: bool = False, xx: bool = False) -> None:
    """Save link to the :mod:`selenium` database.

    The function updates the
    :class:`~darc.model.tasks.selenium.SeleniumQueueModel` table. Nothing
    is written when ``entries`` is empty.

    Args:
        entries: Links to be added to the :mod:`selenium` database.
            It can be either a :obj:`list` of links, or a single
            link (if ``single`` set as :data:`True`).
        single: Indicate if ``entries`` is a :obj:`list` of links
            or a single link.
        score: POSIX timestamp converted with
            :meth:`datetime.fromtimestamp <datetime.datetime.fromtimestamp>`
            and stored in the ``timestamp`` column of the affected records;
            the current local time is used when :data:`None`.
        nx: Only create new elements and do not update scores of
            elements that already exist.
        xx: Only update scores of elements that already exist.
            New elements will not be added.

    """
    if not entries:
        return None

    if score is None:
        timestamp = datetime.now()
    else:
        timestamp = datetime.fromtimestamp(score)

    if not single:
        if TYPE_CHECKING:
            entries = cast('List[Link]', entries)

        if nx:
            with database.atomic():
                insert_many = [{
                    'text': link.url,
                    'hash': link.name,
                    'link': link,
                    'timestamp': timestamp,
                } for link in entries]
                for batch in peewee.chunked(insert_many, BULK_SIZE):
                    # insert the current chunk only -- passing the full list
                    # here would re-send every row once per chunk
                    _db_operation(SeleniumQueueModel
                                  .insert_many(batch)
                                  .on_conflict_ignore()
                                  .execute)
            return None

        if xx:
            # only touch rows that already exist, matched by their URL text
            entries_text = [link.url for link in entries]
            _db_operation(SeleniumQueueModel
                          .update(timestamp=timestamp)
                          .where(cast('TextField', SeleniumQueueModel.text).in_(entries_text))
                          .execute)
            return None

        with database.atomic():
            replace_many = [{
                'text': link.url,
                'hash': link.name,
                'link': link,
                'timestamp': timestamp,
            } for link in entries]
            for batch in peewee.chunked(replace_many, BULK_SIZE):
                _db_operation(SeleniumQueueModel.replace_many(batch).execute)
        return None

    if TYPE_CHECKING:
        entries = cast('Link', entries)

    if nx:
        _db_operation(SeleniumQueueModel.get_or_create,
                      text=entries.url,
                      defaults={
                          'hash': entries.name,
                          'link': entries,
                          'timestamp': timestamp,
                      })
        return None

    if xx:
        # a missing record is not an error here: there is nothing to update
        with contextlib.suppress(peewee.DoesNotExist):
            model = _db_operation(SeleniumQueueModel.get, SeleniumQueueModel.text == entries.url)  # type: SeleniumQueueModel # pylint: disable=line-too-long
            model.timestamp = timestamp
            _db_operation(model.save)
        return None

    _db_operation(SeleniumQueueModel.replace(
        text=entries.url,
        hash=entries.name,
        link=entries,
        timestamp=timestamp,
    ).execute)
    return None
def main():
    """Entrypoint.

    Parses the command line, records the process ID in :data:`PATH_ID`,
    creates the database tables that are enabled, seeds the :mod:`requests`
    queue with the links given on the command line and in the link files,
    then hands over to :func:`process` and finally calls :func:`_exit`.

    NOTE(review): this block was recovered from a source whose line
    structure had been collapsed; the nesting of the ``break`` statements
    and of the "nuke the db" section is a best-effort reconstruction and
    should be confirmed against the upstream file.

    """
    parser = get_parser()
    args = parser.parse_args()

    # record our PID in the file at PATH_ID
    pid = os.getpid()
    with open(PATH_ID, 'w') as file:
        print(pid, file=file)

    # wait for Redis
    if _WAIT_REDIS:
        if not FLAG_DB:
            _redis_command('set', 'darc', pid)

    # create the task-queue tables; any exception is suppressed and the
    # creation is attempted again
    if FLAG_DB:
        while True:
            with contextlib.suppress(Exception):
                with DB:
                    DB.create_tables([
                        HostnameQueueModel, RequestsQueueModel, SeleniumQueueModel,
                    ])
                break  # NOTE(review): presumably only reached on success -- confirm nesting

    # create the tables that hold saved data, with the same retry scheme
    if SAVE_DB:
        while True:
            with contextlib.suppress(Exception):
                with DB_WEB:
                    DB_WEB.create_tables([
                        HostnameModel, URLModel,
                        RobotsModel, SitemapModel, HostsModel,
                        RequestsModel, RequestsHistoryModel, SeleniumModel,
                    ])
                break  # NOTE(review): presumably only reached on success -- confirm nesting

    if DEBUG:
        print(stem.util.term.format('-*- Initialisation -*-', stem.util.term.Color.MAGENTA))  # pylint: disable=no-member

        # nuke the db
        # NOTE(review): assumed to run in DEBUG mode only -- confirm nesting
        if not FLAG_DB:
            _redis_command('delete', 'queue_hostname')
            _redis_command('delete', 'queue_requests')
            _redis_command('delete', 'queue_selenium')

    # collect seed links from the command line, skipping blank entries
    link_list = list()
    for link in filter(None, map(lambda s: s.strip(), args.link)):
        if DEBUG:
            print(stem.util.term.format(link, stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        link_list.append(link)

    # collect seed links from the link files; blank lines and lines
    # starting with '#' are skipped
    if args.file is not None:
        for path in args.file:
            with open(path) as file:
                for line in filter(None, map(lambda s: s.strip(), file)):
                    if line.startswith('#'):
                        continue
                    if DEBUG:
                        print(stem.util.term.format(line, stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
                    link_list.append(line)

    # write to database
    # (nx=True: links that are already queued are left untouched)
    link_pool = [parse_link(link) for link in link_list]
    save_requests(link_pool, score=0, nx=True)

    if DEBUG:
        print(stem.util.term.format('-' * shutil.get_terminal_size().columns, stem.util.term.Color.MAGENTA))  # pylint: disable=no-member

    # init link file
    # (the CSV header is written only when the file does not exist yet)
    if not os.path.isfile(PATH_LN):
        with open(PATH_LN, 'w') as file:
            print('proxy,scheme,host,hash,link', file=file)

    # run the workers; every exception, including KeyboardInterrupt and
    # SystemExit, is printed rather than propagated so _exit() always runs
    try:
        process(args.type)
    except BaseException:
        traceback.print_exc()
    _exit()