Esempio n. 1
0
def _load_requests_redis() -> typing.List[Link]:
    """Fetch pending links from the :mod:`requests` queue in Redis.

    Entry names are selected from the ``queue_requests`` sorted set with a
    score between ``0`` and ``now - TIME_CACHE`` (or just ``now`` when
    ``TIME_CACHE`` is :data:`None`). Each name is then resolved with a
    ``get`` command and the stored payload is unpickled; names whose
    payload is missing or empty are skipped.

    When ``TIME_CACHE`` is set, the loaded links are handed back to
    :func:`_save_requests_redis` with the score ``now + TIME_CACHE``.

    Returns:
        List of loaded links from the :mod:`requests` database. The list
        is empty if the ``lock_queue_requests`` lock could not be acquired,
        in which case a warning is printed to ``stderr``.

    Note:
        At most :data:`~darc.db.MAX_POOL` names are requested per call
        to limit the memory usage.

    """
    timestamp = time.time()
    cache_seconds = 0 if TIME_CACHE is None else TIME_CACHE.total_seconds()
    score_limit = timestamp - cache_seconds

    try:
        with _redis_get_lock('lock_queue_requests', blocking_timeout=LOCK_TIMEOUT):  # type: ignore
            names = _redis_command('zrangebyscore', 'queue_requests',
                                   min=0, max=score_limit, start=0, num=MAX_POOL)

            # fetch every payload first, decode afterwards
            payloads: typing.List[bytes] = []
            for name in names:
                payloads.append(_redis_command('get', name))

            # NOTE: ``pickle.loads`` runs on bytes read back from Redis
            link_pool = [pickle.loads(payload) for payload in payloads if payload]  # nosec

            if TIME_CACHE is not None:
                # force update records
                _save_requests_redis(link_pool, score=timestamp + cache_seconds)
    except redis_lock.LockError:
        # ``949`` is the hard-coded ``lineno`` argument of the formatted warning
        message = warnings.formatwarning(
            f'[REQUESTS] Failed to acquire Redis lock after {LOCK_TIMEOUT} second(s)',
            LockWarning, __file__, 949, "_redis_get_lock('lock_queue_requests')",
        )
        print(render_error(message, stem.util.term.Color.YELLOW),  # pylint: disable=no-member
              end='', file=sys.stderr)
        return []
    return link_pool
Esempio n. 2
0
def _load_selenium_redis() -> typing.List[Link]:
    """Fetch pending links from the :mod:`selenium` queue in Redis.

    Members are selected from the ``queue_selenium`` sorted set with a
    score between ``0`` and ``now - TIME_CACHE`` (or just ``now`` when
    ``TIME_CACHE`` is :data:`None`), and each member is unpickled
    directly into a link.

    When ``TIME_CACHE`` is set, the loaded links are handed back to
    :func:`_save_selenium_redis` with the score ``now + TIME_CACHE``.

    Returns:
        List of loaded links from the :mod:`selenium` database. The list
        is empty if the ``lock_queue_selenium`` lock could not be acquired,
        in which case a warning is printed to ``stderr``.

    Note:
        At most :data:`~darc.db.MAX_POOL` members are requested per call
        to limit the memory usage.

    """
    timestamp = time.time()
    cache_seconds = 0 if TIME_CACHE is None else TIME_CACHE.total_seconds()
    score_limit = timestamp - cache_seconds

    try:
        with _redis_get_lock('lock_queue_selenium', blocking_timeout=LOCK_TIMEOUT):
            members = _redis_command('zrangebyscore', 'queue_selenium',
                                     min=0, max=score_limit, start=0, num=MAX_POOL)

            # NOTE: ``pickle.loads`` runs on bytes read back from Redis
            link_pool = []
            for member in members:
                link_pool.append(pickle.loads(member))

            if TIME_CACHE is not None:
                # force update records
                _save_selenium_redis(link_pool, score=timestamp + cache_seconds)
    except redis_lock.LockError:
        # ``299`` is the hard-coded ``lineno`` argument of the formatted warning
        message = warnings.formatwarning(
            f'[SELENIUM] Failed to acquire Redis lock after {LOCK_TIMEOUT} second(s)',
            LockWarning, __file__, 299, "_redis_get_lock('lock_queue_selenium')",
        )
        print(render_error(message, stem.util.term.Color.YELLOW),  # pylint: disable=no-member
              end='', file=sys.stderr)
        return []
    return link_pool
Esempio n. 3
0
def _load_selenium_redis() -> 'List[Link]':
    """Fetch pending links from the :mod:`selenium` queue in Redis.

    Entry names are selected from the ``queue_selenium`` sorted set with a
    score between ``0`` and ``now - TIME_CACHE`` (or just ``now`` when
    ``TIME_CACHE`` is :data:`None`). Each name is then resolved with a
    ``get`` command and the stored payload is unpickled; names whose
    payload is missing or empty are skipped.

    When ``TIME_CACHE`` is set, the loaded links are handed back to
    :func:`_save_selenium_redis` with the score ``now + TIME_CACHE``.

    Returns:
        List of loaded links from the :mod:`selenium` database. The list
        is empty if acquiring the ``queue_selenium`` lock raised a
        ``PotteryError``, in which case a warning is logged.

    Note:
        At most :data:`~darc.db.MAX_POOL` names are requested per call
        to limit the memory usage.

    """
    timestamp = time.time()
    if TIME_CACHE is not None:
        cache_seconds = TIME_CACHE.total_seconds()  # type: float
    else:
        cache_seconds = 0
    score_limit = timestamp - cache_seconds

    try:
        with _redis_get_lock('queue_selenium'):
            names = _redis_command('zrangebyscore', 'queue_selenium',
                                   min=0, max=score_limit, start=0, num=MAX_POOL)

            # fetch every payload first, decode afterwards
            payloads = []  # type: List[bytes]
            for name in names:
                payloads.append(_redis_command('get', name))

            # NOTE: ``pickle.loads`` runs on bytes read back from Redis
            link_pool = [pickle.loads(payload) for payload in payloads if payload]  # nosec: B301

            if TIME_CACHE is not None:
                # force update records
                _save_selenium_redis(link_pool, score=timestamp + cache_seconds)
    except pottery_exceptions.PotteryError:
        logger.pexc(LOG_WARNING,
                    f'[SELENIUM] Failed to acquire Redis lock after {LOCK_TIMEOUT} second(s)',
                    LockWarning, "_redis_get_lock('queue_selenium')")
        return []
    return link_pool
Esempio n. 4
0
def _have_hostname_redis(link: 'Link') -> 'Tuple[bool, bool]':
    """Tell whether the host of ``link`` is already recorded.

    The function looks up ``link.host`` in the ``queue_hostname`` sorted
    set. The host's score is then (re)written to the current timestamp
    when the host was unknown, or when it was known and its recorded
    score is below the refetch threshold.

    Args:
        link: Link to check against.

    Returns:
        A tuple of two :obj:`bool` values: whether the host of the link
        is already known, and whether it needs a forced refetch. An
        unknown host is never reported as needing a forced refetch.

    Note:
        The threshold is ``now - TIME_CACHE``. When ``TIME_CACHE`` is
        :data:`None` the threshold is :data:`math.inf`, so any finite
        recorded score falls below it and every known host is reported
        as needing a forced refetch.

    """
    timestamp = time.time()
    threshold = math.inf if TIME_CACHE is None else timestamp - TIME_CACHE.total_seconds()

    with _redis_get_lock('queue_hostname'):
        score = _redis_command('zscore', 'queue_hostname', link.host)  # type: Optional[int]

        # no score recorded means the host has never been seen
        have_flag = score is not None
        # only a known host can be stale
        force_fetch = have_flag and score < threshold

    # update Redis record for new hosts, and for known hosts only if re-fetch;
    # as in the original flow, this write happens after the lock is released
    if not have_flag or force_fetch:
        _redis_command('zadd', 'queue_hostname', {link.host: timestamp})
    return have_flag, force_fetch