Example 1
def read_robots(link: 'darc_link.Link', text: str, host: 'Optional[str]' = None) -> 'List[darc_link.Link]':
    """Read ``robots.txt`` to fetch link to sitemaps.

    Args:
        link: Original link to ``robots.txt``.
        text: Content of ``robots.txt``.
        host: Hostname of the URL to ``robots.txt``,
            which may not be the same as the one in ``link``.

    Returns:
        List of links to sitemaps.

    Note:
        If no sitemap link is specified in
        ``robots.txt`` [*]_, the fallback link
        ``/sitemap.xml`` will be used.

        .. [*] https://www.sitemaps.org/protocol.html#submit_robots

    """
    rp = RobotFileParser()
    with io.StringIO(text) as file:
        rp.parse(file)

    sitemaps = rp.site_maps()
    if sitemaps is None:
        return [parse_link(urljoin(link.url, '/sitemap.xml'), host=host, backref=link)]
    return [parse_link(urljoin(link.url, sitemap), host=host, backref=link) for sitemap in sitemaps]
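The stdlib half of this can be exercised on its own; a minimal sketch using only urllib.robotparser (note that RobotFileParser.site_maps() requires Python 3.8+ and returns None when no Sitemap directive is present):

from urllib.robotparser import RobotFileParser

robots_text = '''\
User-agent: *
Disallow: /private/
Sitemap: https://example.com/sitemap.xml
'''

rp = RobotFileParser()
rp.parse(robots_text.splitlines())
print(rp.site_maps())  # ['https://example.com/sitemap.xml']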
Example 2
def extract_links_from_text(link: Link, text: str) -> typing.List[Link]:
    """Extract links from raw text source.

    Args:
        link: Original link of the source document.
        text: Content of source text document.

    Returns:
        List of extracted links.

    Important:
        The extraction is **NOT** fully reliable, since we do not
        perform `TLD`_ checks on the extracted links and cannot
        guarantee that all links will be extracted.

        .. _TLD: https://pypi.org/project/tld/

        The URL patterns used to extract links are defined by
        :data:`darc.parse.URL_PAT`, and you may register your
        own expressions through :envvar:`DARC_URL_PAT`.

    """
    temp_list = list()
    for part in text.split():
        for pattern in URL_PAT:
            for match in pattern.finditer(part):
                match_url = match.group('url')

                # add scheme if not exist
                if not urlsplit(match_url).scheme:
                    match_url = f'{link.url_parse.scheme}:{match_url}'

                temp_link = parse_link(match_url)
                temp_list.append(temp_link)
    return temp_list
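Outside of darc, the scanning loop is plain re machinery; a minimal sketch with a deliberately simple stand-in pattern (assumption: the real :data:`darc.parse.URL_PAT` expressions are more elaborate, but each exposes a named group ``url``):

import re
from urllib.parse import urlsplit

# hypothetical stand-in for darc.parse.URL_PAT
URL_PAT = [re.compile(r"(?P<url>(?:https?:)?//[^\s'\"<>]+)")]

text = 'see https://example.com/a or //example.org/b for details'
for part in text.split():
    for pattern in URL_PAT:
        for match in pattern.finditer(part):
            url = match.group('url')
            # add scheme if missing (the real code reuses the source link's scheme)
            if not urlsplit(url).scheme:
                url = f'https:{url}'
            print(url)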
Example 3
def read_hosts(text: str, check: bool = CHECK) -> typing.List[Link]:
    """Read ``hosts.txt``.

    Args:
        text: Content of ``hosts.txt``.
        check: Whether to perform checks on extracted links;
            defaults to :data:`~darc.const.CHECK`.

    Returns:
        List of links extracted.

    """
    temp_list = list()
    for line in filter(None, map(lambda s: s.strip(), text.splitlines())):
        if line.startswith('#'):
            continue

        host = line.split('=', maxsplit=1)[0]
        if I2P_REGEX.fullmatch(host) is None:
            continue
        temp_list.append(parse_link(f'http://{host}'))

    if check:
        return _check(temp_list)
    return temp_list
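For reference, an I2P ``hosts.txt`` maps one hostname to one base64 destination per ``host=destination`` line; the parsing loop can be tried standalone with a simplified stand-in for :data:`I2P_REGEX` (assumption: the real expression is stricter):

import re

I2P_REGEX = re.compile(r'[a-z0-9.-]+\.i2p')  # hypothetical simplification

text = '''\
# comments and blank lines are skipped

example.i2p=LongBase64Destination...
not-i2p.onion=AnotherDestination...
'''
for line in filter(None, map(str.strip, text.splitlines())):
    if line.startswith('#'):
        continue
    host = line.split('=', maxsplit=1)[0]
    if I2P_REGEX.fullmatch(host):
        print(f'http://{host}')  # -> http://example.i2p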
Example 4
def read_hosts(link: 'darc_link.Link', text: str, check: bool = CHECK) -> 'List[darc_link.Link]':
    """Read ``hosts.txt``.

    Args:
        link: Original link to the ``hosts.txt``.
        text: Content of ``hosts.txt``.
        check: Whether to perform checks on extracted links;
            defaults to :data:`~darc.const.CHECK`.

    Returns:
        List of links extracted.

    """
    temp_list = []
    for line in filter(None, map(lambda s: s.strip(), text.splitlines())):
        if line.startswith('#'):
            continue

        host = line.split('=', maxsplit=1)[0]
        if I2P_REGEX.fullmatch(host) is None:
            continue
        temp_list.append(parse_link(f'http://{host}', backref=link))

    if check:
        return _check(temp_list)
    return temp_list
Example 5
def read_sitemap(link: 'darc_link.Link', text: str, check: bool = CHECK) -> 'List[darc_link.Link]':
    """Read sitemap.

    Args:
        link: Original link to the sitemap.
        text: Content of the sitemap.
        check: Whether to perform checks on extracted links;
            defaults to :data:`~darc.const.CHECK`.

    Returns:
        List of links extracted.

    See Also:
        * :func:`darc.parse._check`
        * :func:`darc.parse._check_ng`

    """
    soup = bs4.BeautifulSoup(text, 'html5lib')

    # https://www.sitemaps.org/protocol.html
    temp_list = [parse_link(urljoin(link.url, loc.text), host=link.host, backref=link)
                 for loc in soup.select('urlset > url > loc')]

    # check content / proxy type
    if check:
        return _check(temp_list)
    return temp_list
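Because html5lib lowercases tag names into a lenient HTML tree, the CSS selector works directly on sitemap XML; a minimal sketch (requires ``bs4`` and ``html5lib``):

import bs4
from urllib.parse import urljoin

sitemap = '''<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/page1</loc></url>
  <url><loc>/page2</loc></url>
</urlset>'''

soup = bs4.BeautifulSoup(sitemap, 'html5lib')
for loc in soup.select('urlset > url > loc'):
    print(urljoin('https://example.com/', loc.text))
# https://example.com/page1
# https://example.com/page2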
Example 6
def extract_links(link: 'darc_link.Link',
                  html: 'Union[str, bytes]',
                  check: bool = CHECK) -> 'List[darc_link.Link]':
    """Extract links from HTML document.

    Args:
        link: Original link of the HTML document.
        html: Content of the HTML document.
        check: Whether to perform checks on extracted links;
            defaults to :data:`~darc.const.CHECK`.

    Returns:
        List of extracted links.

    See Also:
        * :func:`darc.parse._check`
        * :func:`darc.parse._check_ng`

    """
    soup = bs4.BeautifulSoup(html, 'html5lib')

    temp_list = []
    for child in soup.find_all(
            lambda tag: tag.has_attr('href') or tag.has_attr('src')):
        if (href := child.get('href', child.get('src'))) is None:
            continue
        temp_link = parse_link(urljoin(link.url, href), backref=link)
        temp_list.append(temp_link)

    # check content / proxy type
    if check:
        return _check(temp_list)
    return temp_list
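A quick way to see the href/src harvesting in isolation (requires ``bs4`` and ``html5lib``):

import bs4
from urllib.parse import urljoin

html = '<a href="/about">About</a> <img src="logo.png"> <script src="//cdn.example.net/app.js"></script>'
soup = bs4.BeautifulSoup(html, 'html5lib')
for child in soup.find_all(lambda tag: tag.has_attr('href') or tag.has_attr('src')):
    href = child.get('href', child.get('src'))
    print(urljoin('https://example.com/base/', href))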
Example 7
def get_sitemap(link: 'darc_link.Link', text: str, host: 'Optional[str]' = None) -> 'List[darc_link.Link]':
    """Fetch link to other sitemaps from a sitemap.

    Args:
        link: Original link to the sitemap.
        text: Content of the sitemap.
        host: Hostname of the URL to the sitemap,
            which may not be the same as the one in ``link``.

    Returns:
        List of links to sitemaps.

    Note:
        As specified in the sitemap protocol,
        it may contain links to other sitemaps. [*]_

        .. [*] https://www.sitemaps.org/protocol.html#index

    """
    sitemaps = []
    soup = bs4.BeautifulSoup(text, 'html5lib')

    # https://www.sitemaps.org/protocol.html#index
    for loc in soup.select('sitemapindex > sitemap > loc'):
        sitemaps.append(urljoin(link.url, loc.text))
    return [parse_link(sitemap, host=host, backref=link) for sitemap in sitemaps]
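The ``sitemapindex`` selector can be checked the same way as the ``urlset`` one above; a minimal sketch:

import bs4
from urllib.parse import urljoin

index = '''<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap><loc>https://example.com/sitemap-news.xml</loc></sitemap>
  <sitemap><loc>/sitemap-pages.xml</loc></sitemap>
</sitemapindex>'''

soup = bs4.BeautifulSoup(index, 'html5lib')
print([urljoin('https://example.com/', loc.text)
       for loc in soup.select('sitemapindex > sitemap > loc')])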
Example 8
def _check_ng(temp_list: typing.List[Link]) -> typing.List[Link]:
    """Check content type of links through ``HEAD`` requests.

    Args:
        temp_list: List of links to be checked.

    Returns:
        List of links matching the requirements.

    See Also:
        * :func:`darc.parse.match_host`
        * :func:`darc.parse.match_proxy`
        * :func:`darc.parse.match_mime`

    """
    from darc.crawl import request_session  # pylint: disable=import-outside-toplevel

    session_map = dict()
    result_list = list()
    for link in temp_list:
        if match_host(link.host):
            continue
        if match_proxy(link.proxy):
            continue

        # get session
        session = session_map.get(link.proxy)
        if session is None:
            session = request_session(link, futures=True)
            session_map[link.proxy] = session

        result = session.head(link.url, allow_redirects=True)
        result_list.append(result)

        print(f'[HEAD] Checking content type from {link.url}')

    link_list = list()
    for result in concurrent.futures.as_completed(result_list):
        try:
            response: typing.Response = result.result()
        except requests.RequestException as error:
            if error.response is None:
                print(render_error(f'[HEAD] Checking failed <{error}>',
                                   stem.util.term.Color.RED))  # pylint: disable=no-member
                continue
            print(render_error(f'[HEAD] Failed on {error.response.url} <{error}>',
                               stem.util.term.Color.RED))  # pylint: disable=no-member
            link_list.append(parse_link(error.response.url))  # wrap as Link to match the return type
            continue
        ct_type = get_content_type(response)

        print(f'[HEAD] Checked content type from {response.url} ({ct_type})')

        if match_mime(ct_type):
            continue
        temp_link = parse_link(response.request.url)
        link_list.append(temp_link)
    return link_list
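The fire-then-collect pattern here is the standard requests-futures idiom; a minimal standalone sketch, assuming (as the annotations in the next example suggest) that request_session(link, futures=True) returns a requests_futures.sessions.FuturesSession:

import concurrent.futures

from requests_futures.sessions import FuturesSession

session = FuturesSession()
futures = [session.head(url, allow_redirects=True)
           for url in ('https://example.com/', 'https://example.org/')]
for future in concurrent.futures.as_completed(futures):
    response = future.result()  # raises requests.RequestException on failure
    print(response.url, response.headers.get('Content-Type'))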
Example 9
def _check_ng(temp_list: 'List[darc_link.Link]') -> 'List[darc_link.Link]':
    """Check content type of links through ``HEAD`` requests.

    Args:
        temp_list: List of links to be checked.

    Returns:
        List of links matching the requirements.

    See Also:
        * :func:`darc.parse.match_host`
        * :func:`darc.parse.match_proxy`
        * :func:`darc.parse.match_mime`

    """
    from darc.crawl import request_session  # pylint: disable=import-outside-toplevel

    session_map = {}  # type: Dict[str, FuturesSession]
    result_list = []
    for link in temp_list:
        if match_host(link.host):
            continue
        if match_proxy(link.proxy):
            continue

        # get session
        session = session_map.get(link.proxy)
        if session is None:
            session = request_session(link, futures=True)
            session_map[link.proxy] = session

        result = session.head(link.url, allow_redirects=True)
        result_list.append(result)

        logger.info('[HEAD] Checking content type from %s', link.url)

    link_list = []
    for result in concurrent.futures.as_completed(result_list):  # type: ignore
        try:
            response = result.result()  # type: Response
        except requests.RequestException as error:
            if error.response is None:
                logger.pexc(message='[HEAD] Checking failed')
                continue
            logger.pexc(message=f'[HEAD] Failed on {error.response.url}')
            link_list.append(parse_link(error.response.url))  # wrap as Link to match the return type
            continue
        ct_type = get_content_type(response)

        logger.info('[HEAD] Checked content type from %s (%s)', response.url,
                    ct_type)

        if match_mime(ct_type):
            continue
        temp_link = parse_link(response.request.url)  # type: ignore
        link_list.append(temp_link)
    return link_list
Example 10
def fetch_hosts(link: 'darc_link.Link', force: bool = False) -> None:
    """Fetch ``hosts.txt``.

    Args:
        link: Link object to fetch for its ``hosts.txt``.
        force: Force refetch ``hosts.txt``.

    """
    if force:
        logger.warning('[HOSTS] Force refetch %s', link.url)

    hosts_path = None if force else have_hosts(link)
    if hosts_path is not None:

        logger.warning('[HOSTS] Cached %s', link.url)
        with open(hosts_path) as hosts_file:
            hosts_text = hosts_file.read()

    else:

        from darc.requests import i2p_session  # pylint: disable=import-outside-toplevel

        hosts_link = parse_link(urljoin(link.url, '/hosts.txt'), backref=link)
        logger.info('[HOSTS] Subscribing %s', hosts_link.url)

        with i2p_session() as session:
            try:
                response = session.get(hosts_link.url)
            except requests.RequestException:
                logger.pexc(message=f'[HOSTS] Failed on {hosts_link.url}')
                return

        if not response.ok:
            logger.error('[HOSTS] Failed on %s [%d]', hosts_link.url, response.status_code)
            return

        ct_type = get_content_type(response)
        if ct_type not in ['text/text', 'text/plain']:
            logger.error('[HOSTS] Unresolved content type on %s (%s)', hosts_link.url, ct_type)
            return

        hosts_text = response.text
        save_hosts(hosts_link, hosts_text)

        logger.info('[HOSTS] Subscribed %s', hosts_link.url)

    from darc.db import save_requests  # pylint: disable=import-outside-toplevel

    # add link to queue
    save_requests(read_hosts(link, hosts_text))
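Note that urljoin() with a root-relative path replaces the whole path, so the subscription always targets the site root regardless of where ``link`` points:

from urllib.parse import urljoin

print(urljoin('http://example.i2p/some/deep/page', '/hosts.txt'))
# -> http://example.i2p/hosts.txt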
Example 11
    def _extract_links(cls,
                       link: Link,
                       html: typing.Union[str, bytes],
                       check: bool = CHECK) -> typing.List[Link]:
        """Extract links from HTML document.

        Args:
            link: Original link of the HTML document.
            html: Content of the HTML document.
            check: Whether to perform checks on extracted links;
                defaults to :data:`~darc.const.CHECK`.

        Returns:
            List of extracted links.

        """
        temp_list = cls.extract_links(link, html)
        link_list = [parse_link(url) for url in temp_list]

        # check content / proxy type
        if check:
            return _check(link_list)
        return link_list
Example 12
def fetch_sitemap(link: 'darc_link.Link', force: bool = False) -> None:
    """Fetch sitemap.

    The function will first fetch the ``robots.txt``, then
    fetch the sitemaps accordingly.

    Args:
        link: Link object to fetch for its sitemaps.
        force: Force refetch its sitemaps.

    See Also:
        * :func:`darc.proxy.null.read_robots`
        * :func:`darc.proxy.null.read_sitemap`
        * :func:`darc.parse.get_sitemap`

    """
    if force:
        logger.warning('[ROBOTS] Force refetch %s', link.url)

    robots_path = None if force else have_robots(link)
    if robots_path is not None:

        logger.warning('[ROBOTS] Cached %s', link.url)
        with open(robots_path) as file:
            robots_text = file.read()

    else:

        robots_link = parse_link(urljoin(link.url, '/robots.txt'), backref=link)
        logger.info('[ROBOTS] Checking %s', robots_link.url)

        with request_session(robots_link) as session:
            try:
                response = session.get(robots_link.url)
            except requests.RequestException:
                logger.pexc(message=f'[ROBOTS] Failed on {robots_link.url}')
                return

        if response.ok:
            ct_type = get_content_type(response)
            if ct_type not in ['text/text', 'text/plain']:
                logger.error('[ROBOTS] Unresolved content type on %s (%s)', robots_link.url, ct_type)
                robots_text = ''
            else:
                robots_text = response.text
                save_robots(robots_link, robots_text)
                logger.info('[ROBOTS] Checked %s', robots_link.url)
        else:
            logger.error('[ROBOTS] Failed on %s [%d]', robots_link.url, response.status_code)
            robots_text = ''

    if force:
        logger.warning('[SITEMAP] Force refetch %s', link.url)

    sitemaps = read_robots(link, robots_text, host=link.host)
    for sitemap_link in sitemaps:
        sitemap_path = None if force else have_sitemap(sitemap_link)
        if sitemap_path is not None:

            logger.warning('[SITEMAP] Cached %s', sitemap_link.url)
            with open(sitemap_path) as file:
                sitemap_text = file.read()

        else:

            logger.info('[SITEMAP] Fetching %s', sitemap_link.url)

            with request_session(sitemap_link) as session:
                try:
                    response = session.get(sitemap_link.url)
                except requests.RequestException:
                    logger.pexc(message=f'[SITEMAP] Failed on {sitemap_link.url}')
                    continue

            if not response.ok:
                logger.error('[SITEMAP] Failed on %s [%d]', sitemap_link.url, response.status_code)
                continue

            # check content type
            ct_type = get_content_type(response)
            if ct_type == 'application/gzip':
                try:
                    sitemap_text = gzip.decompress(response.content).decode()
                except UnicodeDecodeError:
                    sitemap_text = response.text
            elif ct_type in ['text/xml', 'text/html']:
                sitemap_text = response.text
                save_sitemap(sitemap_link, sitemap_text)
            else:
                logger.error('[SITEMAP] Unresolved content type on %s (%s)', sitemap_link.url, ct_type)
                continue

            logger.info('[SITEMAP] Fetched %s', sitemap_link.url)

        # get more sitemaps
        sitemaps.extend(get_sitemap(sitemap_link, sitemap_text, host=link.host))

        # add link to queue
        save_requests(read_sitemap(link, sitemap_text))
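The ``application/gzip`` branch is plain :mod:`gzip` round-tripping; a minimal sketch of the decompress-then-decode step:

import gzip

payload = gzip.compress(b'<urlset>...</urlset>')
try:
    sitemap_text = gzip.decompress(payload).decode()
except UnicodeDecodeError:
    sitemap_text = ''  # the real code falls back to response.text
print(sitemap_text)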
Example 13
def fetch_sitemap(link: Link, force: bool = False) -> None:
    """Fetch sitemap.

    The function will first fetch the ``robots.txt``, then
    fetch the sitemaps accordingly.

    Args:
        link: Link object to fetch for its sitemaps.
        force: Force refetch its sitemaps.

    See Also:
        * :func:`darc.proxy.null.read_robots`
        * :func:`darc.proxy.null.read_sitemap`
        * :func:`darc.parse.get_sitemap`

    """
    if force:
        print(stem.util.term.format(f'[ROBOTS] Force refetch {link.url}',
                                    stem.util.term.Color.YELLOW))  # pylint: disable=no-member

    robots_path = None if force else have_robots(link)
    if robots_path is not None:

        print(stem.util.term.format(f'[ROBOTS] Cached {link.url}',
                                    stem.util.term.Color.YELLOW))  # pylint: disable=no-member
        with open(robots_path) as file:
            robots_text = file.read()

    else:

        robots_link = parse_link(urljoin(link.url, '/robots.txt'))
        print(f'[ROBOTS] Checking {robots_link.url}')

        with request_session(robots_link) as session:
            try:
                response = session.get(robots_link.url)
            except requests.RequestException as error:
                print(render_error(f'[ROBOTS] Failed on {robots_link.url} <{error}>',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                return

        if response.ok:
            ct_type = get_content_type(response)
            if ct_type not in ['text/text', 'text/plain']:
                print(render_error(f'[ROBOTS] Unresolved content type on {robots_link.url} ({ct_type})',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                robots_text = ''
            else:
                robots_text = response.text
                save_robots(robots_link, robots_text)
                print(f'[ROBOTS] Checked {robots_link.url}')
        else:
            print(render_error(f'[ROBOTS] Failed on {robots_link.url} [{response.status_code}]',
                               stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
            robots_text = ''

    if force:
        print(stem.util.term.format(f'[SITEMAP] Force refetch {link.url}',
                                    stem.util.term.Color.YELLOW))  # pylint: disable=no-member

    sitemaps = read_robots(link, robots_text, host=link.host)
    for sitemap_link in sitemaps:
        sitemap_path = None if force else have_sitemap(sitemap_link)
        if sitemap_path is not None:

            print(stem.util.term.format(f'[SITEMAP] Cached {sitemap_link.url}',
                                        stem.util.term.Color.YELLOW))  # pylint: disable=no-member
            with open(sitemap_path) as file:
                sitemap_text = file.read()

        else:

            print(f'[SITEMAP] Fetching {sitemap_link.url}')

            with request_session(sitemap_link) as session:
                try:
                    response = session.get(sitemap_link.url)
                except requests.RequestException as error:
                    print(render_error(f'[SITEMAP] Failed on {sitemap_link.url} <{error}>',
                                       stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                    continue

            if not response.ok:
                print(render_error(f'[SITEMAP] Failed on {sitemap_link.url} [{response.status_code}]',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                continue

            # check content type
            ct_type = get_content_type(response)
            if ct_type == 'application/gzip':
                try:
                    sitemap_text = gzip.decompress(response.content).decode()
                except UnicodeDecodeError:
                    sitemap_text = response.text
            elif ct_type in ['text/xml', 'text/html']:
                sitemap_text = response.text
                save_sitemap(sitemap_link, sitemap_text)
            else:
                print(render_error(f'[SITEMAP] Unresolved content type on {sitemap_link.url} ({ct_type})',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                continue

            print(f'[SITEMAP] Fetched {sitemap_link.url}')

        # get more sitemaps
        sitemaps.extend(get_sitemap(sitemap_link, sitemap_text, host=link.host))

        # add link to queue
        save_requests(read_sitemap(link, sitemap_text))
Example 14
def main(argv: 'Optional[List[str]]' = None) -> int:
    """Entrypoint.

    Args:
        argv: Optional command line arguments.

    Returns:
        Exit code.

    """
    parser = get_parser()
    args = parser.parse_args(argv)

    pid = os.getpid()
    with open(PATH_ID, 'w') as file:
        print(pid, file=file)

    # wait for Redis
    if _WAIT_REDIS:
        if not FLAG_DB:
            _redis_command('set', 'darc', pid)

    if FLAG_DB:
        while True:
            try:
                with DB:
                    _db_operation(DB.create_tables, [
                        HostnameQueueModel,
                        RequestsQueueModel,
                        SeleniumQueueModel,
                    ])
            except Exception:
                logger.pexc(LOG_WARNING,
                            category=DatabaseOperaionFailed,
                            line='DB.create_tables([HostnameQueueModel, ...])')
                continue
            break

    if SAVE_DB:
        while True:
            try:
                with DB_WEB:
                    _db_operation(DB_WEB.create_tables, [
                        HostnameModel,
                        URLModel,
                        URLThroughModel,
                        RobotsModel,
                        SitemapModel,
                        HostsModel,
                        RequestsModel,
                        RequestsHistoryModel,
                        SeleniumModel,
                    ])
            except Exception:
                logger.pexc(LOG_WARNING,
                            category=DatabaseOperaionFailed,
                            line='DB.create_tables([HostnameModel, ...])')
                continue
            break

    logger.debug('-*- Initialisation -*-')
    if DEBUG and not FLAG_DB:
        # nuke the db
        _redis_command('delete', 'queue_hostname')
        _redis_command('delete', 'queue_requests')
        _redis_command('delete', 'queue_selenium')

    link_list = []
    for link in filter(
            None, map(lambda s: s.strip(),
                      args.link)):  # type: ignore[name-defined,var-annotated]
        logger.pline(LOG_DEBUG, link)
        link_list.append(link)

    if args.file is not None:
        for path in args.file:
            with open(path) as file:
                for line in filter(None, map(lambda s: s.strip(), file)):
                    if line.startswith('#'):
                        continue
                    logger.pline(LOG_DEBUG, line)
                    link_list.append(line)

    # write to database
    link_pool = [parse_link(link, backref=None) for link in link_list]
    save_requests(link_pool, score=0, nx=True)
    logger.pline(LOG_DEBUG, logger.horizon)

    # init link file
    if not os.path.isfile(PATH_LN):
        with open(PATH_LN, 'w') as file:
            print('proxy,scheme,host,hash,link', file=file)

    try:
        process(args.type)
    except BaseException:
        traceback.print_exc()
    _exit()

    return 0
Example 15
def fetch_hosts(link: Link):
    """Fetch ``hosts.txt``.

    Args:
        link: Link object to fetch for its ``hosts.txt``.

    """
    hosts_path = have_hosts(link)
    if hosts_path is not None:

        print(
            stem.util.term.format(f'[HOSTS] Cached {link.url}',
                                  stem.util.term.Color.YELLOW))  # pylint: disable=no-member

        with open(hosts_path) as hosts_file:
            hosts_text = hosts_file.read()

    else:

        from darc.requests import i2p_session  # pylint: disable=import-outside-toplevel

        hosts_link = parse_link(urljoin(link.url, '/hosts.txt'))
        print(f'[HOSTS] Subscribing {hosts_link.url}')

        with i2p_session() as session:
            try:
                response = session.get(hosts_link.url)
            except requests.RequestException as error:
                print(render_error(
                    f'[HOSTS] Failed on {hosts_link.url} <{error}>',
                    stem.util.term.Color.RED),
                      file=sys.stderr)  # pylint: disable=no-member
                return

        if not response.ok:
            print(render_error(
                f'[HOSTS] Failed on {hosts_link.url} [{response.status_code}]',
                stem.util.term.Color.RED),
                  file=sys.stderr)  # pylint: disable=no-member
            return

        ct_type = get_content_type(response)
        if ct_type not in ['text/text', 'text/plain']:
            print(render_error(
                f'[HOSTS] Unresolved content type on {hosts_link.url} ({ct_type})',
                stem.util.term.Color.RED),
                  file=sys.stderr)  # pylint: disable=no-member
            return

        hosts_text = response.text
        save_hosts(hosts_link, hosts_text)

        print(f'[HOSTS] Subscribed {hosts_link.url}')

    from darc.db import save_requests  # pylint: disable=import-outside-toplevel

    # add link to queue
    save_requests(read_hosts(hosts_text))
Example 16
def main(argv: typing.Optional[typing.List[str]] = None) -> int:
    """Entrypoint.

    Args:
        argv: Optional command line arguments.

    Returns:
        Exit code.

    """
    parser = get_parser()
    args = parser.parse_args(argv)

    pid = os.getpid()
    with open(PATH_ID, 'w') as file:
        print(pid, file=file)

    # wait for Redis
    if _WAIT_REDIS:
        if not FLAG_DB:
            _redis_command('set', 'darc', pid)

    if FLAG_DB:
        while True:
            try:
                with DB:
                    _db_operation(DB.create_tables, [
                        HostnameQueueModel,
                        RequestsQueueModel,
                        SeleniumQueueModel,
                    ])
            except Exception as error:
                warning = warnings.formatwarning(
                    error,
                    DatabaseOperaionFailed,
                    __file__,
                    102,  # type: ignore[arg-type]
                    'DB.create_tables([HostnameQueueModel, ...])')
                print(render_error(warning, stem.util.term.Color.YELLOW),
                      end='',
                      file=sys.stderr)  # pylint: disable=no-member
                continue
            break

    if SAVE_DB:
        while True:
            try:
                with DB_WEB:
                    _db_operation(DB_WEB.create_tables, [
                        HostnameModel,
                        URLModel,
                        RobotsModel,
                        SitemapModel,
                        HostsModel,
                        RequestsModel,
                        RequestsHistoryModel,
                        SeleniumModel,
                    ])
            except Exception as error:
                warning = warnings.formatwarning(
                    error,
                    DatabaseOperaionFailed,
                    __file__,
                    117,  # type: ignore[arg-type]
                    'DB.create_tables([HostnameModel, ...])')
                print(render_error(warning, stem.util.term.Color.YELLOW),
                      end='',
                      file=sys.stderr)  # pylint: disable=no-member
                continue
            break

    if DEBUG:
        print(
            stem.util.term.format('-*- Initialisation -*-',
                                  stem.util.term.Color.MAGENTA))  # pylint: disable=no-member

        # nuke the db
        if not FLAG_DB:
            _redis_command('delete', 'queue_hostname')
            _redis_command('delete', 'queue_requests')
            _redis_command('delete', 'queue_selenium')

    link_list = list()
    for link in filter(
            None, map(lambda s: s.strip(),
                      args.link)):  # type: ignore[name-defined,var-annotated]
        if DEBUG:
            print(stem.util.term.format(link, stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        link_list.append(link)

    if args.file is not None:
        for path in args.file:
            with open(path) as file:
                for line in filter(None, map(lambda s: s.strip(), file)):
                    if line.startswith('#'):
                        continue
                    if DEBUG:
                        print(
                            stem.util.term.format(
                                line, stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
                    link_list.append(line)

    # write to database
    link_pool = [parse_link(link) for link in link_list]
    save_requests(link_pool, score=0, nx=True)

    if DEBUG:
        print(
            stem.util.term.format('-' * shutil.get_terminal_size().columns,
                                  stem.util.term.Color.MAGENTA))  # pylint: disable=no-member

    # init link file
    if not os.path.isfile(PATH_LN):
        with open(PATH_LN, 'w') as file:
            print('proxy,scheme,host,hash,link', file=file)

    try:
        process(args.type)
    except BaseException:
        traceback.print_exc()
    _exit()

    return 0
Example 17
def main():
    """Entrypoint."""
    parser = get_parser()
    args = parser.parse_args()

    pid = os.getpid()
    with open(PATH_ID, 'w') as file:
        print(pid, file=file)

    # wait for Redis
    if _WAIT_REDIS:
        if not FLAG_DB:
            _redis_command('set', 'darc', pid)

    if FLAG_DB:
        while True:
            with contextlib.suppress(Exception):
                with DB:
                    DB.create_tables([
                        HostnameQueueModel, RequestsQueueModel, SeleniumQueueModel,
                    ])
                break  # retry on failure; only reached once the tables exist

    if SAVE_DB:
        while True:
            with contextlib.suppress(Exception):
                with DB_WEB:
                    DB_WEB.create_tables([
                        HostnameModel, URLModel,
                        RobotsModel, SitemapModel, HostsModel,
                        RequestsModel, RequestsHistoryModel, SeleniumModel,
                    ])
                break  # retry on failure; only reached once the tables exist

    if DEBUG:
        print(stem.util.term.format('-*- Initialisation -*-', stem.util.term.Color.MAGENTA))  # pylint: disable=no-member

        # nuke the db
        if not FLAG_DB:
            _redis_command('delete', 'queue_hostname')
            _redis_command('delete', 'queue_requests')
            _redis_command('delete', 'queue_selenium')

    link_list = list()
    for link in filter(None, map(lambda s: s.strip(), args.link)):
        if DEBUG:
            print(stem.util.term.format(link, stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        link_list.append(link)

    if args.file is not None:
        for path in args.file:
            with open(path) as file:
                for line in filter(None, map(lambda s: s.strip(), file)):
                    if line.startswith('#'):
                        continue
                    if DEBUG:
                        print(stem.util.term.format(line, stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
                    link_list.append(line)

    # write to database
    link_pool = [parse_link(link) for link in link_list]
    save_requests(link_pool, score=0, nx=True)

    if DEBUG:
        print(stem.util.term.format('-' * shutil.get_terminal_size().columns, stem.util.term.Color.MAGENTA))  # pylint: disable=no-member

    # init link file
    if not os.path.isfile(PATH_LN):
        with open(PATH_LN, 'w') as file:
            print('proxy,scheme,host,hash,link', file=file)

    try:
        process(args.type)
    except BaseException:
        traceback.print_exc()
    _exit()