Exemple #1
0
def crawl_with_options(urls, options_dict=None, logger_builder=None):
    """Crawl the given URLs using API-supplied options and return the site.

    :param urls: URLs to crawl; forwarded unchanged to
            ``Config.parse_api_config``.

    :param options_dict: Must contain the long name of the command line
            options. (optional)

    :param logger_builder: Callable invoked without arguments to create the
            logger. When falsy, the logger comes from
            ``configure_logger(config)`` instead. (optional)

    :rtype: A pylinkvalidator.crawler.Site instance
    """
    api_config = Config()
    api_config.parse_api_config(urls, options_dict)

    # A caller-supplied builder takes precedence over the configured logger.
    if logger_builder:
        crawl_logger = logger_builder()
    else:
        crawl_logger = configure_logger(api_config)

    # TODO In the future, we will pass the logger builder and not the logger
    # to enable the ProcessSiteCrawler to instantiate its own custom logger.
    return execute_from_config(api_config, crawl_logger).site
Exemple #2
0
def crawl(url):
    """Crawl a single URL without any extra options and return its site.

    The URL is wrapped in a one-element list before being handed to
    ``Config.parse_api_config``; the logger is built from that same config.

    :param url: The URL to crawl.

    :rtype: A pylinkvalidator.crawler.Site instance
    """
    api_config = Config()
    api_config.parse_api_config([url])

    crawl_logger = configure_logger(api_config)
    return execute_from_config(api_config, crawl_logger).site
Exemple #3
0
    def _run_crawler_plain(
            self, crawler_class, other_options=None, url="/index.html"):
        """Run ``crawler_class`` on ``url`` using a CLI-style configuration.

        The command line is staged in ``sys.argv`` (process mode, the
        resolved URL, then ``other_options``) before ``parse_cli_config()``
        is called. Note that ``sys.argv`` is left modified afterwards.

        :param crawler_class: Callable taking ``(config, logger)`` and
                returning an object with ``crawl()`` and ``site``.
        :param other_options: Extra command line arguments appended after
                the URL. (optional)
        :param url: Path resolved through ``self.get_url``.

        :returns: The ``site`` attribute of the crawler after crawling.
        """
        target_url = self.get_url(url)
        extra_args = other_options if other_options else []
        sys.argv = (
            ['pylinkvalidator', "-m", "process", target_url] +
            list(extra_args))

        config = Config()
        config.parse_cli_config()

        site_crawler = crawler_class(config, get_logger())
        site_crawler.crawl()

        site = site_crawler.site
        # Only done when the "multi" option is set in the parsed options.
        if config.options.multi:
            site.collect_multi_sites()

        return site
Exemple #4
0
    def _run_crawler_plain(
            self, crawler_class, other_options=None, url="/index.html"):
        """Run ``crawler_class`` on ``url`` using a CLI-style configuration.

        The command line is staged in ``sys.argv`` (process mode, the
        resolved URL, then ``other_options``) before ``parse_cli_config()``
        is called. Note that ``sys.argv`` is left modified afterwards.

        :param crawler_class: Callable taking ``(config, logger)`` and
                returning an object with ``crawl()`` and ``site``.
        :param other_options: Extra command line arguments appended after
                the URL. (optional)
        :param url: Path resolved through ``self.get_url``.

        :returns: The ``site`` attribute of the crawler after crawling.
        """
        target_url = self.get_url(url)
        extra_args = other_options if other_options else []
        sys.argv = (
            ['pylinkvalidator', "-m", "process", target_url] +
            list(extra_args))

        config = Config()
        config.parse_cli_config()

        site_crawler = crawler_class(config, get_logger())
        site_crawler.crawl()

        site = site_crawler.site
        # Only done when the "multi" option is set in the parsed options.
        if config.options.multi:
            site.collect_multi_sites()

        return site
Exemple #5
0
    def test_url_file_path(self):
        """Crawl the URLs listed in a ``--url-file-path`` file.

        Two URLs are written, one per line, to a temporary file whose path
        is passed on the command line. The test then checks the number of
        pages and error pages collected by a ``ThreadSiteCrawler``.

        The temporary file is always removed, even when the crawl or an
        assertion fails.
        """
        url = self.get_url("/index.html")
        url2 = self.get_url("/robots.txt")

        temp_fd, temp_file_path = mkstemp()
        try:
            # mkstemp() hands back an already-open OS-level descriptor;
            # wrapping it with os.fdopen makes the with-block close it
            # instead of leaking it for the rest of the test run.
            with os.fdopen(temp_fd, "w") as temp_file:
                temp_file.write(url + "\n")
                temp_file.write(url2 + "\n")

            sys.argv = [
                "pylinkvalidator", "-m", "process", "--url-file-path",
                temp_file_path]
            config = Config()
            config.parse_cli_config()

            crawler = ThreadSiteCrawler(config, get_logger())
            crawler.crawl()

            site = crawler.site
            self.assertEqual(12, len(site.pages))
            self.assertEqual(1, len(site.error_pages))
        finally:
            # Clean up regardless of the test outcome.
            os.unlink(temp_file_path)
Exemple #6
0
    def test_url_file_path(self):
        """Crawl the URLs listed in a ``--url-file-path`` file.

        Two URLs are written, one per line, to a temporary file whose path
        is passed on the command line. The test then checks the number of
        pages and error pages collected by a ``ThreadSiteCrawler``.

        The temporary file is always removed, even when the crawl or an
        assertion fails.
        """
        url = self.get_url("/index.html")
        url2 = self.get_url("/robots.txt")

        temp_fd, temp_file_path = mkstemp()
        try:
            # mkstemp() hands back an already-open OS-level descriptor;
            # wrapping it with os.fdopen makes the with-block close it
            # instead of leaking it for the rest of the test run.
            with os.fdopen(temp_fd, "w") as temp_file:
                temp_file.write(url + "\n")
                temp_file.write(url2 + "\n")

            sys.argv = [
                "pylinkvalidator", "-m", "process", "--url-file-path",
                temp_file_path]
            config = Config()
            config.parse_cli_config()

            crawler = ThreadSiteCrawler(config, get_logger())
            crawler.crawl()

            site = crawler.site
            self.assertEqual(12, len(site.pages))
            self.assertEqual(1, len(site.error_pages))
        finally:
            # Clean up regardless of the test outcome.
            os.unlink(temp_file_path)
Exemple #7
0
def execute_from_command_line():
    """Run the crawler with a configuration parsed from the command line.

    A report is produced when the crawled site is not ok, or when the
    ``when`` option equals ``WHEN_ALWAYS``. The process exits with status 1
    if the site is not ok or if any ``Exception`` is raised along the way
    (the exception is printed first).
    """
    try:
        started_at = time.time()

        config = Config()
        config.parse_cli_config()
        logger = configure_logger(config)

        crawler = execute_from_config(config, logger)
        elapsed = time.time() - started_at

        site = crawler.site
        must_report = not site.is_ok or config.options.when == WHEN_ALWAYS
        if must_report:
            report(site, config, elapsed, logger)

        # SystemExit is not an Exception subclass, so this exit is not
        # intercepted by the handler below.
        if not site.is_ok:
            sys.exit(1)
    except Exception as error:
        print(error)
        sys.exit(1)
Exemple #8
0
def execute_from_command_line():
    """Run the crawler with a configuration parsed from the command line.

    A report is produced when the crawled site is not ok, or when the
    ``when`` option equals ``WHEN_ALWAYS``. The process exits with status 1
    if the site is not ok or if any ``Exception`` is raised along the way
    (the exception is printed first).
    """
    try:
        started_at = time.time()

        config = Config()
        config.parse_cli_config()
        logger = configure_logger(config)

        crawler = execute_from_config(config, logger)
        elapsed = time.time() - started_at

        site = crawler.site
        must_report = not site.is_ok or config.options.when == WHEN_ALWAYS
        if must_report:
            report(site, config, elapsed, logger)

        # SystemExit is not an Exception subclass, so this exit is not
        # intercepted by the handler below.
        if not site.is_ok:
            sys.exit(1)
    except Exception as error:
        print(error)
        sys.exit(1)
Exemple #9
0
    def test_accepted_hosts(self):
        """Check ``accepted_hosts`` after parsing two different command lines.

        First a single start URL, then a command line combining ``-H`` with
        several positional arguments (with and without a scheme).
        """
        # Case 1: a single start URL.
        sys.argv = ['pylinkvalidator', 'http://www.example.com/']
        single_url_config = Config()
        single_url_config.parse_cli_config()
        self.assertTrue(
            'www.example.com' in single_url_config.accepted_hosts)

        # Case 2: -H plus several positional arguments.
        sys.argv = ['pylinkvalidator', '-H', 'www.example.com',
                    'http://example.com', 'foo.com', 'http://www.example.com/',
                    'baz.com']
        multi_config = Config()
        multi_config.parse_cli_config()

        expected_hosts = (
            'www.example.com', 'example.com', 'foo.com', 'baz.com')
        for host in expected_hosts:
            self.assertTrue(host in multi_config.accepted_hosts)
Exemple #10
0
    def test_accepted_hosts(self):
        """Check ``accepted_hosts`` after parsing two different command lines.

        First a single start URL, then a command line combining ``-H`` with
        several positional arguments (with and without a scheme).
        """
        # Case 1: a single start URL.
        sys.argv = ['pylinkvalidator', 'http://www.example.com/']
        single_url_config = Config()
        single_url_config.parse_cli_config()
        self.assertTrue(
            'www.example.com' in single_url_config.accepted_hosts)

        # Case 2: -H plus several positional arguments.
        sys.argv = ['pylinkvalidator', '-H', 'www.example.com',
                    'http://example.com', 'foo.com', 'http://www.example.com/',
                    'baz.com']
        multi_config = Config()
        multi_config.parse_cli_config()

        expected_hosts = (
            'www.example.com', 'example.com', 'foo.com', 'baz.com')
        for host in expected_hosts:
            self.assertTrue(host in multi_config.accepted_hosts)