def crawl_with_options(urls, options_dict=None, logger_builder=None):
    """Crawl the given URLs with API-supplied options and an optional logger.

    :param urls: URLs to crawl; forwarded to ``Config.parse_api_config``.
    :param options_dict: Must contain the long name of the command line
                         options. (optional)
    :param logger_builder: Function that will be called (without arguments)
                           to instantiate a logger. When omitted, the logger
                           is configured from the parsed config. (optional)
    :rtype: A pylinkvalidator.crawler.Site instance
    """
    config = Config()
    config.parse_api_config(urls, options_dict)

    # TODO In the future, we will pass the logger builder and not the logger
    # to enable the ProcessSiteCrawler to instantiate its own custom logger.
    logger = logger_builder() if logger_builder else configure_logger(config)

    return execute_from_config(config, logger).site
def crawl(url):
    """Crawl a single URL without any extra options.

    :param url: The URL to crawl; it is wrapped in a one-element list before
                being handed to ``Config.parse_api_config``.
    :rtype: A pylinkvalidator.crawler.Site instance
    """
    config = Config()
    config.parse_api_config([url])
    return execute_from_config(config, configure_logger(config)).site
def _run_crawler_plain(
        self, crawler_class, other_options=None, url="/index.html"):
    """Run ``crawler_class`` through the command-line configuration path.

    ``sys.argv`` is overwritten with a ``pylinkvalidator -m process <url>``
    invocation, followed by ``other_options`` when provided, and the config
    is then parsed from it.

    :param crawler_class: Callable invoked as ``crawler_class(config, logger)``
                          that returns the crawler to run.
    :param other_options: Extra command-line arguments appended after the
                          URL. (optional)
    :param url: Path resolved to a full URL through ``self.get_url``.
    :returns: The ``site`` attribute of the crawler once crawling is done.
    """
    target_url = self.get_url(url)

    # Bind the same list to sys.argv first, then extend it in place.
    sys.argv = cli_args = ['pylinkvalidator', "-m", "process", target_url]
    cli_args.extend(other_options or [])

    config = Config()
    config.parse_cli_config()

    crawler = crawler_class(config, get_logger())
    crawler.crawl()

    site = crawler.site
    if config.options.multi:
        site.collect_multi_sites()
    return site
def test_url_file_path(self):
    """Crawl the URLs listed in a file passed through ``--url-file-path``.

    Two start URLs are written to a temporary file, one per line, and the
    crawl is expected to yield 12 pages, 1 of which is an error page.
    """
    temp_fd, temp_file_path = mkstemp()
    # mkstemp() hands back an already-open OS-level descriptor. Close it
    # right away so it is not leaked; the file is reopened by path below.
    os.close(temp_fd)

    try:
        url = self.get_url("/index.html")
        url2 = self.get_url("/robots.txt")

        with open(temp_file_path, "w") as temp_file:
            temp_file.write(url + "\n")
            temp_file.write(url2 + "\n")

        sys.argv = [
            "pylinkvalidator", "-m", "process", "--url-file-path",
            temp_file_path]
        config = Config()
        config.parse_cli_config()

        crawler = ThreadSiteCrawler(config, get_logger())
        crawler.crawl()

        site = crawler.site
        self.assertEqual(12, len(site.pages))
        self.assertEqual(1, len(site.error_pages))
    finally:
        # Remove the temp file even when the crawl or an assertion fails.
        os.unlink(temp_file_path)
def execute_from_command_line():
    """Run the crawler with the configuration read from the command line.

    A report is produced when the crawled site is not OK, or when the
    ``when`` option equals ``WHEN_ALWAYS``. The process exits with status 1
    when the site is not OK. Any ``Exception`` raised along the way is
    printed and also results in exit status 1.
    """
    try:
        started_at = time.time()

        config = Config()
        config.parse_cli_config()
        logger = configure_logger(config)
        crawler = execute_from_config(config, logger)

        # Elapsed time covers configuration parsing and the crawl itself.
        elapsed = time.time() - started_at

        if not crawler.site.is_ok or config.options.when == WHEN_ALWAYS:
            report(crawler.site, config, elapsed, logger)

        if not crawler.site.is_ok:
            sys.exit(1)
    except Exception as error:
        # SystemExit is not an Exception subclass, so the exit above is
        # not intercepted here.
        print(error)
        sys.exit(1)
def test_accepted_hosts(self):
    """Check which hosts end up in ``config.accepted_hosts``.

    First with a single start URL, then with a ``-H`` option combined with
    several positional arguments.
    """
    # Single start URL: its host must be accepted.
    sys.argv = ['pylinkvalidator', 'http://www.example.com/']
    single_url_config = Config()
    single_url_config.parse_cli_config()
    self.assertTrue('www.example.com' in single_url_config.accepted_hosts)

    # -H option followed by several positional arguments.
    sys.argv = ['pylinkvalidator', '-H', 'www.example.com',
                'http://example.com', 'foo.com', 'http://www.example.com/',
                'baz.com']
    multi_host_config = Config()
    multi_host_config.parse_cli_config()

    for host in ('www.example.com', 'example.com', 'foo.com', 'baz.com'):
        self.assertTrue(host in multi_host_config.accepted_hosts)