def crawl_with_options(urls, options_dict=None, logger_builder=None):
    """Crawl the given URLs using explicit options and an optional logger.

    :param urls: The URLs handed to ``Config.parse_api_config``.
    :param options_dict: Must contain the long name of the command line
        options. (optional)
    :param logger_builder: Zero-argument callable invoked to create the
        logger. When omitted (or falsy), the logger is built from the
        configuration with ``configure_logger``. (optional)
    :rtype: A pylinkchecker.crawler.Site instance
    """
    config = Config()
    config.parse_api_config(urls, options_dict)

    # Prefer the caller-supplied factory; otherwise derive the logger from
    # the parsed configuration.
    logger = logger_builder() if logger_builder else configure_logger(config)

    # TODO In the future, we will pass the logger builder and not the logger
    # to enable the ProcessSiteCrawler to instantiate its own custom logger.
    return execute_from_config(config, logger).site
def test_accepted_hosts(self):
    """Check which hosts end up in ``config.accepted_hosts`` after CLI parsing.

    Two command lines are parsed: one with a single URL, and one that
    combines the ``-H`` option with several further arguments. In both
    cases every host listed in the assertions below is expected to be
    present in ``config.accepted_hosts``.
    """
    # The test replaces the process-wide sys.argv; put the original list
    # back afterwards so the fake command line does not leak into other tests.
    self.addCleanup(setattr, sys, 'argv', sys.argv)

    sys.argv = ['pylinkchecker', 'http://www.example.com/']
    config = Config()
    config.parse_cli_config()
    self.assertIn('www.example.com', config.accepted_hosts)

    sys.argv = ['pylinkchecker', '-H', 'www.example.com',
                'http://example.com', 'foo.com', 'http://www.example.com/',
                'baz.com']
    config = Config()
    config.parse_cli_config()

    # assertIn reports both the missing host and the actual collection on
    # failure, unlike assertTrue(x in y).
    for host in ('www.example.com', 'example.com', 'foo.com', 'baz.com'):
        self.assertIn(host, config.accepted_hosts)
def crawl(url):
    """Crawl a single URL with the default API configuration.

    The URL is wrapped in a one-element list and parsed through
    ``Config.parse_api_config``; the logger is built from the resulting
    configuration.

    :param url: The URL to crawl.
    :rtype: A pylinkchecker.crawler.Site instance
    """
    api_config = Config()
    api_config.parse_api_config([url])

    site_crawler = execute_from_config(api_config,
                                       configure_logger(api_config))
    return site_crawler.site
def _run_crawler_plain(self, crawler_class, other_options=None):
    """Crawl ``/index.html`` with ``crawler_class`` and return the site.

    A fake command line (``-m process`` plus the URL obtained from
    ``self.get_url``) is installed in ``sys.argv`` and parsed into a
    ``Config``.

    :param crawler_class: Callable taking ``(config, logger)`` and returning
        an object with a ``crawl()`` method and a ``site`` attribute.
    :param other_options: Extra command line arguments appended after the
        URL. (optional)
    :returns: The ``site`` attribute of the crawler after crawling.
    """
    index_url = self.get_url("/index.html")

    sys.argv = ['pylinkchecker', "-m", "process", index_url]
    if other_options:
        # Nothing to append when no (or empty) extra options are supplied.
        sys.argv.extend(other_options)

    cli_config = Config()
    cli_config.parse_cli_config()

    site_crawler = crawler_class(cli_config, get_logger())
    site_crawler.crawl()
    return site_crawler.site
def execute_from_command_line():
    """Run the crawler with the configuration taken from the command line.

    A report is produced when the crawled site is not OK, or when the
    ``when`` option equals ``WHEN_ALWAYS``. The process exits with status 1
    if the site is not OK or if any ``Exception`` is raised along the way
    (the exception is printed first).
    """
    try:
        started_at = time.time()

        config = Config()
        config.parse_cli_config()
        logger = configure_logger(config)
        crawler = execute_from_config(config, logger)

        elapsed = time.time() - started_at

        must_report = (not crawler.site.is_ok
                       or config.options.when == WHEN_ALWAYS)
        if must_report:
            report(crawler.site, config, elapsed, logger)

        if not crawler.site.is_ok:
            # SystemExit is not an Exception subclass, so the handler below
            # does not intercept this exit.
            sys.exit(1)
    except Exception as error:
        print(error)
        sys.exit(1)