def _run_crawler_plain(
        self, crawler_class, other_options=None, url="/index.html"):
    """Run a crawler of the given class against url and return its site."""
    url = self.get_url(url)
    sys.argv = ['pylinkvalidator', "-m", "process", url]
    if not other_options:
        other_options = []
    sys.argv.extend(other_options)

    config = Config()
    config.parse_cli_config()

    crawler = crawler_class(config, get_logger())
    crawler.crawl()

    if config.options.multi:
        crawler.site.collect_multi_sites()

    return crawler.site
def get_page_crawler(self, url):
    """Build a PageCrawler with a default worker configuration for url."""
    url = self.get_url(url)
    url_split = get_clean_url_split(url)

    input_queue = compat.Queue.Queue()
    output_queue = compat.Queue.Queue()

    worker_config = WorkerConfig(
        username=None, password=None,
        types=['a', 'img', 'link', 'script'], timeout=5,
        parser=PARSER_STDLIB, strict_mode=False,
        prefer_server_encoding=False, extra_headers=[])

    worker_init = WorkerInit(
        worker_config=worker_config,
        input_queue=input_queue,
        output_queue=output_queue,
        logger=get_logger())

    page_crawler = PageCrawler(worker_init)

    return page_crawler, url_split
def test_url_file_path(self):
    """Crawl the URLs listed in a file passed via --url-file-path."""
    (_, temp_file_path) = mkstemp()
    url = self.get_url("/index.html")
    url2 = self.get_url("/robots.txt")
    with open(temp_file_path, "w") as temp_file:
        temp_file.write(url + "\n")
        temp_file.write(url2 + "\n")

    sys.argv = [
        "pylinkvalidator", "-m", "process", "--url-file-path",
        temp_file_path]
    config = Config()
    config.parse_cli_config()

    crawler = ThreadSiteCrawler(config, get_logger())
    crawler.crawl()

    site = crawler.site
    self.assertEqual(12, len(site.pages))
    self.assertEqual(1, len(site.error_pages))

    os.unlink(temp_file_path)