def init_crawler():
    parser = argparse.ArgumentParser(description='Crawl a single domain')
    parser.add_argument('--alt_conf_path', required=False,
                        help='Path to a python config (without sections) that may be used '
                             'to override config defaults')
    parser.add_argument('--log_conf', required=True,
                        help='The location of the logging configuration file in the '
                             'python logging config format')
    parser.add_argument('--url', required=True,
                        help='The base url with the domain name of the site to be crawled. '
                             '(e.g. http://acme.com)')
    args = vars(parser.parse_args())
    base_url = args['url']
    page_fetcher = None
    logger = None
    site_graph = None
    try:
        base_domain, port = extract_domain_port(base_url)

        # Load the default configuration unless an override file was supplied
        conf_path = args.get('alt_conf_path', None)
        if conf_path is None:
            configuration = conf.get_default()
        else:
            configuration = conf.from_file(conf_path)

        config.fileConfig(args['log_conf'])
        logger = logging.getLogger(configuration.logger_name)
        logger.debug("Base domain : %s", base_domain)

        executor = ThreadPoolExecutor(max_workers=configuration.max_parser_workers)
        termination_cond_var = Event()

        url_norm = URLNormalizer(base_domain, port)
        normalized_url = url_norm.normalize_with_domain(base_url)
        logger.debug("Constructed normalized base url : %s", normalized_url)

        # Build the link filters: robots.txt rules, same-domain restriction and URL scheme
        robots_fetch_timeout = configuration.connect_timeout + configuration.response_timeout
        robots_filter = RobotsURLFilter(normalized_url, robots_fetch_timeout,
                                        configuration.user_agent, logger)
        domain_filter = DomainFilter(base_domain, logger)

        site_graph = SiteGraph(logger)
        link_aggregator = LinkAggregator(logger, site_graph,
                                         link_mappers=[url_norm.normalize_with_domain],
                                         link_filters=[domain_filter.passes,
                                                       robots_filter.passes,
                                                       is_acceptable_url_scheme])

        page_fetcher = PageFetcher(configuration, logger)
        orchestrator = CrawlOrchestrator(executor, logger, termination_cond_var,
                                         extract_links, link_aggregator,
                                         page_fetcher.fetch_page)
        page_fetcher.set_fetch_result_handler(orchestrator.handle_page_fetch_result)

        logger.info("Initiating crawl...")
        orchestrator.init_crawl(normalized_url)

        # Block until the orchestrator signals that the crawl has finished
        termination_cond_var.wait()
        logger.info("Crawl complete. Shutting down...")
    except Exception:
        print("Error initiating crawl. Ensure that a valid url is "
              "provided & that the destination site is reachable")
        traceback.print_exc()

    if page_fetcher:
        page_fetcher.shutdown()
    if logger and site_graph:
        logger.info("Site graph, in the form [array of pages], "
                    "[array of (src, dest) link tuples] follows:")
        logger.info(site_graph.stringize())
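
# A minimal sketch of how init_crawler() might be wired up as a script entry point.
# The module name "crawler.py" in the example invocation below is a hypothetical
# placeholder, not taken from this repository; the flags are the ones defined by
# the argument parser above, e.g.:
#   python crawler.py --url http://acme.com --log_conf logging.conf
if __name__ == '__main__':
    init_crawler()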
def test_link_dedup(self):
    base_url = "acme.com:8999"
    base_domain, port = extract_domain_port(base_url)
    logger = logging.getLogger()
    url_norm = URLNormalizer(base_domain, port)
    normalized_url = url_norm.normalize_with_domain(base_url)
    logger.debug("Constructed normalized base url : %s", normalized_url)

    domain_filter = DomainFilter(base_domain, logger)
    site_graph = SiteGraph(logger)
    link_aggregator = LinkAggregator(logger, site_graph,
                                     link_mappers=[url_norm.normalize_with_domain],
                                     link_filters=[domain_filter.passes,
                                                   is_acceptable_url_scheme])

    valid_links = ["/a/b", "/a/b/./", "http://acme.com:8002/a",
                   "https://acme.com:8002/b?q=asd#frag"]
    expected_links = ["http://acme.com:8999/a/b", "http://acme.com:8002/a",
                      "https://acme.com:8002/b"]

    # This time, we also specify a referrer page
    filtered_links = link_aggregator.filter_update_links(valid_links, normalized_url)
    self.assertListEqual(expected_links, filtered_links)
    self.assertSetEqual(set(expected_links), link_aggregator._links)

    # Second invocation should result in deduplication
    filtered_links = link_aggregator.filter_update_links(valid_links, None)
    self.assertTrue(len(filtered_links) == 0)
    self.assertSetEqual(set(expected_links), link_aggregator._links)

    # None of the invalid links should pass
    invalid_links = ["mailto://[email protected]", "code.acme.com",
                     "code.acme.com/b", "https://127.122.9.1"]
    filtered_links = link_aggregator.filter_update_links(invalid_links, None)
    self.assertTrue(len(filtered_links) == 0)
    self.assertSetEqual(set(expected_links), link_aggregator._links)

    # A new valid link should pass
    new_valid_links = ["http://acme.com:8999/"]
    filtered_links = link_aggregator.filter_update_links(new_valid_links, None)
    expected_result = ["http://acme.com:8999"]
    self.assertListEqual(expected_result, filtered_links)

    expected_result_set = set(expected_links)
    expected_result_set.update(set(expected_result))
    self.assertSetEqual(expected_result_set, link_aggregator._links)

    # The site graph should hold a vertex for every unique accepted link...
    self.assertEqual(len(expected_result_set), site_graph.num_nodes())
    for link in expected_result_set:
        self.assertTrue(site_graph.has_vertex(link))

    # ...and an edge from the referrer page to each link discovered on it
    self.assertEqual(len(expected_links), site_graph.num_edges())
    for link in expected_links:
        self.assertTrue(site_graph.has_edge(normalized_url, link))
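
# For illustration only: a toy sketch of the map -> filter -> dedup contract that
# the test above exercises. This is not the project's LinkAggregator; the function
# and parameter names here are hypothetical and only mirror the behaviour asserted
# by the test (each link is mapped, filtered, and accepted at most once).
def _toy_filter_update(seen, links, mappers, filters):
    accepted = []
    for link in links:
        # Apply every mapper in order (e.g. normalization with the base domain)
        for mapper in mappers:
            link = mapper(link)
        # Keep the link only if it passes all filters and has not been seen before
        if link not in seen and all(f(link) for f in filters):
            seen.add(link)
            accepted.append(link)
    return accepted

# Example: duplicates are dropped on the second occurrence
#   _toy_filter_update(set(), ["/a", "/a"], [str.strip], [lambda l: True]) == ["/a"]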