Example 1
import argparse
import logging
import traceback
import unittest
from concurrent.futures import ThreadPoolExecutor
from logging import config
from threading import Event

# Crawler-specific helpers are assumed to be importable from the surrounding
# project, e.g. conf, PageFetcher, CrawlOrchestrator, LinkAggregator,
# SiteGraph, RobotsURLFilter, DomainFilter, URLNormalizer, extract_domain_port,
# extract_links and is_acceptable_url_scheme.


def init_crawler():
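    """Parse CLI arguments, wire up the crawler components, run the crawl to
    completion, and finally log the resulting site graph."""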
    parser = argparse.ArgumentParser(description='Crawl a single domain')
    parser.add_argument('--alt_conf_path', required=False, help='Path to a python config (without sections) that may be used to override config defaults')
    parser.add_argument('--log_conf', required=True, help='The location of the logging configuration file in the python logging config format')
    parser.add_argument('--url', required=True, help='The base url with the domain name of the site to be crawled. (e.g. http://acme.com)')
    args = vars(parser.parse_args())
    base_url = args['url']
    page_fetcher = None
    logger = None
    site_graph = None
    try:
        base_domain, port = extract_domain_port(base_url)
        conf_path = args.get('alt_conf_path', None)
        configuration = None
        if conf_path is None:
            configuration = conf.get_default()
        else:
            configuration = conf.from_file(conf_path)

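        # Configure logging from the file passed via --log_conf, then obtain
        # the logger named in the crawler configuration.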
        config.fileConfig(args['log_conf'])
        logger = logging.getLogger(configuration.logger_name)

        logger.debug("Base domain: %s", base_domain)

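        # Thread pool for page-parsing work, an event used to signal crawl
        # completion, and a normalizer that resolves links against the base
        # domain and port.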
        executor = ThreadPoolExecutor(max_workers=configuration.max_parser_workers)
        termination_cond_var = Event()
        url_norm = URLNormalizer(base_domain, port)
        normalized_url = url_norm.normalize_with_domain(base_url)
        logger.debug("Constructed normalized base url: %s", normalized_url)
        robots_fetch_timeout = configuration.connect_timeout + configuration.response_timeout
        robots_filter = RobotsURLFilter(normalized_url, robots_fetch_timeout, configuration.user_agent, logger)

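        # Link pipeline: each extracted link is normalized, then kept only if
        # it stays on the base domain, is allowed by robots.txt and uses an
        # acceptable URL scheme; accepted links are recorded in the site graph.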
        domain_filter = DomainFilter(base_domain, logger)
        site_graph = SiteGraph(logger)
        link_aggregator = LinkAggregator(
            logger, site_graph,
            link_mappers=[url_norm.normalize_with_domain],
            link_filters=[domain_filter.passes, robots_filter.passes,
                          is_acceptable_url_scheme])

        page_fetcher = PageFetcher(configuration, logger)
        orchestrator = CrawlOrchestrator(executor, logger, termination_cond_var,
                                         extract_links, link_aggregator,
                                         page_fetcher.fetch_page)
        page_fetcher.set_fetch_result_handler(orchestrator.handle_page_fetch_result)

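        # Seed the crawl with the normalized base URL and block until the
        # orchestrator signals completion.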
        logger.info("Initiating crawl...")
        orchestrator.init_crawl(normalized_url)
        termination_cond_var.wait()
        logger.info("Crawl complete. Shutting down...")
    except Exception:
        print("Error initiating crawl. Ensure that a valid url is "
              "provided & that the destination site is reachable")
        traceback.print_exc()
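
    # Always release fetcher resources; if initialization got far enough, also
    # log the final site graph.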
    if page_fetcher:
        page_fetcher.shutdown()

    if logger and site_graph:
        logger.info("Site graph, in the form [array of pages], [array of (src, dest) link tuples], follows:")
        logger.info(site_graph.stringize())
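

# Unit test exercising normalization, filtering and deduplication of links in
# LinkAggregator.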
class TestLinkAggregator(unittest.TestCase):
    # The TestCase wrapper is added so the method is runnable; the class name
    # is illustrative, as the original snippet showed only the method itself.

    def test_link_dedup(self):
        base_url = "acme.com:8999"
        base_domain, port = extract_domain_port(base_url)
        logger = logging.getLogger()
        url_norm = URLNormalizer(base_domain, port)
        normalized_url = url_norm.normalize_with_domain(base_url)
        logger.debug("Constructed normalized base url: %s", normalized_url)

        domain_filter = DomainFilter(base_domain, logger)
        site_graph = SiteGraph(logger)
        link_aggregator = LinkAggregator(
            logger, site_graph,
            link_mappers=[url_norm.normalize_with_domain],
            link_filters=[domain_filter.passes, is_acceptable_url_scheme])
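        # Relative links are resolved against the base domain and port, "."
        # path segments are collapsed, and query strings/fragments are
        # stripped; absolute links on the same domain pass even on a
        # different port (see expected_links).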
        valid_links = ["/a/b","/a/b/./","http://acme.com:8002/a","https://acme.com:8002/b?q=asd#frag"]
        expected_links = ["http://acme.com:8999/a/b","http://acme.com:8002/a","https://acme.com:8002/b"]

        # First invocation: pass the links along with a referrer page
        # (the normalized base url)
        filtered_links = link_aggregator.filter_update_links(valid_links, normalized_url)
        self.assertListEqual(expected_links,filtered_links)
        self.assertSetEqual(set(expected_links),link_aggregator._links)

        # Second invocation should result in deduplication
        filtered_links = link_aggregator.filter_update_links(valid_links, None)
        self.assertTrue(len(filtered_links) == 0)
        self.assertSetEqual(set(expected_links),link_aggregator._links)

        # None of the invalid links should pass
        invalid_links = ["mailto://[email protected]","code.acme.com","code.acme.com/b","https://127.122.9.1"]
        filtered_links = link_aggregator.filter_update_links(invalid_links, None)
        self.assertTrue(len(filtered_links) == 0)
        self.assertSetEqual(set(expected_links),link_aggregator._links)

        # A new valid link should pass
        new_valid_links = ["http://acme.com:8999/"]
        filtered_links = link_aggregator.filter_update_links(new_valid_links, None)
        expected_result = ["http://acme.com:8999"]
        self.assertListEqual(expected_result,filtered_links)
        expected_result_set = set(expected_links)
        expected_result_set.update(set(expected_result))
        self.assertSetEqual(expected_result_set,link_aggregator._links)

        self.assertEqual(len(expected_result_set), site_graph.num_nodes())
        for link in expected_result_set:
            self.assertTrue(site_graph.has_vertex(link))

        self.assertEqual(len(expected_links), site_graph.num_edges())
        for link in expected_links:
            self.assertTrue(site_graph.has_edge(normalized_url, link))