Example #1
    def test_domain_extraction(self):
        test_expectation = {'httPs://www.foo.co.nz': ("foo.co.nz", None),
                            ' hTTp://www.acme.com:899/kjbk ': ("acme.com", 899),
                            'http://bar.org:80/?q=sds+asa#frag': ("bar.org", None),
                            'www.acme.com': ("acme.com", None),
                            'acme.com:9004': ("acme.com", 9004)}

        # Valid URLs: scheme and case are tolerated, whitespace is trimmed,
        # and the default http port (80) is reported as None.
        for url, expected_result in test_expectation.items():
            actual_result = extract_domain_port(url)
            self.assertTupleEqual(actual_result, expected_result)

        # Unsupported or unrecognizable schemes
        invalid_scheme_urls = ["htt://foo.com", "mailto://[email protected]", "__SD@  "]
        for url in invalid_scheme_urls:
            self.assertRaisesRegex(ValueError, "scheme must be http or https", extract_domain_port, url)

        # Null, empty, or blank input
        empty_urls = ["", None, "    "]
        for url in empty_urls:
            self.assertRaisesRegex(ValueError, "cannot be null", extract_domain_port, url)

        # URLs whose domain component is missing
        null_or_empty_domains = ["http:///a", "http://:9/bar"]
        for url in null_or_empty_domains:
            self.assertRaisesRegex(ValueError, "Null or empty domain", extract_domain_port, url)

        # Malformed or overly long domains
        invalid_domain_urls = ["https://www.__)2.com",
                               "http://lkjhhjjhhhlkjhhjjhhhlkjhhjjhhhlkjhhjjhhhlkjhhjjhhhlkjhhjjhhhlkjhhjjhhh."
                               + "lkjhhjjhhhlkjhhjjhhhlkjhhjjhhhlkjhhjjhhhlkjhhjjhhhlkjhhjjhhhlkjhhjjhhh.com"]
        for url in invalid_domain_urls:
            self.assertRaisesRegex(ValueError, "Invalid domain provided", extract_domain_port, url)
Example #2
    def test_link_dedup(self):
        base_url = "acme.com:8999"
        base_domain, port = extract_domain_port(base_url)
        logger = logging.getLogger()
        url_norm = URLNormalizer(base_domain, port)
        normalized_url = url_norm.normalize_with_domain(base_url)
        logger.debug("Constructed normalized base url : %s"%normalized_url)

        domain_filter = DomainFilter(base_domain, logger)
        site_graph = SiteGraph(logger)
        link_aggregator = LinkAggregator(logger, site_graph,
                                         link_mappers=[url_norm.normalize_with_domain],
                                         link_filters=[domain_filter.passes, is_acceptable_url_scheme])
        valid_links = ["/a/b", "/a/b/./", "http://acme.com:8002/a", "https://acme.com:8002/b?q=asd#frag"]
        expected_links = ["http://acme.com:8999/a/b", "http://acme.com:8002/a", "https://acme.com:8002/b"]

        # First invocation, with the normalized base URL as the referrer page
        filtered_links = link_aggregator.filter_update_links(valid_links, normalized_url)
        self.assertListEqual(expected_links, filtered_links)
        self.assertSetEqual(set(expected_links), link_aggregator._links)

        # Second invocation should result in deduplication
        filtered_links = link_aggregator.filter_update_links(valid_links, None)
        self.assertEqual(len(filtered_links), 0)
        self.assertSetEqual(set(expected_links), link_aggregator._links)

        # None of the invalid links should pass
        invalid_links = ["mailto://[email protected]","code.acme.com","code.acme.com/b","https://127.122.9.1"]
        filtered_links = link_aggregator.filter_update_links(invalid_links, None)
        self.assertTrue(len(filtered_links) == 0)
        self.assertSetEqual(set(expected_links),link_aggregator._links)

        # A new valid link should pass
        new_valid_links = ["http://acme.com:8999/"]
        filtered_links = link_aggregator.filter_update_links(new_valid_links, None)
        expected_result = ["http://acme.com:8999"]
        self.assertListEqual(expected_result, filtered_links)
        expected_result_set = set(expected_links)
        expected_result_set.update(expected_result)
        self.assertSetEqual(expected_result_set, link_aggregator._links)

        self.assertEqual(len(expected_result_set), site_graph.num_nodes())
        for link in expected_result_set:
            self.assertTrue(site_graph.has_vertex(link))

        self.assertEqual(len(expected_links), site_graph.num_edges())
        for link in expected_links:
            self.assertTrue(site_graph.has_edge(normalized_url, link))
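
The LinkAggregator and SiteGraph classes exercised above are likewise not shown on this page. Below is a minimal, self-contained sketch that matches the deduplication and graph-recording behaviour asserted in the test; the class names come from the test, but the internal data structures, the method bodies, and the choice to run filters on the raw link before the mappers are assumptions rather than the project's actual implementation (URLNormalizer, DomainFilter, and the filter/mapper callables are taken as given and injected from outside).

# Hypothetical sketch of the classes exercised above; the project's real
# LinkAggregator and SiteGraph may differ in detail.


class SiteGraph:
    """Minimal directed graph: vertices are normalized URLs, edges go referrer -> link."""

    def __init__(self, logger):
        self._logger = logger
        self._edges = {}  # vertex -> set of outgoing neighbours

    def add_vertex(self, vertex):
        self._edges.setdefault(vertex, set())

    def add_edge(self, source, target):
        self.add_vertex(source)
        self.add_vertex(target)
        self._edges[source].add(target)

    def has_vertex(self, vertex):
        return vertex in self._edges

    def has_edge(self, source, target):
        return target in self._edges.get(source, set())

    def num_nodes(self):
        return len(self._edges)

    def num_edges(self):
        return sum(len(targets) for targets in self._edges.values())


class LinkAggregator:
    """Filters, normalizes, and deduplicates links, recording them in a SiteGraph."""

    def __init__(self, logger, site_graph, link_mappers=(), link_filters=()):
        self._logger = logger
        self._site_graph = site_graph
        self._link_mappers = list(link_mappers)
        self._link_filters = list(link_filters)
        self._links = set()  # every link accepted so far

    def filter_update_links(self, links, referrer=None):
        accepted = []
        for link in links:
            # Filters run on the raw link; any rejection drops it.
            if not all(link_filter(link) for link_filter in self._link_filters):
                continue
            # Mappers normalize the link (e.g. resolve it against the base domain).
            for mapper in self._link_mappers:
                link = mapper(link)
            # Deduplicate against everything accepted in this and earlier invocations.
            if link in self._links:
                continue
            self._links.add(link)
            accepted.append(link)
            self._site_graph.add_vertex(link)
            if referrer is not None:
                self._site_graph.add_edge(referrer, link)
        return accepted

Running the filters before the mappers is one way to let a relative path such as "/a/b" survive the domain filter while "code.acme.com" is rejected; the real classes may resolve this ordering differently.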