Example #1
    def test_get_crawl_priority_and_depth(self):
        test_whitelist = {
            "http://www.google.com": {},
            "http://www.sina.com.cn": {"crawl_priority": 2},
            "http://www.sina.cn": {"crawl_priority": 2, "crawl_depth": 3},
        }

        test_set = [
            ("http://www.google.jp", "offline", 1, 2),     # not whitelisted, domain, offline
            ("http://news.sina.com.cn", "online", 2, 0),   # whitelisted, subdomain, online
            ("http://news.sina.com", "online", 0, 0),      # not whitelisted, subdomain, online
            ("http://news.sina.cn/a/b", "offline", 2, 3),  # whitelisted, others (path URL), offline
        ]

        crawlerdb.config("localhost", database="test_db")
        crawlerdb.db.crawlDomainWhitelist.drop()
        for url, config in test_whitelist.items():
            crawlerdb.save_crawl_domain_info(
                url,
                crawl_priority=config.get("crawl_priority", -1),
                crawl_depth=config.get("crawl_depth", -1))

        for url, source, expected_priority, expected_depth in test_set:
            print(url)
            priority, depth = CrawlUrlHelper.get_crawl_priority_and_depth(url, source)
            self.assertEqual(priority, expected_priority)
            self.assertEqual(depth, expected_depth)

        crawlerdb.db.crawlDomainWhitelist.drop()
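
For context, here is a minimal sketch of the resolution logic this test implies. The precedence rule (whitelist-configured values win, everything else falls back to defaults keyed on source and URL type) and the default values themselves are inferred purely from the expected tuples above; the real rules live inside CrawlUrlHelper, and the DEFAULTS table and classifier below are assumptions, not the library's API.

from urllib.parse import urlparse

# Assumed defaults keyed on (source, url_type); the values are read off the
# expected (priority, depth) tuples in the test, not taken from the library.
DEFAULTS = {
    ("offline", "domain"): (1, 2),
    ("online", "subdomain"): (0, 0),
    ("offline", "others"): (1, 2),
}

def classify(url):
    # Rough url_type classifier for the sketch: a non-empty path makes the
    # URL "others"; a www.-prefixed host is treated as the site root.
    parsed = urlparse(url)
    if parsed.path not in ("", "/"):
        return "others"
    return "domain" if parsed.netloc.startswith("www.") else "subdomain"

def get_crawl_priority_and_depth(url, source, whitelist):
    host = urlparse(url).netloc
    default_priority, default_depth = DEFAULTS[(source, classify(url))]
    for entry, config in whitelist.items():
        domain = urlparse(entry).netloc
        if domain.startswith("www."):
            domain = domain[4:]
        if host == domain or host.endswith("." + domain):
            # Whitelist-configured values take precedence; -1 means "not
            # configured", matching the sentinel passed to
            # save_crawl_domain_info in the test above.
            priority = config.get("crawl_priority", -1)
            depth = config.get("crawl_depth", -1)
            return (priority if priority >= 0 else default_priority,
                    depth if depth >= 0 else default_depth)
    return default_priority, default_depth

With the test_whitelist above, this sketch reproduces all four expected tuples: news.sina.com.cn inherits priority 2 from sina.com.cn but falls back to the default depth 0, while news.sina.cn/a/b takes both 2 and 3 from its whitelist entry.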
Example #2
    def test_match_whitelist(self):
        test_whitelist = [
            "http://www.google.com",
            "http://www.sina.com.cn",
            "http://www.sina.cn",
        ]

        test_set = [
            ("http://www.google.jp", "domain", False),
            ("http://www.google.jp", "full_domain", False),
            ("http://news.sina.com.cn", "full_domain", False),
            ("http://news.sina.com", "full_domain", False),
            ("http://news.sina.com.cn", "host", False),
            ("http://news.sina.cn", "domain", False),
            ("http://news.sina.com.cn/a/b", "host", False),
            ("http://3g.sina.cn/a/b", "domain", False),
        ]

        crawlerdb.config("localhost", database="test_db")
        crawlerdb.db.crawlDomainWhitelist.drop()
        for url in test_whitelist:
            crawlerdb.save_crawl_domain_info(url)

        for url, match_section, expected in test_set:
            common_settings.general_crawl_policies["url_match_domain_type"] = match_section
            common_settings.general_crawl_policies["url_match_target"] = "whitelist"
            self.assertEqual(CrawlUrlHelper.valid_crawl_url(url, None), expected)

        crawlerdb.db.crawlDomainWhitelist.drop()
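
One way to read the all-False expectations: the whitelist stores the hosts exactly as saved (www.-prefixed), while the lookup key is derived from the candidate URL at the granularity selected by url_match_domain_type, so none of these keys ever matches exactly. A hedged sketch of that reading follows; match_key, the set-based whitelist, and the policies dict are illustrative assumptions, not the library's API.

from urllib.parse import urlparse

def match_key(url, match_section):
    # Hypothetical granularities mirroring the url_match_domain_type values
    # the test assigns. A real implementation would need a public-suffix
    # list; the last-two-labels "domain" rule below is a naive stand-in.
    host = urlparse(url).netloc
    labels = host.split(".")
    if match_section == "host":
        return host                    # exact host: news.sina.com.cn
    if match_section == "full_domain":
        return ".".join(labels[1:])    # drop one label: sina.com.cn
    return ".".join(labels[-2:])       # "domain": sina.cn

def valid_crawl_url(url, whitelist_hosts, policies):
    # With url_match_target set to "whitelist", only URLs whose key appears
    # in the whitelist are considered crawlable.
    if policies.get("url_match_target") == "whitelist":
        return match_key(url, policies["url_match_domain_type"]) in whitelist_hosts
    return True

whitelist_hosts = {"www.google.com", "www.sina.com.cn", "www.sina.cn"}
policies = {"url_match_target": "whitelist", "url_match_domain_type": "domain"}
print(valid_crawl_url("http://3g.sina.cn/a/b", whitelist_hosts, policies))  # False

Under this reading, every row of test_set comes out False, matching the expectations: even 3g.sina.cn, whose naive "domain" key is sina.cn, misses because the stored entry is the literal host www.sina.cn.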