def test_get_crawl_priority_and_depth(self):
    test_whitelist = {
        "http://www.google.com": {},
        "http://www.sina.com.cn": {"crawl_priority": 2},
        "http://www.sina.cn": {"crawl_priority": 2, "crawl_depth": 3},
    }
    test_set = [
        ("http://www.google.jp", "offline", 1, 2),     # not whitelisted, domain, offline
        ("http://news.sina.com.cn", "online", 2, 0),   # whitelisted, subdomain, online
        ("http://news.sina.com", "online", 0, 0),      # not whitelisted, subdomain, online
        ("http://news.sina.cn/a/b", "offline", 2, 3),  # whitelisted, URL with path, offline
    ]

    crawlerdb.config("localhost", database="test_db")
    crawlerdb.db.crawlDomainWhitelist.drop()
    # -1 marks crawl_priority / crawl_depth as "not configured" for this domain.
    for url, config in test_whitelist.items():
        crawlerdb.save_crawl_domain_info(
            url,
            crawl_priority=config.get("crawl_priority", -1),
            crawl_depth=config.get("crawl_depth", -1))

    for url, source, expected_priority, expected_depth in test_set:
        print(url)
        priority, depth = CrawlUrlHelper.get_crawl_priority_and_depth(url, source)
        self.assertEqual(priority, expected_priority)
        self.assertEqual(depth, expected_depth)

    crawlerdb.db.crawlDomainWhitelist.drop()
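# The sketch below is illustration only, not the code under test: it spells out
# the behaviour the assertions above pin down. A whitelisted domain keeps its
# stored crawl_priority / crawl_depth, and -1 (the "not configured" sentinel
# passed to save_crawl_domain_info) falls back to source-based defaults. The
# defaults table and the lookup helper crawlerdb.get_crawl_domain_info are
# assumptions inferred from the test data, not confirmed project APIs.
def _sketch_get_crawl_priority_and_depth(url, source):
    # Assumed defaults inferred from the expectations above:
    # "offline" -> (priority 1, depth 2), "online" -> (priority 0, depth 0).
    default_priority, default_depth = {"offline": (1, 2), "online": (0, 0)}[source]
    entry = crawlerdb.get_crawl_domain_info(url)  # hypothetical lookup helper
    if entry is None:
        return default_priority, default_depth
    priority = entry.get("crawl_priority", -1)
    depth = entry.get("crawl_depth", -1)
    return (priority if priority != -1 else default_priority,
            depth if depth != -1 else default_depth)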
def test_match_whitelist(self):
    test_whitelist = [
        "http://www.google.com",
        "http://www.sina.com.cn",
        "http://www.sina.cn",
    ]
    test_set = [
        ("http://www.google.jp", "domain", False),
        ("http://www.google.jp", "full_domain", False),
        ("http://news.sina.com.cn", "full_domain", False),
        ("http://news.sina.com", "full_domain", False),
        ("http://news.sina.com.cn", "host", False),
        ("http://news.sina.cn", "domain", False),
        ("http://news.sina.com.cn/a/b", "host", False),
        ("http://3g.sina.cn/a/b", "domain", False),
    ]

    crawlerdb.config("localhost", database="test_db")
    crawlerdb.db.crawlDomainWhitelist.drop()
    for url in test_whitelist:
        crawlerdb.save_crawl_domain_info(url)

    # The match target is constant for every case; only the domain-type
    # policy varies per test case.
    common_settings.general_crawl_policies["url_match_target"] = "whitelist"
    for url, match_section, expected in test_set:
        common_settings.general_crawl_policies["url_match_domain_type"] = match_section
        self.assertEqual(CrawlUrlHelper.valid_crawl_url(url, None), expected)

    crawlerdb.db.crawlDomainWhitelist.drop()
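# Illustration only: one plausible way valid_crawl_url could reduce a URL to
# the section that gets compared against the whitelist, depending on the
# url_match_domain_type policy. The naive label-splitting below ignores the
# public-suffix handling a real implementation would need ("sina.com.cn" would
# be reduced to "com.cn" under "domain"); it is a sketch of the idea, not the
# project's actual matching logic.
from urlparse import urlparse  # Python 2 stdlib, matching the code base

def _sketch_match_section(url, match_type):
    host = urlparse(url).netloc          # e.g. "news.sina.com.cn"
    if match_type == "host":
        return host                      # compare the full host verbatim
    labels = host.split(".")
    if match_type == "domain":
        return ".".join(labels[-2:])     # last two labels, e.g. "sina.cn"
    return ".".join(labels[1:])          # "full_domain": drop the leftmost label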