def should_skip_host(h):
    """Decide whether host ``h`` should be skipped.

    Returns True when any of the following holds, checked in this order:

    * ``is_ip_address(h)`` is truthy;
    * ``domain_level(h)`` is at most 1;
    * for some level in ``DOMAIN_LEVELS``, ``nth_level_domain(h, level)``
      is a member of ``UNWANTED_DOMAINS``.

    Otherwise the result of ``fnmatches_multiple(UNWANTED_PATTERNS, h)``
    is returned unchanged.
    """
    if is_ip_address(h) or domain_level(h) <= 1:
        return True

    # Stops at the first level whose domain is listed as unwanted.
    listed_as_unwanted = any(
        nth_level_domain(h, level) in UNWANTED_DOMAINS
        for level in DOMAIN_LEVELS
    )
    if listed_as_unwanted:
        return True

    return fnmatches_multiple(UNWANTED_PATTERNS, h)
def prune_news_dataset(news_sources_file):
    """Print the pruned, sorted set of hosts read from ``news_sources_file``.

    Each line of the file is stripped, cut at the first ``/`` and passed
    through ``normalize_url``. A host is dropped when it was already seen,
    is in ``UNWANTED_URLS``, or matches ``EXCEPTION_PATTERNS`` (via
    ``fnmatches_multiple``).

    The remaining hosts are visited in sorted order; a host is printed to
    standard output only if none of the values yielded by ``parents(host)``
    is itself a kept host or a member of ``UNWANTED_URLS``.

    :param news_sources_file: path of the text file to read, one entry
        per line.
    :returns: None; results are written to standard output.
    """
    # NOTE(review): taking the text before the first '/' presumably assumes
    # entries carry no "scheme://" prefix -- confirm against the input data.
    news_urls = set()

    # The context manager guarantees the file is closed, even if one of the
    # helpers raises while the lines are being processed.
    with open(news_sources_file, 'r') as f:
        for line in f:
            host = normalize_url(line.strip().split('/')[0])
            if (host in news_urls
                    or host in UNWANTED_URLS
                    or fnmatches_multiple(EXCEPTION_PATTERNS, host)):
                continue
            news_urls.add(host)

    for host in sorted(news_urls):
        # Stops at the first parent that is already covered.
        covered_by_parent = any(
            parent in news_urls or parent in UNWANTED_URLS
            for parent in parents(host)
        )
        if not covered_by_parent:
            # Single-argument parenthesised form prints identically under
            # both the Python 2 print statement and the print function.
            print(host)