Example #1
0
def should_skip_host(h):
	"""Decide whether host `h` should be skipped.

	Returns True when any of the following holds, checked in this order:
	  * is_ip_address(h) is truthy;
	  * domain_level(h) is at most 1;
	  * for some level in DOMAIN_LEVELS, nth_level_domain(h, level) is a
	    member of UNWANTED_DOMAINS.
	Otherwise the result of fnmatches_multiple(UNWANTED_PATTERNS, h) is
	returned as-is.
	"""
	# Cheap structural checks first; `or` short-circuits exactly like the
	# original if/elif pair, so domain_level is not called for IP addresses.
	if is_ip_address(h) or domain_level(h) <= 1:
		return True

	# any() stops at the first level whose domain is unwanted.
	if any(nth_level_domain(h, level) in UNWANTED_DOMAINS for level in DOMAIN_LEVELS):
		return True

	# Fall back to the pattern-based check.
	return fnmatches_multiple(UNWANTED_PATTERNS, h)
def prune_news_dataset(news_sources_file):
	"""Print the pruned, sorted set of hosts read from `news_sources_file`.

	The file is read line by line. For each line, the text before the first
	'/' (after stripping surrounding whitespace) is passed through
	normalize_url(). The resulting host is kept unless it was already
	collected, is in UNWANTED_URLS, or matches EXCEPTION_PATTERNS (via
	fnmatches_multiple).

	In a second pass, the collected hosts are visited in sorted order. A host
	is printed to standard output, one per line, only if none of the values
	yielded by parents(host) is itself a collected host or in UNWANTED_URLS.

	Args:
		news_sources_file: path of the text file to read, one entry per line.

	Returns:
		None. The result is written to standard output.
	"""
	news_urls = set()

	# The context manager closes the file even if a helper raises mid-read;
	# the original left the handle open.
	with open(news_sources_file, 'r') as f:
		for line in f:
			# Keep only the part before the first '/', then normalize it.
			host = normalize_url(line.strip().split('/')[0])

			if host in news_urls or host in UNWANTED_URLS or fnmatches_multiple(EXCEPTION_PATTERNS, host):
				continue

			news_urls.add(host)

	# sorted() accepts any iterable, so no intermediate list is needed.
	for host in sorted(news_urls):
		# A host is redundant when one of its parents is already kept or is
		# explicitly unwanted; any() stops at the first such parent.
		has_listed_parent = any(
			parent in news_urls or parent in UNWANTED_URLS
			for parent in parents(host)
		)
		if not has_listed_parent:
			# Parenthesized single-argument form prints identically under
			# Python 2 and is valid syntax under Python 3.
			print(host)