def get_bad(bar_points, filename, outfilename):
	"""Filter an ObservedSites proto down to observations on flagged domains.

	Reads a CD.ObservedSites message from `filename`, gathers the distinct
	top_domain() of every observation's landing_url, and passes them to
	get_domain_reputation() together with `bar_points`. Observations whose
	domain is contained in the returned collection are copied, grouped under
	their original site name, into a new CD.ObservedSites message that also
	carries a copy of the input config. Sites left with no matching
	observation are omitted. The result is written to `outfilename`.

	NOTE(review): presumably `bar_points` is a reputation threshold and the
	helper returns the domains that fail it -- confirm against
	get_domain_reputation().
	"""
	source_sites = CD.ObservedSites()
	read_proto_from_file(source_sites, filename)

	# Deduplicate domains first so each one is submitted for lookup only once.
	unique_domains = set(
		top_domain(obs.landing_url)
		for entry in source_sites.site
		for obs in entry.observation
	)
	bad_domains = get_domain_reputation(list(unique_domains), bar_points)

	flagged_sites = CD.ObservedSites()
	flagged_sites.config.CopyFrom(source_sites.config)

	for entry in source_sites.site:
		hits = [
			obs for obs in entry.observation
			if top_domain(obs.landing_url) in bad_domains
		]
		if not hits:
			# Do not emit an empty site entry.
			continue
		flagged = flagged_sites.site.add()
		flagged.name = entry.name
		for obs in hits:
			flagged.observation.add().CopyFrom(obs)

	write_proto_to_file(flagged_sites, outfilename)
# Example no. 2
# 0
def get_domains(observed_sites_list, outfile):
	"""Write the distinct landing-URL domains of several ObservedSites files.

	Each entry of `observed_sites_list` is passed as a filename to
	read_proto_from_file() to load a CD.ObservedSites message. The
	top_domain() of every observation's landing_url is collected into a set,
	and the set is written to `outfile` one domain per line, with no trailing
	newline. The order of the lines is the set's iteration order, i.e.
	unspecified.

	An empty `observed_sites_list` produces an empty `outfile`.
	"""
	domain_set = set()
	for filename in observed_sites_list:
		observed_sites = CD.ObservedSites()
		read_proto_from_file(observed_sites, filename)
		for site in observed_sites.site:
			for observation in site.observation:
				domain_set.add(top_domain(observation.landing_url))
	# Use a context manager so the file is flushed and closed deterministically
	# instead of relying on garbage collection of the anonymous file object.
	with open(outfile, 'w') as out:
		out.write("\n".join(domain_set))