def get_bad(bar_points, filename, outfilename):
    # Load the observed sites proto and collect the top domain of every
    # observation's landing URL.
    observed_sites = CD.ObservedSites()
    read_proto_from_file(observed_sites, filename)
    domain_set = set()
    for site in observed_sites.site:
        for observation in site.observation:
            url_domain = top_domain(observation.landing_url)
            domain_set.add(url_domain)
    domain_list = list(domain_set)
    # Look up domain reputation and keep only the domains judged bad.
    bad_domains = get_domain_reputation(domain_list, bar_points)
    # Copy the config, then rebuild the sites keeping only observations
    # whose landing domain is in the bad set.
    bad_observed_sites = CD.ObservedSites()
    bad_observed_sites.config.CopyFrom(observed_sites.config)
    for site in observed_sites.site:
        observation_list = list()
        for observation in site.observation:
            if top_domain(observation.landing_url) in bad_domains:
                observation_list.append(observation)
        if not observation_list:
            continue
        bad_site = bad_observed_sites.site.add()
        bad_site.name = site.name
        for observation in observation_list:
            to_add = bad_site.observation.add()
            to_add.CopyFrom(observation)
    write_proto_to_file(bad_observed_sites, outfilename)
def get_domains(observed_sites_list, outfile):
    # Collect the unique top domains seen across all the given observed-sites
    # proto files and write them to outfile, one per line.
    domain_set = set()
    for filename in observed_sites_list:
        observed_sites = CD.ObservedSites()
        read_proto_from_file(observed_sites, filename)
        for site in observed_sites.site:
            for observation in site.observation:
                url_domain = top_domain(observation.landing_url)
                domain_set.add(url_domain)
    with open(outfile, 'w') as f:
        f.write("\n".join(domain_set))
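
# A minimal usage sketch, assuming CD, read_proto_from_file, write_proto_to_file,
# top_domain, and get_domain_reputation are the project's own helpers, and that
# bar_points is the reputation threshold forwarded to get_domain_reputation.
# The file paths and threshold value below are hypothetical examples.
if __name__ == "__main__":
    # Dump the unique landing-URL domains seen across two observed-sites files.
    get_domains(["observed_sites_0.bin", "observed_sites_1.bin"],
                "all_domains.txt")
    # Filter one observed-sites file down to observations on bad domains.
    get_bad(bar_points=0.5, filename="observed_sites_0.bin",
            outfilename="bad_observed_sites.bin")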