def change_domain_levels(src, dest, new_domain_level):
    """Aggregate the click records of ``src`` at ``new_domain_level`` into ``dest``.

    Each ``(referrer, target, num_clicks)`` record yielded by
    ``read_vm_file(src)`` has both hosts mapped through
    ``change_domain_level``.  Click counts of records that map to the same
    (referrer, target) pair are summed, and the totals are written to
    ``dest`` as tab-separated ``referrer, target, clicks`` rows.

    A record is dropped when either host is ``None``, when ``num_clicks`` is
    ``-1``, or when ``domain_level`` of either host equals 1.

    Args:
        src: Source passed to ``read_vm_file`` (presumably a file path).
        dest: Path of the output file; opened in ``"w"`` mode, so any
            existing content is replaced.
        new_domain_level: Level forwarded to ``change_domain_level``.
    """
    # Formatting the message up front keeps the output identical whether
    # ``print`` is the Python 2 statement or the Python 3 function.
    print("Processing %s" % (src,))

    clicks = {}
    for referrer, target, num_clicks in read_vm_file(src):
        # Cheap validity checks first, so domain_level() is never called
        # with a missing host.
        if referrer is None or target is None or num_clicks == -1:
            continue
        if domain_level(referrer) == 1 or domain_level(target) == 1:
            continue

        new_referrer = change_domain_level(referrer, new_domain_level)
        new_target = change_domain_level(target, new_domain_level)

        # Nested mapping: rewritten referrer -> rewritten target -> total.
        per_target = clicks.setdefault(new_referrer, {})
        per_target[new_target] = per_target.get(new_target, 0) + num_clicks

    with open(dest, "w") as destf:
        writer = csv.writer(destf, delimiter="\t")
        for new_referrer, per_target in clicks.items():
            for new_target, total in per_target.items():
                writer.writerow([new_referrer, new_target, total])
def domain_levels(hosts):
    """Return the set of distinct ``domain_level`` values found in ``hosts``.

    Args:
        hosts: Iterable of hosts; each one is passed to ``domain_level``.

    Returns:
        A ``set`` holding one entry per distinct level encountered.
    """
    # A set discards duplicates on its own, so no membership test is needed.
    return set(domain_level(host) for host in hosts)
def should_skip_host(h):
    """Tell whether host ``h`` should be skipped.

    Returns ``True`` when ``is_ip_address(h)`` holds, when
    ``domain_level(h)`` is at most 1, or when ``nth_level_domain(h, level)``
    is in ``UNWANTED_DOMAINS`` for any ``level`` in ``DOMAIN_LEVELS``.
    Otherwise the result of ``fnmatches_multiple(UNWANTED_PATTERNS, h)`` is
    returned as-is.
    """
    # ``or`` short-circuits, so domain_level() is only consulted for
    # hosts that are not IP addresses.
    if is_ip_address(h) or domain_level(h) <= 1:
        return True

    listed = any(
        nth_level_domain(h, level) in UNWANTED_DOMAINS
        for level in DOMAIN_LEVELS
    )
    if listed:
        return True

    return fnmatches_multiple(UNWANTED_PATTERNS, h)
def change_domain_levels(src, dest, new_domain_level):
    """Aggregate the click records of ``src`` at ``new_domain_level`` into ``dest``.

    Each ``(referrer, target, num_clicks)`` record yielded by
    ``read_vm_file(src)`` has both hosts mapped through
    ``change_domain_level``.  Click counts of records that map to the same
    (referrer, target) pair are summed, and the totals are written to
    ``dest`` as tab-separated ``referrer, target, clicks`` rows.

    A record is dropped when either host is ``None``, when ``num_clicks`` is
    ``-1``, or when ``domain_level`` of either host equals 1.

    Args:
        src: Source passed to ``read_vm_file`` (presumably a file path).
        dest: Path of the output file; opened in ``"w"`` mode, so any
            existing content is replaced.
        new_domain_level: Level forwarded to ``change_domain_level``.
    """
    # Formatting the message up front keeps the output identical whether
    # ``print`` is the Python 2 statement or the Python 3 function.
    print("Processing %s" % (src,))

    clicks = {}
    for referrer, target, num_clicks in read_vm_file(src):
        # Cheap validity checks first, so domain_level() is never called
        # with a missing host.
        if referrer is None or target is None or num_clicks == -1:
            continue
        if domain_level(referrer) == 1 or domain_level(target) == 1:
            continue

        new_referrer = change_domain_level(referrer, new_domain_level)
        new_target = change_domain_level(target, new_domain_level)

        # Nested mapping: rewritten referrer -> rewritten target -> total.
        per_target = clicks.setdefault(new_referrer, {})
        per_target[new_target] = per_target.get(new_target, 0) + num_clicks

    with open(dest, "w") as destf:
        writer = csv.writer(destf, delimiter="\t")
        for new_referrer, per_target in clicks.items():
            for new_target, total in per_target.items():
                writer.writerow([new_referrer, new_target, total])
"deco*.slides.com", "*.zynga.com", "fb.*.com", "app.*.com", "apps.*.com", "img.*.com", "static.*", "lax-v*", "syndication.*", "*nyadmcncserve*", "facebook.*" ]) domain_levels = set() for host in UNWANTED_DOMAINS: dl = domain_level(host) if dl not in domain_levels: domain_levels.add(dl) DOMAIN_LEVELS = frozenset(domain_levels) def should_skip_host(h): if is_ip_address(h): return True elif domain_level(h) <= 1: return True for dl in DOMAIN_LEVELS: if nth_level_domain(h, dl) in UNWANTED_DOMAINS: return True return fnmatches_multiple(UNWANTED_PATTERNS, h) def remove_unwanted(src, dest):