def change_domain_levels(src, dest, new_domain_level):
    """Re-aggregate the click counts in ``src`` at ``new_domain_level``.

    Rows are loaded with ``read_vm_file(src)`` and unpacked as
    ``(referrer, target, num_clicks)``.  A row is dropped when either host
    is ``None``, when ``num_clicks`` is ``-1``, or when ``domain_level()``
    reports 1 for either host.  The remaining hosts are passed through
    ``change_domain_level(host, new_domain_level)`` and the click counts of
    rows that end up with the same (referrer, target) pair are summed.

    The totals are written to ``dest`` as tab-separated
    ``referrer, target, clicks`` rows.  Nothing is returned.
    """
    import sys

    # One pre-formatted argument prints the same text whether ``print`` is
    # the Python 2 statement or the Python 3 function.
    print("Processing %s" % (src,))

    clicks = {}
    for referrer, target, num_clicks in read_vm_file(src):
        # The None / -1 checks come first so that domain_level() is never
        # called on a missing host.
        if (
            referrer is None
            or target is None
            or num_clicks == -1
            or domain_level(referrer) == 1
            or domain_level(target) == 1
        ):
            continue
        newr = change_domain_level(referrer, new_domain_level)
        newt = change_domain_level(target, new_domain_level)
        per_referrer = clicks.setdefault(newr, {})
        per_referrer[newt] = per_referrer.get(newt, 0) + num_clicks

    # The csv writer emits its own line terminator, so the file must not
    # translate newlines again (otherwise Windows gets "\r\r\n").  Python 3
    # spells that newline=""; Python 2 spells it binary mode.
    if sys.version_info[0] >= 3:
        destf = open(dest, "w", newline="")
    else:
        destf = open(dest, "wb")
    with destf:
        writer = csv.writer(destf, delimiter="\t")
        for referrer, targets in clicks.items():
            for target, total in targets.items():
                writer.writerow([referrer, target, total])
Esempio n. 2
0
def domain_levels(hosts):
	"""Return the set of distinct ``domain_level(host)`` values over ``hosts``."""
	# A set already ignores duplicates, so every result can be fed straight in.
	return set(domain_level(entry) for entry in hosts)
Esempio n. 3
0
def should_skip_host(h):
	"""Decide whether host ``h`` should be skipped.

	Returns True when ``is_ip_address(h)`` is truthy, when ``domain_level(h)``
	is at most 1, or when ``nth_level_domain(h, dl)`` is a member of
	UNWANTED_DOMAINS for some ``dl`` in DOMAIN_LEVELS.  Otherwise the result
	of ``fnmatches_multiple(UNWANTED_PATTERNS, h)`` is returned as-is.
	"""
	if is_ip_address(h) or domain_level(h) <= 1:
		return True
	# Try the host truncated to each level listed in DOMAIN_LEVELS.
	if any(nth_level_domain(h, level) in UNWANTED_DOMAINS for level in DOMAIN_LEVELS):
		return True
	return fnmatches_multiple(UNWANTED_PATTERNS, h)
Esempio n. 4
0
def change_domain_levels(src, dest, new_domain_level):
	"""Re-aggregate the click counts in ``src`` at ``new_domain_level``.

	Rows are loaded with ``read_vm_file(src)`` and unpacked as
	``(referrer, target, num_clicks)``.  A row is dropped when either host
	is ``None``, when ``num_clicks`` is ``-1``, or when ``domain_level()``
	reports 1 for either host.  The remaining hosts are passed through
	``change_domain_level(host, new_domain_level)`` and the click counts of
	rows that end up with the same (referrer, target) pair are summed.

	The totals are written to ``dest`` as tab-separated
	``referrer, target, clicks`` rows.  Nothing is returned.
	"""
	import sys

	# One pre-formatted argument prints the same text whether ``print`` is
	# the Python 2 statement or the Python 3 function.
	print("Processing %s" % (src,))

	clicks = {}
	for referrer, target, num_clicks in read_vm_file(src):
		# The None / -1 checks come first so that domain_level() is never
		# called on a missing host.
		if referrer is None or target is None or num_clicks == -1:
			continue
		if domain_level(referrer) == 1 or domain_level(target) == 1:
			continue
		newr = change_domain_level(referrer, new_domain_level)
		newt = change_domain_level(target, new_domain_level)
		per_referrer = clicks.setdefault(newr, {})
		per_referrer[newt] = per_referrer.get(newt, 0) + num_clicks

	# The csv writer emits its own line terminator, so the file must not
	# translate newlines again (otherwise Windows gets "\r\r\n").  Python 3
	# spells that newline=""; Python 2 spells it binary mode.
	if sys.version_info[0] >= 3:
		destf = open(dest, 'w', newline='')
	else:
		destf = open(dest, 'wb')
	with destf:
		writer = csv.writer(destf, delimiter="\t")
		for referrer, targets in clicks.items():
			for target, total in targets.items():
				writer.writerow([referrer, target, total])
Esempio n. 5
0
	"deco*.slides.com",
	"*.zynga.com",
	"fb.*.com",
	"app.*.com",
	"apps.*.com",
	"img.*.com",
	"static.*",
	"lax-v*",
	"syndication.*",
	"*nyadmcncserve*",
	"facebook.*"
])

# Distinct domain_level() values found among the UNWANTED_DOMAINS entries;
# should_skip_host() iterates these when checking a host against that set.
domain_levels = set(domain_level(host) for host in UNWANTED_DOMAINS)
DOMAIN_LEVELS = frozenset(domain_levels)

def should_skip_host(h):
	"""Report whether host ``h`` is one that should be skipped.

	The checks run in this order and stop at the first hit:
	``is_ip_address(h)`` is truthy; ``domain_level(h)`` is 1 or lower;
	``nth_level_domain(h, dl)`` belongs to UNWANTED_DOMAINS for any ``dl``
	in DOMAIN_LEVELS.  Each of those returns True.  If none applies, the
	value of ``fnmatches_multiple(UNWANTED_PATTERNS, h)`` is returned
	unchanged.
	"""
	if is_ip_address(h) or domain_level(h) <= 1:
		return True
	matches_unwanted = any(
		nth_level_domain(h, level) in UNWANTED_DOMAINS
		for level in DOMAIN_LEVELS
	)
	if matches_unwanted:
		return True
	return fnmatches_multiple(UNWANTED_PATTERNS, h)

def remove_unwanted(src, dest):