Beispiel #1
0
def count_clicks(src):
    """Return the sum of the click counts of every record in ``src``.

    ``src`` is passed to ``read_vm_file``; each record it produces is
    unpacked as ``(referrer, target, num_clicks)`` and only the last
    field contributes to the result.
    """
    print("Processing %s" % src)
    return sum(clicks for _referrer, _target, clicks in read_vm_file(src))
def read_counts(src):
    """Return the click count of every record in ``src`` as a list.

    The list keeps the order in which ``read_vm_file`` produces the
    ``(referrer, target, num_clicks)`` records.
    """
    print("Processing %s" % src)
    return [clicks for _referrer, _target, clicks in read_vm_file(src)]
def change_domain_levels(src, dest, new_domain_level):
    print "Processing", src
    data = read_vm_file(src)
    clicks = {}
    for referrer, target, num_clicks in data:
        if (
            referrer is None
            or target is None
            or num_clicks == -1
            or domain_level(referrer) == 1
            or domain_level(target) == 1
        ):
            continue
        newr = change_domain_level(referrer, new_domain_level)
        newt = change_domain_level(target, new_domain_level)
        if newr not in clicks:
            clicks[newr] = {}
        if newt not in clicks[newr]:
            clicks[newr][newt] = 0
        clicks[newr][newt] += num_clicks

    with open(dest, "w") as destf:
        writer = csv.writer(destf, delimiter="\t")
        for referrer in clicks:
            for target in clicks[referrer]:
                writer.writerow([referrer, target, clicks[referrer][target]])
def convert_to_json(src, dest, coverage=.3):
    """Write the most-clicked targets of ``src`` to ``dest`` as a JSON tree.

    Click counts from ``read_vm_file(src)`` are summed per target, the
    targets are ranked by descending count, and targets are emitted until
    their cumulative share of all clicks exceeds ``coverage``.

    Args:
        src: Input passed to ``read_vm_file``; records are unpacked as
            ``(referrer, target, num_clicks)``.
        dest: Path of the JSON file to write.
        coverage: Cumulative fraction of all clicks after which no further
            targets are emitted. Defaults to the previously hard-coded .3.

    The output is an object named "entropy" whose "children" are
    ``{"name": <target>, "size": <count>}`` entries. If the summed click
    count is zero, every target is emitted instead of dividing by zero.
    """
    # Function-scope import: the file's import block is not visible here.
    import json as _json

    print("Processing %s" % src)
    click_counts = {}
    for _referrer, target, num_clicks in read_vm_file(src):
        click_counts[target] = click_counts.get(target, 0) + num_clicks

    # Built-in sum avoids numpy's fixed-width integer conversion.
    total_count = float(sum(click_counts.values()))
    ranked = sorted(click_counts.items(), key=lambda pair: pair[1], reverse=True)

    cum_count = 0
    entries = []
    for target, count in ranked:
        cum_count += count
        # json.dumps escapes quotes/backslashes/control characters in the name.
        name = _json.dumps("%s" % (target,))
        entries.append('        {"name": %s, "size": %d}' % (name, count))
        if total_count != 0 and cum_count / total_count > coverage:
            break

    # Joining guarantees there is never a trailing comma after the last entry.
    children = ",\n".join(entries)
    if entries:
        children += "\n"

    with open(dest, 'w') as destf:
        destf.write("""{
    "name": "entropy",
    "children": [
%s
    ]
}
""" % children)
Beispiel #5
0
def convert_to_json(src, dest, coverage=.3):
    """Write the most-clicked targets of ``src`` to ``dest`` as a JSON tree.

    Click counts from ``read_vm_file(src)`` are summed per target, the
    targets are ranked by descending count, and targets are emitted until
    their cumulative share of all clicks exceeds ``coverage``.

    Args:
        src: Input passed to ``read_vm_file``; records are unpacked as
            ``(referrer, target, num_clicks)``.
        dest: Path of the JSON file to write.
        coverage: Cumulative fraction of all clicks after which no further
            targets are emitted. Defaults to the previously hard-coded .3.

    The output is an object named "entropy" whose "children" are
    ``{"name": <target>, "size": <count>}`` entries. If the summed click
    count is zero, every target is emitted instead of dividing by zero.
    """
    # Function-scope import: the file's import block is not visible here.
    import json as _json

    print("Processing %s" % src)
    click_counts = {}
    for _referrer, target, num_clicks in read_vm_file(src):
        click_counts[target] = click_counts.get(target, 0) + num_clicks

    # Built-in sum avoids numpy's fixed-width integer conversion.
    total_count = float(sum(click_counts.values()))
    ranked = sorted(click_counts.items(),
                    key=lambda pair: pair[1],
                    reverse=True)

    cum_count = 0
    entries = []
    for target, count in ranked:
        cum_count += count
        # json.dumps escapes quotes/backslashes/control characters in the name.
        name = _json.dumps("%s" % (target,))
        entries.append('        {"name": %s, "size": %d}' % (name, count))
        if total_count != 0 and cum_count / total_count > coverage:
            break

    # Joining guarantees there is never a trailing comma after the last entry.
    children = ",\n".join(entries)
    if entries:
        children += "\n"

    with open(dest, 'w') as destf:
        destf.write("""{
    "name": "entropy",
    "children": [
%s
    ]
}
""" % children)
Beispiel #6
0
def compute_traffic_volume(src):
    """Return the total number of clicks recorded in ``src``.

    Each record from ``read_vm_file(src)`` is unpacked as
    ``(referrer, target, num_clicks)``; the click counts are added up.
    """
    print("Processing %s" % src)
    records = read_vm_file(src)
    return sum(clicks for _referrer, _target, clicks in records)
def compute_traffic_volume(src):
    """Return the total number of clicks recorded in ``src``.

    Each record from ``read_vm_file(src)`` is unpacked as
    ``(referrer, target, num_clicks)``; the click counts are added up.
    """
    print("Processing %s" % src)
    records = read_vm_file(src)
    return sum(clicks for _referrer, _target, clicks in records)
Beispiel #8
0
def count_clicks(files):
	total = 0
	for src in files:
		print "Processing", src
		data = read_vm_file(src)
		for referrer, target, num_clicks in data:
			total += 1
	return total
Beispiel #9
0
def create_categories(src, dest, hosts):
    """Copy the records of ``src`` whose referrer belongs to ``hosts``.

    Records from ``read_vm_file(src)`` for which
    ``is_member(referrer, hosts)`` is true are written to ``dest`` as
    tab-separated ``referrer, target, num_clicks`` rows.
    """
    print("Processing %s" % dest)
    records = read_vm_file(src)
    with open(dest, 'w') as destf:
        writer = csv.writer(destf, delimiter="\t")
        for referrer, target, num_clicks in records:
            if not is_member(referrer, hosts):
                continue
            writer.writerow([referrer, target, num_clicks])
def filter_targets(src, dest):
    """Copy ``src`` to ``dest`` without the records whose target is skipped.

    A record from ``read_vm_file(src)`` is kept when
    ``should_skip_host(target)`` is false. Kept records are written as
    tab-separated ``referrer, target, num_clicks`` rows.
    """
    print("Processing %s" % src)
    records = read_vm_file(src)
    with open(dest, 'w') as destf:
        # The generator is consumed lazily, so filtering and writing stay
        # interleaved record by record.
        csv.writer(destf, delimiter="\t").writerows(
            [referrer, target, num_clicks]
            for referrer, target, num_clicks in records
            if not should_skip_host(target)
        )
def filter_referrers(src, dest):
    """Copy ``src`` to ``dest`` without the records whose referrer is skipped.

    A record from ``read_vm_file(src)`` is kept when
    ``should_skip_host(referrer)`` is false. Kept records are written as
    tab-separated ``referrer, target, num_clicks`` rows.
    """
    print("Processing %s" % src)
    records = read_vm_file(src)
    with open(dest, 'w') as destf:
        writer = csv.writer(destf, delimiter="\t")
        for referrer, target, num_clicks in records:
            if should_skip_host(referrer):
                continue
            writer.writerow([referrer, target, num_clicks])
Beispiel #12
0
def count_targets(files):
	targets = set()
	for src in files:
		print "Processing", src
		data = read_vm_file(src)
		for referrer, target, num_clicks in data:
			if target not in targets:
				targets.add(target)
	return len(targets)
def filter_news_junk(src, dest):
    """Normalize the targets of ``src`` and drop the ones that are skipped.

    For each record from ``read_vm_file(src)`` the target is replaced by
    ``normalize_url(target)``. The record is written to ``dest`` as a
    tab-separated row unless ``should_skip_host`` is true for the
    normalized target. The referrer is written unchanged.
    """
    print("Processing %s" % src)
    records = read_vm_file(src)
    with open(dest, 'w') as destf:
        writer = csv.writer(destf, delimiter="\t")
        for referrer, raw_target, num_clicks in records:
            cleaned = normalize_url(raw_target)
            if should_skip_host(cleaned):
                continue
            writer.writerow([referrer, cleaned, num_clicks])
Beispiel #14
0
def show_top_targets(src):
    """Return a dict mapping each target in ``src`` to its summed clicks.

    Records from ``read_vm_file(src)`` are unpacked as
    ``(referrer, target, num_clicks)``; the referrer is ignored.
    """
    print("Processing %s" % src)
    totals = {}
    for _referrer, target, num_clicks in read_vm_file(src):
        totals[target] = totals.get(target, 0) + num_clicks
    return totals
Beispiel #15
0
def count_clicks_from_referrer(files, referrers):
	counts = {}
	for src in files:
		print "Processing", src
		data = read_vm_file(src)
		for referrer, target, num_clicks in data:
			if referrer in referrers:
				if not referrer in counts:
					counts[referrer] = 0
				counts[referrer] += 1
	return counts
Beispiel #16
0
def remove_unwanted(src, dest):
    """Normalize both hosts of every record in ``src`` and drop skipped targets.

    Each ``(referrer, target, num_clicks)`` record from
    ``read_vm_file(src)`` has both URLs passed through ``normalize_url``.
    The record is written to ``dest`` as a tab-separated row unless
    ``should_skip_host`` is true for the normalized target.

    Records whose normalization (or skip check) raises are reported on
    stdout and skipped. Errors raised while writing are no longer
    swallowed, and ``KeyboardInterrupt``/``SystemExit`` propagate.
    """
    print("Processing %s" % src)
    data = read_vm_file(src)
    with open(dest, 'w') as destf:
        writer = csv.writer(destf, delimiter="\t")
        for referrer, target, num_clicks in data:
            try:
                r = normalize_url(referrer)
                t = normalize_url(target)
                skip = should_skip_host(t)
            except Exception:
                # Best-effort: a record that cannot be processed is reported
                # and skipped rather than aborting the whole run.
                print("Couldn't normalize. Skipping.")
                print(referrer)
                print(target)
                continue
            # Written outside the try so I/O failures are not misreported
            # as normalization problems.
            if not skip:
                writer.writerow([r, t, num_clicks])
def combine_files(files, destfile):
    """Merge several files into ``destfile``, summing clicks per host pair.

    Every file is read with ``read_vm_file``. Click counts of records
    sharing the same (referrer, target) pair are added together, and one
    tab-separated ``referrer, target, total`` row is written per pair.
    """
    print("Processing %s" % destfile)
    merged = {}
    for path in files:
        for referrer, target, num_clicks in read_vm_file(path):
            per_referrer = merged.setdefault(referrer, {})
            per_referrer[target] = per_referrer.get(target, 0) + num_clicks

    with open(destfile, 'w') as destf:
        writer = csv.writer(destf, delimiter="\t")
        for referrer, per_referrer in merged.items():
            for target, total in per_referrer.items():
                writer.writerow([referrer, target, total])
def combine_files(files, destfile):
    """Merge several files into ``destfile``, summing clicks per host pair.

    Every file is read with ``read_vm_file``. Click counts of records
    sharing the same (referrer, target) pair are added together, and one
    tab-separated ``referrer, target, total`` row is written per pair.
    """
    print("Processing %s" % destfile)
    merged = {}
    for path in files:
        for referrer, target, num_clicks in read_vm_file(path):
            per_referrer = merged.setdefault(referrer, {})
            per_referrer[target] = per_referrer.get(target, 0) + num_clicks

    with open(destfile, 'w') as destf:
        writer = csv.writer(destf, delimiter="\t")
        for referrer, per_referrer in merged.items():
            for target, total in per_referrer.items():
                writer.writerow([referrer, target, total])
Beispiel #19
0
def smooth_vm(files, dest):
    """Sum the click counts of ``files`` per host pair and write them to ``dest``.

    All files are first read with ``read_vm_file``. Click counts of
    records sharing the same (referrer, target) pair are then added
    together, and one tab-separated ``referrer, target, total`` row is
    written per pair.
    """
    print("Processing %s" % dest)
    # Read every input before aggregating.
    loaded = [read_vm_file(path) for path in files]
    summed = {}
    for records in loaded:
        for referrer, target, num_clicks in records:
            per_referrer = summed.setdefault(referrer, {})
            per_referrer[target] = per_referrer.get(target, 0) + num_clicks

    with open(dest, 'w') as destf:
        writer = csv.writer(destf, delimiter="\t")
        for referrer, per_referrer in summed.items():
            for target, total in per_referrer.items():
                writer.writerow([referrer, target, total])
def smooth_vm(files, dest):
    """Sum the click counts of ``files`` per host pair and write them to ``dest``.

    All files are first read with ``read_vm_file``. Click counts of
    records sharing the same (referrer, target) pair are then added
    together, and one tab-separated ``referrer, target, total`` row is
    written per pair.
    """
    print("Processing %s" % dest)
    # Read every input before aggregating.
    loaded = [read_vm_file(path) for path in files]
    summed = {}
    for records in loaded:
        for referrer, target, num_clicks in records:
            per_referrer = summed.setdefault(referrer, {})
            per_referrer[target] = per_referrer.get(target, 0) + num_clicks

    with open(dest, 'w') as destf:
        writer = csv.writer(destf, delimiter="\t")
        for referrer, per_referrer in summed.items():
            for target, total in per_referrer.items():
                writer.writerow([referrer, target, total])
def count_top_targets_per_file(src, dest):
    """Write the targets of ``src`` ranked by clicks, with the remaining share.

    Click counts from ``read_vm_file(src)`` are summed per target and the
    targets are sorted by descending count. For each target one
    tab-separated row ``target, count, remaining`` is written to ``dest``,
    where ``remaining`` is the fraction of all clicks not yet covered by
    this and the preceding rows.

    If the summed click count is zero, ``remaining`` is written as 0.0
    instead of raising ``ZeroDivisionError``.
    """
    print("Processing %s" % src)
    click_counts = {}
    for _referrer, target, num_clicks in read_vm_file(src):
        click_counts[target] = click_counts.get(target, 0) + num_clicks

    # Built-in sum avoids numpy's fixed-width integer conversion.
    total_count = float(sum(click_counts.values()))
    ranked = sorted(click_counts.items(), key=lambda pair: pair[1], reverse=True)

    cum_count = 0
    with open(dest, "w") as destf:
        writer = csv.writer(destf, delimiter="\t")
        for target, count in ranked:
            cum_count += count
            if total_count != 0:
                remaining = (total_count - cum_count) / total_count
            else:
                remaining = 0.0
            writer.writerow([target, count, remaining])
Beispiel #22
0
def index_vms(files):
    """Build a cumulative click index over the merged records of ``files``.

    Click counts from every file (read with ``read_vm_file``) are summed
    per (referrer, target) pair. The result is a list of
    ``(referrer, target, running_total)`` tuples, where ``running_total``
    is the sum of the clicks of this and all preceding pairs in the list.
    Pairs appear in dict iteration order.
    """
    print("Loading index.")
    merged = {}
    for filepath in files:
        for referrer, target, num_clicks in read_vm_file(filepath):
            per_referrer = merged.setdefault(referrer, {})
            per_referrer[target] = per_referrer.get(target, 0) + num_clicks

    index = []
    running_total = 0
    for referrer, per_referrer in merged.items():
        for target, clicks in per_referrer.items():
            running_total += clicks
            index.append((referrer, target, running_total))
    return index
def index_vms(files):
    """Build a cumulative click index over the merged records of ``files``.

    Click counts from every file (read with ``read_vm_file``) are summed
    per (referrer, target) pair. The result is a list of
    ``(referrer, target, running_total)`` tuples, where ``running_total``
    is the sum of the clicks of this and all preceding pairs in the list.
    Pairs appear in dict iteration order.
    """
    print("Loading index.")
    merged = {}
    for filepath in files:
        for referrer, target, num_clicks in read_vm_file(filepath):
            per_referrer = merged.setdefault(referrer, {})
            per_referrer[target] = per_referrer.get(target, 0) + num_clicks

    index = []
    running_total = 0
    for referrer, per_referrer in merged.items():
        for target, clicks in per_referrer.items():
            running_total += clicks
            index.append((referrer, target, running_total))
    return index
Beispiel #24
0
def count_top_targets_per_file(src, dest):
    """Write the targets of ``src`` ranked by clicks, with the remaining share.

    Click counts from ``read_vm_file(src)`` are summed per target and the
    targets are sorted by descending count. For each target one
    tab-separated row ``target, count, remaining`` is written to ``dest``,
    where ``remaining`` is the fraction of all clicks not yet covered by
    this and the preceding rows.

    If the summed click count is zero, ``remaining`` is written as 0.0
    instead of raising ``ZeroDivisionError``.
    """
    print("Processing %s" % src)
    click_counts = {}
    for _referrer, target, num_clicks in read_vm_file(src):
        click_counts[target] = click_counts.get(target, 0) + num_clicks

    # Built-in sum avoids numpy's fixed-width integer conversion.
    total_count = float(sum(click_counts.values()))
    ranked = sorted(click_counts.items(),
                    key=lambda pair: pair[1],
                    reverse=True)

    cum_count = 0
    with open(dest, 'w') as destf:
        writer = csv.writer(destf, delimiter="\t")
        for target, count in ranked:
            cum_count += count
            if total_count != 0:
                remaining = (total_count - cum_count) / total_count
            else:
                remaining = 0.0
            writer.writerow([target, count, remaining])
Beispiel #25
0
def change_domain_levels(src, dest, new_domain_level):
	print "Processing", src
	data = read_vm_file(src)
	clicks = {}
	for referrer, target, num_clicks in data:
		if referrer is None or target is None or num_clicks == -1\
		or domain_level(referrer) == 1 or domain_level(target) == 1:
			continue
		newr = change_domain_level(referrer, new_domain_level)
		newt = change_domain_level(target, new_domain_level)
		if newr not in clicks:
			clicks[newr] = {}
		if newt not in clicks[newr]:
			clicks[newr][newt] = 0
		clicks[newr][newt] += num_clicks
	
	with open(dest, 'w') as destf:
		writer = csv.writer(destf, delimiter="\t")
		for referrer in clicks:
			for target in clicks[referrer]:
				writer.writerow([referrer, target, clicks[referrer][target]])
def sample_vm(src, dest, num_clicks_to_sample):
    """Read ``src`` and delegate the sampling to ``sample_vm_data``.

    The records from ``read_vm_file(src)`` are passed on together with
    ``dest`` and ``num_clicks_to_sample``; whatever ``sample_vm_data``
    returns is returned unchanged.
    """
    records = read_vm_file(src)
    return sample_vm_data(records, dest, num_clicks_to_sample)
def sample_vm(src, dest, num_clicks_to_sample):
    """Read ``src`` and delegate the sampling to ``sample_vm_data``.

    The records from ``read_vm_file(src)`` are passed on together with
    ``dest`` and ``num_clicks_to_sample``; whatever ``sample_vm_data``
    returns is returned unchanged.
    """
    records = read_vm_file(src)
    return sample_vm_data(records, dest, num_clicks_to_sample)