Beispiel #1
0
def churn(adset_file, results_dir):
	'''INTERFACE: Churn is the number of ads per number of trials.

	For every user in the parsed config, the ads of each trial are merged
	into a running union. After each trial one row is recorded holding the
	trial number, the size of the union so far, and the per-type counts
	returned by adLib.ad_types_count (formatted by types_count_str). Each
	user's rows are preceded by an all-zero row for "0 trials".

	Files written into results_dir:
		<user>.txt: adOps.get_ads_str() of that user's final ad union.
		churn.txt: Tab-separated table with header User/Trials/All/D/R/X.

	Args:
		adset_file: Config file like "accounts.cf".
		results_dir: Directory path to save experiment results.
	'''
	file_set_lists = adParser.parse_conf(adset_file)
	ad_truth = adLib.true_ds_of_ads("dbs/adTruth.db")
	# Rows are collected in a list and joined once at the end, instead of
	# growing one big string with "+=" on every trial.
	rows = ["User\tTrials\tAll\tD\tR\tX\n"]
	make_dir(results_dir)

	for user in file_set_lists:
		ad_list = []
		rows.append("\t".join([user, "0", "0", "0", "0", "0"]) + "\n")

		# Trial numbers in the table are 1-based.
		for trial, file_set in enumerate(file_set_lists[user], 1):
			trial_ads = adParser.parse_html_set(file_set)
			ad_list = adOps.union([ad_list, trial_ads])
			type_counts = adLib.ad_types_count(ad_list, ad_truth)
			rows.append("\t".join([
				user,
				str(trial),
				str(len(ad_list)),
				types_count_str(type_counts),
			]) + "\n")

		# "with" guarantees the handle is closed even if writing fails.
		with open(results_dir + "/" + user + ".txt", "w") as fd:
			fd.write(adOps.get_ads_str(ad_list))

	with open(results_dir + "/churn.txt", "w") as fd:
		fd.write("".join(rows))
Beispiel #2
0
def parse_html_set(html_set):
	'''Collect the ads found in a group of HTML files into one union.

	Entries that are not regular files, or whose MIME type as reported by
	magic does not contain "text/html", are skipped.

	Args:
		html_set: Any iterable collection of HTML files.

	Returns:
		List of ad objects (the result of adOps.union over the per-file
		parse results).
	'''
	def looks_like_html(path):
		# The file check comes first so magic is never asked about a path
		# that is not a regular file.
		if not os.path.isfile(path):
			return False
		return "text/html" in magic.from_file(path, mime=True)

	per_file_ads = [parse_html(path) for path in html_set
					if looks_like_html(path)]
	return adOps.union(per_file_ads)
Beispiel #3
0
def dump_all_ads(conf_file, results_dir):
	'''INTERFACE: Dump all ads in base trials and cumulative shadow trials.

	For trials 0..90 of the account "ccloudauditor10":
		- the ads parsed from the account's own file set for that trial are
		  dumped to <results_dir>/base/base_<i>.txt;
		- the ads parsed from adParser.get_file_set(file_sets, i, account)
		  are merged into a running union, which is dumped to
		  <results_dir>/shadow/shadow_<i>.txt.
	A progress line "<trial> <base> <shadow> <cumulative>" with the three
	list sizes is printed per trial.

	Args:
		conf_file: Config file like "accounts.cf".
		results_dir: Directory into which the ads are to be dumped into.
	'''
	# NOTE(review): the account name and the trial count (91) are hard-coded
	# for one specific experiment; confirm before reusing with other configs.
	account = "ccloudauditor10"

	# Honour the caller's config file instead of a hard-coded "accounts.cf".
	file_sets = adParser.parse_conf(conf_file)
	shadow_ads = []

	base_dir = results_dir + "/base"
	shadow_dir = results_dir + "/shadow"
	make_dir(base_dir)
	make_dir(shadow_dir)

	# Single-argument print(...) emits the same text under Python 2 and 3.
	print("Trial Base Shadow Cumulative")
	for i in range(0, 91):
		base_ads = adParser.parse_html_set(file_sets[account][i])
		adLib.dump_ads(base_ads, base_dir + "/base_" + str(i) + ".txt")

		shadow_file_set = adParser.get_file_set(file_sets, i, account)
		sads = adParser.parse_html_set(shadow_file_set)
		shadow_ads = adOps.union([shadow_ads, sads])
		adLib.dump_ads(shadow_ads, shadow_dir + "/shadow_" + str(i) + ".txt")

		print("%s %s %s %s" % (i, len(base_ads), len(sads), len(shadow_ads)))
Beispiel #4
0
def compare_accounts(adset_file, results_dir):
	'''INTERFACE: Compare a "base" and "other" account to see which of the ads
	in "base" are found in "other".

	For each base trial, the trials of "other" are merged one by one into a
	running union, and that union is subtracted from the base trial's ads.
	A row is added to the result table whenever the number of remaining
	(not yet found) base ads changes, or at the last "other" trial if no row
	has been added for this base trial yet. The inner loop stops early once
	no base ads remain.

	Each row holds: base trial index, base ad count and type counts, "other"
	trial index, remaining count and type counts, common count and type
	counts (common = base - remaining, per type as well).

	The table is written to <results_dir>/results.txt. If the config lacks a
	"base" or "other" entry, an error is printed and nothing is written.

	Args:
		adset_file: Config file like "accounts.cf" specifying "base" and "other".
		results_dir: Directory path to save experiment results.
	'''
	file_sets = adParser.parse_conf(adset_file)
	ad_truth = adLib.true_ds_of_ads("dbs/adTruth.db")

	if "base" not in file_sets or "other" not in file_sets:
		# Single-argument print(...) emits the same text under Python 2 and 3.
		print("ERROR: %s doesn't specify base and other accounts."
				% (adset_file,))
		return

	base_file_sets = file_sets["base"]
	other_file_sets = file_sets["other"]
	last_other = len(other_file_sets) - 1

	make_dir(results_dir)
	# Rows are collected in a list and joined once when writing the file.
	rows = ["Base\tCount\tDs\tRs\tXs\tOther\tNF\tDs\tRs\tXs\tCommon\tDs\tRs\tXs\n"]

	for b, base_file_set in enumerate(base_file_sets):
		base_ads = adParser.parse_html_set(base_file_set)
		base_count = len(base_ads)
		base_tc = adLib.ad_types_count(base_ads, ad_truth)
		other_ads = []
		prev_diff = base_count
		printed = False

		for o, other_file_set in enumerate(other_file_sets):
			other_ads = adOps.union([other_ads,
									adParser.parse_html_set(other_file_set)])
			# base_ads shrinks to the ads not seen in "other" so far.
			base_ads = adOps.difference(base_ads, other_ads)
			diff = len(base_ads)
			diff_tc = adLib.ad_types_count(base_ads, ad_truth)

			common = base_count - diff
			common_tc = {}
			for key in base_tc:
				common_tc[key] = base_tc[key] - diff_tc[key]

			# Report on every change, and make sure each base trial gets at
			# least one row by reporting at the final "other" trial.
			if (not printed and o == last_other) or diff != prev_diff:
				printed = True
				rows.append("\t".join([
					str(b),
					str(base_count),
					types_count_str(base_tc),
					str(o),
					str(diff),
					types_count_str(diff_tc),
					str(common),
					types_count_str(common_tc),
				]) + "\n")

			prev_diff = diff

			# Every base ad has been found; later trials cannot change this.
			if diff == 0:
				break

	# "with" guarantees the handle is closed even if writing fails.
	with open(results_dir + "/results.txt", "w") as fd:
		fd.write("".join(rows))