def churn(adset_file, results_dir):
    '''INTERFACE: Churn is the number of ads per number of trials.

    For every user listed in the config, the ads parsed from each trial are
    merged (via adOps.union) into a running per-user ad list. After each
    trial one row is recorded holding the user, the 1-based trial number, the
    size of the running ad list and the type counts formatted by
    types_count_str (presumably the D/R/X columns of the header -- confirm
    against types_count_str).

    Output files written into results_dir:
        <user>.txt: adOps.get_ads_str() of the user's final merged ad list.
        churn.txt: the tab-separated churn table for all users.

    Args:
        adset_file: Config file like "accounts.cf".
        results_dir: Directory path to save experiment results.
    '''
    file_set_lists = adParser.parse_conf(adset_file)
    ad_truth = adLib.true_ds_of_ads("dbs/adTruth.db")
    make_dir(results_dir)

    # Rows are collected in a list and joined once at the end instead of
    # growing one string with += inside the nested loops.
    rows = ["User\tTrials\tAll\tD\tR\tX\n"]

    for user in file_set_lists:
        ad_list = []
        # Baseline row: zero trials seen, zero ads of any type.
        rows.append("\t".join([user, "0", "0", "0", "0", "0"]) + "\n")

        # Trials are numbered from 1 in the report.
        for trial, file_set in enumerate(file_set_lists[user], 1):
            trial_ads = adParser.parse_html_set(file_set)
            ad_list = adOps.union([ad_list, trial_ads])
            type_counts = adLib.ad_types_count(ad_list, ad_truth)
            rows.append("\t".join([user, str(trial), str(len(ad_list)),
                                   types_count_str(type_counts)]) + "\n")

        # 'with' guarantees the handle is closed (and flushed) even if the
        # write raises.
        with open(results_dir + "/" + user + ".txt", "w") as fd:
            fd.write(adOps.get_ads_str(ad_list))

    with open(results_dir + "/churn.txt", "w") as fd:
        fd.write("".join(rows))
def parse_html_set(html_set):
    '''Parse every HTML file in a collection and merge the ads found.

    An entry is parsed only if it is an existing regular file and the MIME
    type reported by magic.from_file contains "text/html"; all other entries
    are silently skipped.

    Args:
        html_set: Any iterable collection of HTML files.

    Returns:
        List of ad objects: adOps.union applied to the per-file results of
        parse_html.
    '''
    def looks_like_html(path):
        # Check existence first so magic is only asked about real files.
        if not os.path.isfile(path):
            return False
        return "text/html" in magic.from_file(path, mime=True)

    per_file_ads = [parse_html(path) for path in html_set
                    if looks_like_html(path)]
    return adOps.union(per_file_ads)
def dump_all_ads(conf_file, results_dir, base_account="ccloudauditor10",
                 num_trials=91):
    '''INTERFACE: Dump all ads in base trials and cumulative shadow trials.

    For each trial index i in [0, num_trials):
      * the base account's ads for trial i are dumped to
        <results_dir>/base/base_<i>.txt;
      * the ads parsed from adParser.get_file_set(file_sets, i, base_account)
        are merged (adOps.union) into a running shadow list, which is dumped
        to <results_dir>/shadow/shadow_<i>.txt.
    A progress line "<trial> <base> <shadow> <cumulative>" with the three ad
    counts is printed per trial.

    Args:
        conf_file: Config file like "accounts.cf".
        results_dir: Directory into which the ads are to be dumped into.
        base_account: Key of the base account in the parsed config. Defaults
            to the account that was previously hard-coded.
        num_trials: Number of trials to process. Defaults to the previously
            hard-coded 91; the config must contain at least this many trials
            for base_account.
    '''
    # The conf_file argument is honoured here; earlier code ignored it and
    # always read "accounts.cf".
    file_sets = adParser.parse_conf(conf_file)
    shadow_ads = []
    make_dir(results_dir + "/base")
    make_dir(results_dir + "/shadow")

    # Single-string print(...) produces identical output whether print is
    # the Python 2 statement or the print function.
    print("Trial Base Shadow Cumulative")

    for i in range(num_trials):
        base_ads = adParser.parse_html_set(file_sets[base_account][i])
        adLib.dump_ads(base_ads,
                       results_dir + "/base/base_" + str(i) + ".txt")

        # NOTE(review): get_file_set is given the base account name;
        # presumably it selects the shadow accounts' files for trial i --
        # confirm against adParser.
        shadow_file_set = adParser.get_file_set(file_sets, i, base_account)
        sads = adParser.parse_html_set(shadow_file_set)
        shadow_ads = adOps.union([shadow_ads, sads])
        adLib.dump_ads(shadow_ads,
                       results_dir + "/shadow/shadow_" + str(i) + ".txt")

        counts = (i, len(base_ads), len(sads), len(shadow_ads))
        print(" ".join(str(value) for value in counts))
def compare_accounts(adset_file, results_dir):
    '''INTERFACE: Compare a "base" and "other" account to see which of the ads
    in "base" are found in "other".

    For each base trial, the "other" trials are merged one at a time (via
    adOps.union) and removed from the base trial's ads (via
    adOps.difference). A row is appended to the report whenever the number of
    still-unmatched base ads changes, and at the last "other" trial if no row
    has been written yet for this base trial. Scanning a base trial stops as
    soon as no unmatched base ads remain.

    Row columns (tab separated): base trial index, base ad count, base type
    counts, "other" trial index, unmatched count ("NF"), unmatched type
    counts, matched count ("Common"), matched type counts. Type counts are
    formatted by types_count_str.

    The report is written to <results_dir>/results.txt. If the config lacks a
    "base" or an "other" entry, an error is printed and nothing is written.

    Args:
        adset_file: Config file like "accounts.cf" specifying "base" and
            "other".
        results_dir: Directory path to save experiment results.
    '''
    file_sets = adParser.parse_conf(adset_file)
    ad_truth = adLib.true_ds_of_ads("dbs/adTruth.db")

    if "base" not in file_sets or "other" not in file_sets:
        # Single-string print(...) yields the same text under the Python 2
        # print statement and the print function.
        print("ERROR: %s doesn't specify base and other accounts."
              % (adset_file,))
        return

    base_file_sets = file_sets["base"]
    other_file_sets = file_sets["other"]
    last_other = len(other_file_sets) - 1

    make_dir(results_dir)

    # Rows are collected and joined once rather than grown with +=.
    rows = ["Base\tCount\tDs\tRs\tXs\tOther\tNF\tDs\tRs\tXs\tCommon\tDs\tRs\tXs\n"]

    for b, base_file_set in enumerate(base_file_sets):
        base_ads = adParser.parse_html_set(base_file_set)
        base_count = len(base_ads)
        base_tc = adLib.ad_types_count(base_ads, ad_truth)

        other_ads = []
        prev_diff = base_count
        printed = False

        for o, other_file_set in enumerate(other_file_sets):
            other_ads = adOps.union(
                [other_ads, adParser.parse_html_set(other_file_set)])
            # From here on base_ads holds only the base ads not yet seen in
            # any of the "other" trials processed so far.
            base_ads = adOps.difference(base_ads, other_ads)
            diff = len(base_ads)
            diff_tc = adLib.ad_types_count(base_ads, ad_truth)

            common = base_count - diff
            common_tc = {}
            for key in base_tc:
                common_tc[key] = base_tc[key] - diff_tc[key]

            # Report only when the unmatched count changed, but guarantee at
            # least one row per base trial by reporting at the final "other"
            # trial if nothing was reported before.
            if (not printed and o == last_other) or diff != prev_diff:
                printed = True
                rows.append("\t".join([
                    str(b), str(base_count), types_count_str(base_tc),
                    str(o), str(diff), types_count_str(diff_tc),
                    str(common), types_count_str(common_tc)]) + "\n")
                prev_diff = diff

            # Every base ad has been matched; later "other" trials cannot
            # change the outcome.
            if diff == 0:
                break

    # 'with' guarantees the handle is closed (and flushed) even if the write
    # raises.
    with open(results_dir + "/results.txt", "w") as fd:
        fd.write("".join(rows))