def update_groundtruth(original_expected,
        original_u_text, original_u_dom,
        original_g_text, original_g_dom,
        add_expected, add_u_text, add_u_dom,
        add_g_text, add_g_dom,
        out_expected, out_u_text, out_u_dom,
        out_g_text, out_g_dom):
    """
    Add all newly expected sites to an existing groundtruth.

    The sites named in add_expected but not in original_expected form the
    "diff set". For each of the five data sets (expected, user text,
    user dom, google text, google dom) the diff-set sites are written to
    an intermediate "<add file>.temp" file, which is then merged with the
    matching original file and written to the matching out_* file.

    @parameter
    original_*: filenames of the current groundtruth ObservedSites
    add_expected, add_u_text, add_u_dom: filenames of ObservedSites
        holding the candidate additions
    add_g_text, add_g_dom: text files listing one ObservedSites filename
        per line (blank lines are ignored)
    out_*: filenames the merged results are written to
    """
    in_e = CD.ObservedSites()
    read_proto_from_file(in_e, original_expected)
    # The user/google originals are loaded because their config is copied
    # onto the merged outputs below.
    in_u_t = CD.ObservedSites()
    read_proto_from_file(in_u_t, original_u_text)
    in_u_d = CD.ObservedSites()
    read_proto_from_file(in_u_d, original_u_dom)
    in_g_t = CD.ObservedSites()
    read_proto_from_file(in_g_t, original_g_text)
    in_g_d = CD.ObservedSites()
    read_proto_from_file(in_g_d, original_g_dom)
    add_e = CD.ObservedSites()
    read_proto_from_file(add_e, add_expected)

    in_e_set = sites_name_set(in_e)
    add_e_set = sites_name_set(add_e)
    diff_e_set = add_e_set - in_e_set
    logger = logging.getLogger("global")
    logger.info("size of original set: {0}, size of add set: {1}, size of diff set: {2}".format(
        len(in_e_set), len(add_e_set), len(diff_e_set)))
    logger.info("diff set is")
    logger.info(diff_e_set)

    # Keep only the diff-set sites of every "add" input.
    _output_sample_sites(diff_e_set, [add_expected], add_expected + ".temp")
    _output_sample_sites(diff_e_set, [add_u_text], add_u_text + ".temp")
    _output_sample_sites(diff_e_set, [add_u_dom], add_u_dom + ".temp")
    # The google inputs are list files; read them with a context manager so
    # the handles are closed, and build real lists of the non-empty lines.
    with open(add_g_text, 'r') as list_file:
        add_g_text_fs = [name for name in list_file.read().split('\n')
                         if name]
    with open(add_g_dom, 'r') as list_file:
        add_g_dom_fs = [name for name in list_file.read().split('\n')
                        if name]
    _output_sample_sites(diff_e_set, add_g_text_fs, add_g_text + ".temp")
    _output_sample_sites(diff_e_set, add_g_dom_fs, add_g_dom + ".temp")

    out_expected_sites = merge_observed_sites(
        [original_expected, add_expected + ".temp"])
    out_u_t_sites = merge_observed_sites(
        [original_u_text, add_u_text + ".temp"])
    out_u_d_sites = merge_observed_sites(
        [original_u_dom, add_u_dom + ".temp"])
    out_g_t_sites = merge_observed_sites(
        [original_g_text, add_g_text + ".temp"])
    out_g_d_sites = merge_observed_sites(
        [original_g_dom, add_g_dom + ".temp"])
    # The merged user/google outputs take the config of the original
    # files; the expected output's config is left as merged.
    out_u_t_sites.config.CopyFrom(in_u_t.config)
    out_u_d_sites.config.CopyFrom(in_u_d.config)
    out_g_t_sites.config.CopyFrom(in_g_t.config)
    out_g_d_sites.config.CopyFrom(in_g_d.config)

    write_proto_to_file(out_expected_sites, out_expected)
    write_proto_to_file(out_u_t_sites, out_u_text)
    write_proto_to_file(out_u_d_sites, out_u_dom)
    write_proto_to_file(out_g_t_sites, out_g_text)
    write_proto_to_file(out_g_d_sites, out_g_dom)
def sample(text_filenames, outfile, sample_size):
    """
    Randomly pick sample_size observed sites from the user text files and
    write the matching user/google, text/dom samples.

    @parameter
    text_filenames: user text observed sites filenames; the dom and google
        filenames are derived from them with _replace_list_by
    outfile: prefix of the four output files
        (.user.sample.text, .user.sample.dom,
         .google.sample.text, .google.sample.dom)
    sample_size: number of sites (not observations) to sample
    """
    dom_filenames = _replace_list_by(text_filenames, 'text', 'dom')
    google_text_filenames = _replace_list_by(text_filenames, 'user', 'google')
    google_dom_filenames = _replace_list_by(dom_filenames, 'user', 'google')

    merged_text_sites = merge_observed_sites(text_filenames)
    candidates = list(merged_text_sites.site)
    landing_urls = set(
        observation.landing_url
        for site in candidates
        for observation in site.observation)
    logger = logging.getLogger("global")
    logger.info("there are {0} urls".format(len(landing_urls)))
    logger.info("there are {0} observed sites".format(len(candidates)))

    # sample_size counts sites; the number of observations in the sample
    # can be larger than this.
    random.shuffle(candidates)
    chosen = candidates[0:sample_size]
    chosen_names = [site.name for site in chosen]

    user_text_sample = CD.ObservedSites()
    user_text_sample.config.CopyFrom(merged_text_sites.config)
    for site in chosen:
        user_text_sample.site.add().CopyFrom(site)
    write_proto_to_file(user_text_sample, outfile + ".user.sample.text")

    # The remaining three samples contain the same site names.
    remaining_outputs = (
        (dom_filenames, ".user.sample.dom"),
        (google_text_filenames, '.google.sample.text'),
        (google_dom_filenames, '.google.sample.dom'),
    )
    for source_filenames, suffix in remaining_outputs:
        _output_sample_sites(chosen_names, source_filenames, outfile + suffix)
def load_split_observed_sites(filename):
    """
    Load observed sites from filename, or from its split parts.

    If filename exists it is read directly. Otherwise the parts are looked
    up by replacing 'list' in filename with 'list_0', 'list_1', ... until
    the first missing part, and the parts found are merged.

    @parameter
    filename: observed sites filename
    @return
    the loaded (or merged) observed sites
    """
    if os.path.exists(filename):
        whole = CD.ObservedSites()
        read_proto_from_file(whole, filename)
        return whole

    parts = []
    index = 0
    candidate = filename.replace('list', 'list_' + str(index))
    while os.path.exists(candidate):
        parts.append(candidate)
        index += 1
        candidate = filename.replace('list', 'list_' + str(index))
    return merge_observed_sites(parts)
def _output_sample_sites(original_label_list, filenames, outfile):
    """
    Write the selected sites (either google or user) to outfile.

    @parameter
    original_label_list: names of the selected websites
    filenames: observed sites filenames, merged before selection
    outfile: output filename
    @return
    new_label_list: the labels from original_label_list that were found in
    the merged observed sites. Use it as the updated label list when the
    merged sites do not contain every requested label.
    """
    merged_sites = merge_observed_sites(filenames)
    site_by_name = dict((site.name, site) for site in merged_sites.site)

    selected_sites = CD.ObservedSites()
    if not merged_sites.HasField("config"):
        print("There is no config in the observed_sites, please double check why")
        print("This can only happen to expected sites")
        print(filenames)
    else:
        selected_sites.config.CopyFrom(merged_sites.config)

    # Labels missing from the merged sites are dropped.
    new_label_list = [label for label in original_label_list
                      if label in site_by_name]
    for label in new_label_list:
        selected_sites.site.add().CopyFrom(site_by_name[label])
    write_proto_to_file(selected_sites, outfile)

    o_size, n_size = len(original_label_list), len(new_label_list)
    if o_size != n_size:
        print("size of the original label list is: {0}".format(o_size))
        print("size of the new label list is: {0}".format(n_size))
    return new_label_list
def update_groundtruth_redundant(count, original_expected,
        original_u_text, original_u_dom,
        original_g_text, original_g_dom,
        add_expected, add_all,
        out_expected, out_u_text, out_u_dom,
        out_g_text, out_g_dom):
    """
    Add up to count randomly chosen new sites to an existing groundtruth.

    The sites named in add_expected but not in original_expected form the
    "diff set". The diff set is shuffled and its first count names are
    kept, then reduced to the names found in the user text files listed
    in add_all. The surviving sites are written to intermediate ".temp"
    files, merged with the matching original files and written to the
    out_* files.

    @parameter
    count: maximum number of sites to add, must be an int
    original_*: filenames of the current groundtruth ObservedSites
    add_expected: filename of the expected ObservedSites to add from
    add_all: text file listing one user text ObservedSites filename per
        line (blank lines are ignored); the dom and google filenames are
        derived from these with _replace_list_by
    out_*: filenames the merged results are written to
    """
    valid_instance(count, int)
    in_e = CD.ObservedSites()
    read_proto_from_file(in_e, original_expected)
    # The user/google originals are loaded because their config is copied
    # onto the merged outputs below.
    in_u_t = CD.ObservedSites()
    read_proto_from_file(in_u_t, original_u_text)
    in_u_d = CD.ObservedSites()
    read_proto_from_file(in_u_d, original_u_dom)
    in_g_t = CD.ObservedSites()
    read_proto_from_file(in_g_t, original_g_text)
    in_g_d = CD.ObservedSites()
    read_proto_from_file(in_g_d, original_g_dom)
    add_e = CD.ObservedSites()
    read_proto_from_file(add_e, add_expected)

    in_e_set = sites_name_set(in_e)
    add_e_set = sites_name_set(add_e)
    diff_e_set = add_e_set - in_e_set
    logger = logging.getLogger("global")
    logger.info("size of original set: {0}, size of add set: {1}, size of diff set: {2}".format(
        len(in_e_set), len(add_e_set), len(diff_e_set)))
    logger.info("diff set is")
    logger.info(diff_e_set)
    diff_e_list = list(diff_e_set)
    logger.info(len(diff_e_list))
    random.shuffle(diff_e_list)
    diff_e_sample = diff_e_list[:count]

    # Keep only the sampled sites that are present in the de-duplicated
    # user text files. This is necessary because some cloaking sites are
    # removed in the de-dup phase (the reason is not known).
    # The list file is read with a context manager so the handle is closed,
    # and into a real list because it is reused several times below.
    with open(add_all, 'r') as list_file:
        add_u_text_fs = [name for name in list_file.read().split('\n')
                         if name]
    diff_e_sample = set(_output_sample_sites(
        diff_e_sample, add_u_text_fs, add_all + ".u.text.temp"))

    # Use the updated diff expected set to generate the new data.
    _output_sample_sites(diff_e_sample, [add_expected],
                         add_expected + ".temp")
    add_u_dom_fs = _replace_list_by(add_u_text_fs, 'text', 'dom')
    _output_sample_sites(diff_e_sample, add_u_dom_fs,
                         add_all + ".u.dom.temp")
    add_g_text_fs = _replace_list_by(add_u_text_fs, 'user', 'google')
    _output_sample_sites(diff_e_sample, add_g_text_fs,
                         add_all + ".g.text.temp")
    add_g_dom_fs = _replace_list_by(add_u_dom_fs, 'user', 'google')
    _output_sample_sites(diff_e_sample, add_g_dom_fs,
                         add_all + ".g.dom.temp")

    out_expected_sites = merge_observed_sites(
        [original_expected, add_expected + ".temp"])
    out_u_t_sites = merge_observed_sites(
        [original_u_text, add_all + ".u.text.temp"])
    out_u_d_sites = merge_observed_sites(
        [original_u_dom, add_all + ".u.dom.temp"])
    out_g_t_sites = merge_observed_sites(
        [original_g_text, add_all + ".g.text.temp"])
    out_g_d_sites = merge_observed_sites(
        [original_g_dom, add_all + ".g.dom.temp"])
    # The merged user/google outputs take the config of the original
    # files; the expected output's config is left as merged.
    out_u_t_sites.config.CopyFrom(in_u_t.config)
    out_u_d_sites.config.CopyFrom(in_u_d.config)
    out_g_t_sites.config.CopyFrom(in_g_t.config)
    out_g_d_sites.config.CopyFrom(in_g_d.config)

    write_proto_to_file(out_expected_sites, out_expected)
    write_proto_to_file(out_u_t_sites, out_u_text)
    write_proto_to_file(out_u_d_sites, out_u_dom)
    write_proto_to_file(out_g_t_sites, out_g_text)
    write_proto_to_file(out_g_d_sites, out_g_dom)
def _read_nonempty_lines(path):
    """Return the non-empty lines of the text file at path, closing it."""
    with open(path, 'r') as list_file:
        return [line for line in list_file.read().split('\n') if line]


def _stdin_lines():
    """Return the lines of stdin without their trailing newline."""
    # rstrip('\n') instead of line[:-1]: a final line that has no newline
    # must not lose its last character.
    return [line.rstrip('\n') for line in sys.stdin]


def main(argv):
    """
    Command line entry point: parse argv and run the requested function.

    @parameter
    argv: command line arguments without the program name. -f/--function
        selects what to run; the other options (-p, -o, -i, -t, -m, -l,
        -s, -a, -c) are passed to the selected function. Options that
        are not given default to None (avg_dist defaults to False).

    Prints the help message and exits when the options cannot be parsed
    (status 2), when -h is given, when no function is given, or when the
    function name is unknown (status 2). When -i is given, logging is
    configured to write to "<inputfile>_running_log_<function>".
    """
    # Help text is kept verbatim, including its historical spellings.
    help_msg = (
        "data_util.py -f <function> [-p <prefix>][-p <prefix> -o <outfile>]"
        "[-i <inputfile> -t <proto_type>][-o <outfile>]"
        "[-i <site_list> -l <server_link> -o <outdir> -m <mode>]"
        "[-i <inputfile>-o <outfile> -s <simhash_type> -t <proto_type>]"
        "[-i <inputfile> -o <outfile> -s <simhash_type> -t <proto_type> -a] "
        "[-o <outfile>] [-i <inputfile> -o <outfile>] [-i <inputfile>] "
        "[-i <text_filt>] [-i <inputfile> -c <count> -o <outfile>] "
        "[-o <outfile>] [-i <inputfile> -l <leanredfile> -o <outfile>], "
        "valid functions are append_prefix, compute_list, show_proto, "
        "intersect_sites, collect_observations, plot_simhash, "
        "plot_sim_distance, get_domains, get_domain_scores, domain_filter, "
        "dedup, sample, merge_sites, get_learned_eval, "
        "[-i <table_name> -o <outfie>] export_db "
        "[-i <inputfile> -o <outfile>] de_noise "
        "[-i <inputfile> -c <count>] update_groundtruth "
        "[-i <user observation list, suffix removed>] merge_user_sites")
    try:
        # "count=" needs the trailing '=' so that --count takes a value,
        # like its short form -c.
        opts, _ = getopt.getopt(argv, "hf:p:o:t:i:m:l:s:ac:",
                ["function=", "prefix=", "outfile=", "proto_type=",
                 "ifile=", "mode=", "link=", "simhash_type=",
                 "avg_dist", "count="])
    except getopt.GetoptError:
        print(help_msg)
        sys.exit(2)

    # Every option gets a default so that a branch testing or forwarding
    # an option that was not given does not fail with a NameError.
    function = None
    prefix = None
    outfile = None
    inputfile = None
    proto_type = None
    mode = None
    link = None
    simhash_type = None
    avg_dist = False
    count = None
    for opt, arg in opts:
        if opt == "-h":
            print(help_msg)
            sys.exit()
        elif opt in ("-f", "--function"):
            function = arg
        elif opt in ("-p", "--prefix"):
            prefix = arg
        elif opt in ("-o", "--outfile"):
            outfile = arg
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-t", "--proto_type"):
            proto_type = arg
        elif opt in ("-m", "--mode"):
            mode = arg
        elif opt in ("-l", "--link"):
            link = arg
        elif opt in ("-s", "--simhash_type"):
            simhash_type = arg
        elif opt in ("-a", "--avg_dist"):
            avg_dist = True
        elif opt in ("-c", "--count"):
            count = arg
        else:
            print(help_msg)
            sys.exit(2)

    # Check for the function first: the log filename below is built from it.
    if function is None:
        print(help_msg)
        sys.exit()
    if inputfile is not None:
        logging.basicConfig(
            filename=inputfile + "_running_log_" + function,
            level=logging.DEBUG)

    if function == "append_prefix":
        append_prefix(_stdin_lines(), prefix)
    elif function == "compute_list":
        compute_list(_stdin_lines(), outfile, prefix)
    elif function == "show_proto":
        show_proto(inputfile, proto_type)
    elif function == "intersect_sites":
        result_sites = intersect_observed_sites(*_stdin_lines())
        write_proto_to_file(result_sites, outfile)
        evaluation_form(outfile, outfile + ".eval", "ObservedSites")
    elif function == "collect_observations":
        if link:
            util.REMOTE_DRIVER = link
        site_set = set(_read_nonempty_lines(inputfile))
        # For this function -o names the output directory.
        collect_site_for_plot(site_set, outfile, mode)
    elif function == "plot_simhash":
        if not outfile:
            outfile = inputfile + ".plot_cluster"
        plot_simhash(inputfile, outfile, simhash_type, proto_type)
    elif function == "plot_sim_distance":
        if not outfile:
            outfile = inputfile + ".plot_sim_distance"
        plot_sim_distance(inputfile, outfile, simhash_type, proto_type,
                          avg_dist)
    elif function == "get_domains":
        get_domains(_stdin_lines(), outfile)
    elif function == "get_domain_scores":
        domain_scores(_read_nonempty_lines(inputfile), outfile)
    elif function == "domain_filter":
        # Three steps for computed sites:
        # 1. filter known benign
        # 2. de-duplicate
        # 3. sample $count number of sites
        # This branch runs get_bad on every listed file.
        bar_points = 60
        for filename in _read_nonempty_lines(inputfile):
            get_bad(bar_points, filename, filename + ".filt")
    elif function == "dedup":
        total_sites = 0
        for filename in _read_nonempty_lines(inputfile):
            # Only user text files are expected here; ask before going on
            # with anything that looks different.
            if (('text' not in filename) or ('google' in filename)
                    or ('dom' in filename)):
                response = interact_query(
                    "The input file doesn't seem to be valid! "
                    "Press [Yes/No] to continue or exit!")
                if not response:
                    sys.exit(0)
            total_sites += dedup(filename)
        logger = logging.getLogger("global")
        logger.info("total sites after dedup: {0}".format(total_sites))
    elif function == "sample":
        text_filenames = _read_nonempty_lines(inputfile)
        sample(text_filenames, outfile, int(count))
        evaluation_form(outfile + '.user.sample.text',
                        outfile + ".user.sample.text.eval", "ObservedSites")
        evaluation_form(outfile + '.google.sample.text',
                        outfile + ".google.sample.text.eval",
                        "ObservedSites")
    elif function == "merge_sites":
        observed_sites = merge_observed_sites(_stdin_lines())
        logger = logging.getLogger("global")
        logger.info("total sites after merge: {0}".format(
            len(observed_sites.site)))
        write_proto_to_file(observed_sites, outfile)
    elif function == "merge_user_sites":
        # -i input_file: list of user observation filenames with the
        # '.text' / '.dom' suffix removed.
        filenames = _read_nonempty_lines(inputfile)
        text_filenames = [filename + '.text' for filename in filenames]
        dom_filenames = [filename + '.dom' for filename in filenames]
        logger = logging.getLogger("global")
        text_observed_sites = merge_observed_sites(text_filenames)
        logger.info("total sites after merge: {0}".format(
            len(text_observed_sites.site)))
        write_proto_to_file(text_observed_sites, inputfile + '.text')
        dom_observed_sites = merge_observed_sites(dom_filenames)
        logger.info("total sites after merge: {0}".format(
            len(dom_observed_sites.site)))
        write_proto_to_file(dom_observed_sites, inputfile + '.dom')
    elif function == "get_learned_eval":
        # -l learned_file -i detected_file
        result_sites = get_learned_eval(link, inputfile)
        write_proto_to_file(result_sites, outfile)
        evaluation_form(outfile, outfile + ".eval", "LearnedSites")
    elif function == "export_db":
        # -i table_name -o outfile
        export_db_to_file(inputfile, outfile)
        export_db_to_file(inputfile, outfile + ".noise", ["PageBroken"])
    elif function == "de_noise":
        # Remove noise: index.html not found, feature count = 0.
        if "learn" in inputfile:
            response = interact_query(
                "The input file seems to be learned sites, we only "
                "support observed sites! "
                "Press [Yes/No] to continue or exit!")
            if not response:
                sys.exit(0)
        logger = logging.getLogger("global")
        logger.info("processing {0}".format(inputfile))
        de_noise_config = CD.DeNoiseConfig()
        de_noise_config.zero_feature = True
        original = CD.ObservedSites()
        read_proto_from_file(original, inputfile)
        observed_sites = de_noise(original, de_noise_config)
        logger.info("before de-noise {0}".format(len(original.site)))
        logger.info("after de-noise: {0}".format(len(observed_sites.site)))
        # Without -o the input file is overwritten.
        outfile = outfile if outfile else inputfile
        write_proto_to_file(observed_sites, outfile)
    elif function == "update_groundtruth":
        # This function is too specific. It is to add more malicious
        # examples to the collected groundtruth.
        filenames = _read_nonempty_lines(inputfile)
        if len(filenames) == 15:
            # Listed in parameter order: the five original files, the five
            # files to add, then the five output files. All new sites are
            # added, so -c is not needed here.
            update_groundtruth(*filenames)
        elif len(filenames) == 12:
            # Listed in parameter order: the five original files,
            # add_expected, add_all, then the five output files.
            update_groundtruth_redundant(int(count), *filenames)
        else:
            raise Exception("Cannot handle now!")
    else:
        print(help_msg)
        sys.exit(2)