Code Example #1
def update_groundtruth(original_expected,
		original_u_text, original_u_dom, original_g_text, original_g_dom, 
		add_expected, add_u_text, add_u_dom, add_g_text, add_g_dom, 
		out_expected, out_u_text, out_u_dom, out_g_text, out_g_dom):
	in_e = CD.ObservedSites()
	read_proto_from_file(in_e, original_expected)
	in_u_t = CD.ObservedSites()
	read_proto_from_file(in_u_t, original_u_text)
	in_u_d = CD.ObservedSites()
	read_proto_from_file(in_u_d, original_u_dom)
	in_g_t = CD.ObservedSites()
	read_proto_from_file(in_g_t, original_g_text)
	in_g_d = CD.ObservedSites()
	read_proto_from_file(in_g_d, original_g_dom)

	# the google-side add inputs (add_g_text, add_g_dom) are files that
	# list observed-sites filenames; the user-side inputs are single files
	add_e = CD.ObservedSites()
	read_proto_from_file(add_e, add_expected)
	#add_u_t = CD.ObservedSites()
	#read_proto_from_file(add_u_t, add_u_text)
	#add_u_d = CD.ObservedSites()
	#read_proto_from_file(add_u_d, add_u_dom)
	#add_g_t = merge_observed_sites(add_g_text)
	#add_g_d = merge_observed_sites(add_g_dom)
	in_e_set = sites_name_set(in_e)
	add_e_set = sites_name_set(add_e)
	diff_e_set = add_e_set - in_e_set
	logger = logging.getLogger("global")
	logger.info("size of original set: {0}, size of add set: {1}, size of diff set: {2}".format(
		len(in_e_set), len(add_e_set), len(diff_e_set)))
	logger.info("diff set is")	
	logger.info(diff_e_set)

	_output_sample_sites(diff_e_set, [add_expected], add_expected + ".temp")
	_output_sample_sites(diff_e_set, [add_u_text], add_u_text + ".temp")
	_output_sample_sites(diff_e_set, [add_u_dom], add_u_dom + ".temp")
	add_g_text_fs = filter(bool, open(add_g_text, 'r').read().split('\n'))
	add_g_dom_fs = filter(bool, open(add_g_dom, 'r').read().split('\n'))
	_output_sample_sites(diff_e_set, add_g_text_fs, add_g_text + ".temp")
	_output_sample_sites(diff_e_set, add_g_dom_fs, add_g_dom + ".temp")
	out_expected_sites = merge_observed_sites([original_expected,
		add_expected + ".temp"])
	out_u_t_sites = merge_observed_sites([original_u_text, add_u_text + ".temp"])
	out_u_d_sites = merge_observed_sites([original_u_dom, add_u_dom + ".temp"])
	out_g_t_sites = merge_observed_sites([original_g_text, add_g_text + ".temp"])
	out_g_d_sites = merge_observed_sites([original_g_dom, add_g_dom + ".temp"])
	out_u_t_sites.config.CopyFrom(in_u_t.config)
	out_u_d_sites.config.CopyFrom(in_u_d.config)
	out_g_t_sites.config.CopyFrom(in_g_t.config)
	out_g_d_sites.config.CopyFrom(in_g_d.config)
	write_proto_to_file(out_expected_sites, out_expected)
	write_proto_to_file(out_u_t_sites, out_u_text)
	write_proto_to_file(out_u_d_sites, out_u_dom)
	write_proto_to_file(out_g_t_sites, out_g_text)
	write_proto_to_file(out_g_d_sites, out_g_dom)
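
A minimal usage sketch for the function above. It assumes the protobuf helpers used in this module (CD, read_proto_from_file, merge_observed_sites) are importable, and every filename below is a made-up placeholder:

# Hypothetical invocation: fold newly labeled examples into the existing
# groundtruth. add_g_text/add_g_dom are files listing protobuf filenames;
# the other add_* arguments are single protobuf files.
update_groundtruth(
	"gt/expected.sites",
	"gt/user.text.sites", "gt/user.dom.sites",
	"gt/google.text.sites", "gt/google.dom.sites",
	"add/expected.sites",
	"add/user.text.sites", "add/user.dom.sites",
	"add/google.text.list", "add/google.dom.list",
	"out/expected.sites",
	"out/user.text.sites", "out/user.dom.sites",
	"out/google.text.sites", "out/google.dom.sites")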
Code Example #2
def sample(text_filenames, outfile, sample_size):
	dom_filenames = _replace_list_by(text_filenames, 'text', 'dom')
	google_text_filenames = _replace_list_by(text_filenames, 'user',
			'google')
	google_dom_filenames = _replace_list_by(dom_filenames, 'user', 'google')

	text_observed_sites = merge_observed_sites(text_filenames)
	observed_site_list = list()
	url_set = set()
	for observed_site in text_observed_sites.site:
		observed_site_list.append(observed_site)
		for observation in observed_site.observation:
			url_set.add(observation.landing_url)
	logger = logging.getLogger("global")
	logger.info("there are {0} urls".format(len(url_set)))
	logger.info("there are {0} observed sites".format(len(observed_site_list)))
	random.shuffle(observed_site_list)
	# sample_size is the number of sites; the total number of observations
	# may be larger than this.
	sample_sites = CD.ObservedSites()
	sample_sites.config.CopyFrom(text_observed_sites.config)
	sample_list = observed_site_list[0:sample_size]
	original_label_list = [observed_site.name for observed_site in sample_list]
	for observed_site in sample_list:
		sample_site = sample_sites.site.add()
		sample_site.CopyFrom(observed_site)
	sample_filename = outfile + ".user.sample.text"
	write_proto_to_file(sample_sites, sample_filename)


	_output_sample_sites(original_label_list, dom_filenames, outfile + ".user.sample.dom")
	_output_sample_sites(original_label_list, google_text_filenames,
			outfile + '.google.sample.text')
	_output_sample_sites(original_label_list, google_dom_filenames, outfile
			+ '.google.sample.dom')
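
A minimal usage sketch, assuming the input filenames contain the substrings 'user' and 'text' (the dom and google counterparts are derived by substring replacement); all names are hypothetical:

# Hypothetical invocation: sample 100 sites from the merged user text files.
text_filenames = ["crawl/user.part0.text", "crawl/user.part1.text"]
sample(text_filenames, "gt/sample", 100)
# writes gt/sample.user.sample.text, gt/sample.user.sample.dom,
# gt/sample.google.sample.text and gt/sample.google.sample.dom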
Code Example #3
def load_split_observed_sites(filename):
	if not os.path.exists(filename):
		# split files are named by replacing 'list' with 'list_<index>'
		count = 0
		split_files = list()
		while True:
			split_file = filename.replace('list', 'list_' +
					str(count))
			if not os.path.exists(split_file):
				break
			split_files.append(split_file)
			count += 1
		observed_sites = merge_observed_sites(split_files)
	else:
		observed_sites = CD.ObservedSites()
		read_proto_from_file(observed_sites, filename)
	return observed_sites
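
A small sketch of the naming convention this loader assumes (the filename is hypothetical):

# If "observed.list" exists, it is read directly; otherwise the loader
# merges "observed.list_0", "observed.list_1", ... up to the first
# missing index.
observed_sites = load_split_observed_sites("observed.list")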
Code Example #4
def _output_sample_sites(original_label_list, filenames, outfile):
	"""
	Output the sample sites, either google or user

	@parameter
	oringinal_label_list: the selected websites
	filenames: observed sites filenames
	outfile: output filename
	@return
	If observed_sites from filenames doesn't contain all urls from original
	label list, use the return value new_label_list to get the updated
	label list.
	"""
	observed_sites = merge_observed_sites(filenames)
	observed_sites_map = dict()
	for observed_site in observed_sites.site:
		observed_sites_map[observed_site.name] = observed_site
	sample_sites = CD.ObservedSites()
	if observed_sites.HasField("config"):
		sample_sites.config.CopyFrom(observed_sites.config)
	else:
		print "There is no config in the observed_sites, please double check why"
		print "This can only happen to expected sites"
		print filenames
	sample_list = list()
	new_label_list = list()
	for label in original_label_list:
		if label in observed_sites_map:
			sample_list.append(observed_sites_map[label])
			new_label_list.append(label)
	for observed_site in sample_list:
		sample_site = sample_sites.site.add()
		sample_site.CopyFrom(observed_site)
	write_proto_to_file(sample_sites, outfile)
	o_size = len(original_label_list)
	n_size = len(new_label_list)
	if o_size != n_size:
		print "size of the original label list is: {0}".format(o_size)
		print "size of the new label list is: {0}".format(n_size)
	return new_label_list
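
A minimal usage sketch with hypothetical labels and filenames:

# Keep only the sites whose name is in the label list; the return value
# tells us which labels were actually found in the merged input.
labels = ["site-a.com", "site-b.com"]
kept = _output_sample_sites(labels,
		["crawl/user.part0.dom", "crawl/user.part1.dom"],
		"out/user.sample.dom")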
Code Example #5
def update_groundtruth_redundant(count, original_expected,
		original_u_text, original_u_dom,
		original_g_text, original_g_dom,
		add_expected, add_all,
		out_expected, out_u_text, out_u_dom,
		out_g_text, out_g_dom):
	valid_instance(count, int)
	in_e = CD.ObservedSites()
	read_proto_from_file(in_e, original_expected)
	in_u_t = CD.ObservedSites()
	read_proto_from_file(in_u_t, original_u_text)
	in_u_d = CD.ObservedSites()
	read_proto_from_file(in_u_d, original_u_dom)
	in_g_t = CD.ObservedSites()
	read_proto_from_file(in_g_t, original_g_text)
	in_g_d = CD.ObservedSites()
	read_proto_from_file(in_g_d, original_g_dom)
	add_e = CD.ObservedSites()
	read_proto_from_file(add_e, add_expected)

	in_e_set = sites_name_set(in_e)
	add_e_set = sites_name_set(add_e)
	diff_e_set = add_e_set - in_e_set
	logger = logging.getLogger("global")
	logger.info("size of original set: {0}, size of add set: {1}, size of diff set: {2}".format(
		len(in_e_set), len(add_e_set), len(diff_e_set)))
	logger.info("diff set is")	
	logger.info(diff_e_set)
	diff_e_list = list(diff_e_set)
	logger.info(len(diff_e_list))
	random.shuffle(diff_e_list)
	diff_e_sample = diff_e_list[:count]
	

	"""
	get the sites that are in "de-deduplicated" examples and add them
	this is necessary, because there are sites, that are cloaking, but
	remove in de-dup phase. Doesn't know why.
	"""
	add_u_text_fs = filter(bool, open(add_all, 'r').read().split('\n'))
	diff_e_sample = set(_output_sample_sites(diff_e_sample, add_u_text_fs, add_all
		+ ".u.text.temp"))

	# use the updated diff expected set, to generate the new data
	_output_sample_sites(diff_e_sample, [add_expected], add_expected + ".temp")
	add_u_dom_fs = _replace_list_by(add_u_text_fs, 'text', 'dom')
	_output_sample_sites(diff_e_sample, add_u_dom_fs, add_all + ".u.dom.temp")
	add_g_text_fs = _replace_list_by(add_u_text_fs, 'user', 'google')
	_output_sample_sites(diff_e_sample, add_g_text_fs, add_all + ".g.text.temp")
	add_g_dom_fs = _replace_list_by(add_u_dom_fs, 'user', 'google')
	_output_sample_sites(diff_e_sample, add_g_dom_fs, add_all + ".g.dom.temp")
	out_expected_sites = merge_observed_sites([original_expected, add_expected + ".temp"])
	out_u_t_sites = merge_observed_sites([original_u_text, add_all + ".u.text.temp"])
	out_u_d_sites = merge_observed_sites([original_u_dom, add_all + ".u.dom.temp"])
	out_g_t_sites = merge_observed_sites([original_g_text, add_all +
		".g.text.temp"])
	out_g_d_sites = merge_observed_sites([original_g_dom, add_all +
		".g.dom.temp"])
	out_u_t_sites.config.CopyFrom(in_u_t.config)
	out_u_d_sites.config.CopyFrom(in_u_d.config)
	out_g_t_sites.config.CopyFrom(in_g_t.config)
	out_g_d_sites.config.CopyFrom(in_g_d.config)
	write_proto_to_file(out_expected_sites, out_expected)
	write_proto_to_file(out_u_t_sites, out_u_text)
	write_proto_to_file(out_u_d_sites, out_u_dom)
	write_proto_to_file(out_g_t_sites, out_g_text)
	write_proto_to_file(out_g_d_sites, out_g_dom)
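
A minimal usage sketch for the redundant variant, with hypothetical paths; add_all is a file listing the user text filenames, from which the dom and google counterparts are derived by substring replacement:

# Hypothetical invocation: sample at most 50 of the newly added sites.
update_groundtruth_redundant(50,
	"gt/expected.sites",
	"gt/user.text.sites", "gt/user.dom.sites",
	"gt/google.text.sites", "gt/google.dom.sites",
	"add/expected.sites", "add/user.text.list",
	"out/expected.sites",
	"out/user.text.sites", "out/user.dom.sites",
	"out/google.text.sites", "out/google.dom.sites")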
Code Example #6
def main(argv):
	has_function = False
	help_msg = """data_util.py -f <function> [-p <prefix>][-p <prefix> -o
	<outfile>][-i <inputfile> -t <proto_type>][-o <outfile>][-i <site_list>
	-l <server_link> -o <outdir> -m <mode>][-i <inputfile>-o <outfile> -s
	<simhash_type> -t <proto_type>][-i <inputfile> -o <outfile> -s
	<simhash_type> -t <proto_type> -a] [-o <outfile>] [-i <inputfile> -o
	<outfile>] [-i <inputfile>] [-i <text_filt>] [-i <inputfile> -c <count>
	-o <outfile>] [-o <outfile>] [-i <inputfile> -l <leanredfile> -o <outfile>], valid functions are
	append_prefix, compute_list, show_proto, intersect_sites,
	collect_observations, plot_simhash, plot_sim_distance, get_domains,
	get_domain_scores, domain_filter, dedup, sample, merge_sites,
	get_learned_eval, [-i <table_name> -o <outfie>] export_db
	[-i <inputfile> -o <outfile>] de_noise
	[-i <inputfile> -c <count>] update_groundtruth
	[-i <user observation list, suffix removed>] merge_user_sites"""
	try:
		opts, args = getopt.getopt(argv, "hf:p:o:t:i:m:l:s:ac:",
				["function=", "prefix=", "outfile=",
					"proto_type=", "ifile=", "mode=",
					"link=", "simhash_type=", "avg_dist",
					"count"])
	except getopt.GetoptError:
		print help_msg
		sys.exit(2)
	hasinputfile = False
	outfile = None
	avg_dist = False
	# -l is optional for collect_observations; initialize to avoid a NameError
	link = None
	for opt, arg in opts:
		if opt == "-h":
			print help_msg
			sys.exit()
		elif opt in ("-f", "--function"):
			function = arg
			has_function = True
		elif opt in ("-p", "--prefix"):
			prefix = arg
		elif opt in ("-o", "--outfile"):
			outfile = arg
		elif opt in ("-i", "--ifile"):
			inputfile = arg
			hasinputfile = True
		elif opt in ("-t", "--proto_type"):
			proto_type = arg
		elif opt in ("-m", "--mode"):
			mode = arg
		elif opt in ("-l", "--link"):
			link = arg
		elif opt in ("-s", "--simhash_type"):
			simhash_type = arg
		elif opt in ("-a", "--avg_dist"):
			avg_dist = True
		elif opt in ("-c", "--count"):
			count = arg
		else:
			print help_msg
			sys.exit(2)
	if hasinputfile:
		logging.basicConfig(filename=inputfile + "_running_log_" + function, level=logging.DEBUG)
		logging.getLogger("global")
	if not has_function:
		print help_msg
		sys.exit()
	if function == "append_prefix":
		inputfile_list = [line[:-1] for line in sys.stdin]
		append_prefix(inputfile_list, prefix)
	elif function == "compute_list":
		crawl_log_list = [line[:-1] for line in sys.stdin]
		compute_list(crawl_log_list, outfile, prefix)
	elif function == "show_proto":
		show_proto(inputfile, proto_type)
	elif function == "intersect_sites":
		observed_sites_list = [line[:-1] for line in sys.stdin]
		result_sites = intersect_observed_sites(*observed_sites_list)
		write_proto_to_file(result_sites, outfile)
		evaluation_form(outfile, outfile + ".eval", "ObservedSites")
	elif function == "collect_observations":
		if link:
			util.REMOTE_DRIVER = link
		site_list = filter(bool, open(inputfile, 'r').read().split('\n'))
		site_set = set(site_list)
		outdir = outfile
		collect_site_for_plot(site_set, outdir, mode)
	elif function == "plot_simhash":
		if not outfile:
			outfile = inputfile + ".plot_cluster"
		plot_simhash(inputfile, outfile, simhash_type, proto_type)
	elif function == "plot_sim_distance":
		if not outfile:
			outfile = inputfile + ".plot_sim_distance"
		plot_sim_distance(inputfile, outfile, simhash_type, proto_type,
				avg_dist)
	elif function == "get_domains":
		observed_sites_list = [line[:-1] for line in sys.stdin]
		get_domains(observed_sites_list, outfile)
	elif function == "get_domain_scores":
		domains = filter(bool, open(inputfile, 'r').read().split('\n'))
		result = domain_scores(domains, outfile)
	elif function == "domain_filter":
		"""
		Three steps for computed sites.
		1. filter known benign
		2. de-duplicate
		3. sample $count number of sites
		"""
		bar_points = 60
		observed_sites_list = filter(bool, open(inputfile, 'r').read().split('\n'))
		for filename in observed_sites_list:
			get_bad(bar_points, filename, filename + ".filt")
	elif function == "dedup":
		text_filenames = filter(bool, open(inputfile, 'r').read().split('\n'))
		count = 0
		for filename in text_filenames:
			if ((not 'text' in filename) or ('google' in filename) or
					('dom' in filename)):
				response = interact_query("The input file doesn't seem to \
						be valid! Press [Yes/No] to continue or exit!")
				if not response:
					sys.exit(0)
			count += dedup(filename)

		logger = logging.getLogger("global")
		logger.info("total sites after dedup: {0}".format(count))
	elif function == "sample":
		text_filenames = filter(bool, open(inputfile, 'r').read().split('\n'))
		sample(text_filenames, outfile, int(count))
		evaluation_form(outfile + '.user.sample.text', outfile +
				".user.sample.text.eval", "ObservedSites")
		evaluation_form(outfile + '.google.sample.text', outfile +
				".google.sample.text.eval", "ObservedSites")
	elif function == "merge_sites":
		observed_sites_names = [line[:-1] for line in sys.stdin]
		observed_sites = merge_observed_sites(observed_sites_names)
		logger = logging.getLogger("global")
		logger.info("total sites after merge: {0}".format(len(observed_sites.site)))
		write_proto_to_file(observed_sites, outfile)
	elif function == "merge_user_sites":
		"""
		-i input_file
		"""
		filenames = filter(bool, open(inputfile, 'r').read().split('\n'))
		text_filenames = [filename + '.text' for filename in filenames]
		dom_filenames = [filename + '.dom' for filename in filenames]
		text_observed_sites = merge_observed_sites(text_filenames)
		logger = logging.getLogger("global")
		logger.info("total sites after merge: {0}".format(len(text_observed_sites.site)))
		write_proto_to_file(text_observed_sites, inputfile + '.text')
		dom_observed_sites = merge_observed_sites(dom_filenames)
		logger.info("total sites after merge: {0}".format(len(dom_observed_sites.site)))
		write_proto_to_file(dom_observed_sites, inputfile + '.dom')
	elif function == "get_learned_eval":
		"""
		-l learned_file -i detected_file
		"""
		learned_file = link
		observed_file = inputfile
		result_sites = get_learned_eval(learned_file, observed_file)
		write_proto_to_file(result_sites, outfile)
		evaluation_form(outfile, outfile + ".eval", "LearnedSites")
	elif function == "export_db":
		"""
		-i table_name -o outfile
		"""
		export_db_to_file(inputfile, outfile)
		export_db_to_file(inputfile, outfile + ".noise", ["PageBroken"])
	elif function == "de_noise":
		"""
		remove noise: index.html not found, feature count = 0
		"""
		if "learn" in inputfile:
			response = interact_query("The input file seems to \
					be learned sites, we only support observed \
					sites! Press [Yes/No] to continue or exit!")
			if not response:
				sys.exit(0)

		logger = logging.getLogger("global")
		logger.info("processing {0}".format(inputfile))
		de_noise_config = CD.DeNoiseConfig()
		de_noise_config.zero_feature = True
		original = CD.ObservedSites()
		read_proto_from_file(original, inputfile)
		observed_sites = de_noise(original, de_noise_config)
		logger.info("before de-noise {0}".format(len(original.site)))
		logger.info("after de-noise: {0}".format(len(observed_sites.site)))
		outfile = outfile if outfile else inputfile
		write_proto_to_file(observed_sites, outfile)
	elif function == "update_groundtruth":
		"""
		This function is too specific. It is to add more malicious
		examples to the collected groundtruth.
		"""
		filenames = filter(bool, open(inputfile, 'r').read().split('\n'))
		if len(filenames) == 15:
			original_expected = filenames[0]
			original_u_text = filenames[1]
			original_u_dom = filenames[2]
			original_g_text = filenames[3]
			original_g_dom = filenames[4]
			# observed sites may share the same URL.
			add_count = count
			add_expected = filenames[5]
			add_u_text = filenames[6]
			add_u_dom = filenames[7]
			add_g_text = filenames[8]
			add_g_dom = filenames[9]
			# outfile
			out_expected = filenames[10]
			out_u_text = filenames[11]
			out_u_dom = filenames[12]
			out_g_text = filenames[13]
			out_g_dom = filenames[14]
			# in this case we will add all
			update_groundtruth(original_expected,
					original_u_text, original_u_dom,
					original_g_text, original_g_dom,
					add_expected, add_u_text, add_u_dom,
					add_g_text, add_g_dom,
					out_expected, out_u_text, out_u_dom,
					out_g_text, out_g_dom)
		elif len(filenames) == 12:
			original_expected = filenames[0]
			original_u_text = filenames[1]
			original_u_dom = filenames[2]
			original_g_text = filenames[3]
			original_g_dom = filenames[4]
			# observed sites may share the same URL.
			add_count = int(count)
			add_expected = filenames[5]
			add_all = filenames[6]
			'''
			add_u_text = filenames[6]
			add_u_dom = filenames[7]
			add_g_text = filenames[8]
			add_g_dom = filenames[9]
			# outfile
			out_expected = filenames[10]
			out_u_text = filenames[11]
			out_u_dom = filenames[12]
			out_g_text = filenames[13]
			out_g_dom = filenames[14]
			'''
			out_expected = filenames[7]
			out_u_text = filenames[8]
			out_u_dom = filenames[9]
			out_g_text = filenames[10]
			out_g_dom = filenames[11]

			update_groundtruth_redundant(add_count, original_expected,
					original_u_text, original_u_dom,
					original_g_text, original_g_dom,
					add_expected, add_all,
					out_expected, out_u_text, out_u_dom,
					out_g_text, out_g_dom)
		else:
			raise Exception("Cannot handle now!")
	else:
		print help_msg
		sys.exit(2)
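
For reference, a few hypothetical command lines for this driver (all file names are placeholders):

# python data_util.py -f dedup -i user_text_file_list
# python data_util.py -f sample -i user_text_file_list -c 100 -o gt/sample
# python data_util.py -f de_noise -i observed.sites -o observed.denoised
# python data_util.py -f update_groundtruth -i groundtruth_file_list -c 50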