Code Example #1
import wot  # project-local helper that wraps the WOT reputation service

def get_domain_reputation(domain_list, bar_points,
		domain_database='../../data/all.computed/all_domains.scores'):
	# When no database file is given, query the WOT service, cache the
	# scores under the default path, then retry against that cache.
	if not domain_database:
		print "Querying WOT service..."
		domain_database = '../../data/all.computed/all_domains.scores'
		wot.domain_scores(domain_list, domain_database)
		return get_domain_reputation(domain_list, bar_points)
	else:
		# Each database line is "domain,score1,score2,..."; a domain's
		# reputation is the sum of its numeric score fields.
		score_dict = dict()
		domain_scores = filter(bool, open(domain_database, 'r').read().split('\n'))
		for domain_score in domain_scores:
			fields = domain_score.split(',')
			score_dict[fields[0]] = sum(int(s) for s in fields[1:])
		bad_domain = set()
		unknown_domain = set()
		for domain in domain_list:
			if domain in score_dict:
				if score_dict[domain] <= bar_points:
					bad_domain.add(domain)
			else:
				unknown_domain.add(domain)
		if len(unknown_domain) > 0:
			# Domains missing from the local database: query WOT for them
			# directly and append their scores for future runs.
			print "Querying WOT service"
			bad_domain = bad_domain | set(wot.filt(unknown_domain, bar_points))
			wot.domain_scores(unknown_domain, domain_database)
		return bad_domain
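
Judging by the parser above, each line of the score database has the form "example.com,55,12": the domain, then one or more numeric score fields whose sum is the domain's reputation. A minimal usage sketch with hypothetical domains, reusing the 60-point threshold that the driver in Code Example #2 applies:

# hypothetical inputs; 60 points matches the driver's domain_filter branch
suspicious = get_domain_reputation(['example.com', 'phish.example.net'], 60)
print suspicious  # the subset of input domains scoring at or below 60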
Code Example #2
# Standard-library imports used below; the project-local modules (util, CD)
# and the dispatched helper functions are assumed to be defined or imported
# elsewhere in this file.
import getopt
import logging
import sys

def main(argv):
	has_function = False
	help_msg = """data_util.py -f <function> [-p <prefix>][-p <prefix> -o
	<outfile>][-i <inputfile> -t <proto_type>][-o <outfile>][-i <site_list>
	-l <server_link> -o <outdir> -m <mode>][-i <inputfile>-o <outfile> -s
	<simhash_type> -t <proto_type>][-i <inputfile> -o <outfile> -s
	<simhash_type> -t <proto_type> -a] [-o <outfile>] [-i <inputfile> -o
	<outfile>] [-i <inputfile>] [-i <text_filt>] [-i <inputfile> -c <count>
	-o <outfile>] [-o <outfile>] [-i <inputfile> -l <leanredfile> -o <outfile>], valid functions are
	append_prefix, compute_list, show_proto, intersect_sites,
	collect_observations, plot_simhash, plot_sim_distance, get_domains,
	get_domain_scores, domain_filter, dedup, sample, merge_sites,
	get_learned_eval, [-i <table_name> -o <outfie>] export_db
	[-i <inputfile> -o <outfile>] de_noise
	[-i <inputfile> -c <count>] update_groundtruth
	[-i <user observation list, suffix removed>] merge_user_sites"""
	try:
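		# In the getopt spec a trailing ":" (short) or "=" (long) marks an
		# option that takes an argument; -h and -a/--avg_dist are bare flags.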
		opts, args = getopt.getopt(argv, "hf:p:o:t:i:m:l:s:ac:",
				["function=", "prefix=", "outfile=",
					"proto_type=", "ifile=", "mode=",
					"link=", "simhash_type=", "avg_dist",
					"count="])
	except getopt.GetoptError:
		print help_msg
		sys.exit(2)
	hasinputfile = False
	outfile = None
	avg_dist = False
	# Default the remaining optional arguments so later checks such as
	# "if link:" do not raise NameError when the corresponding flag is omitted.
	function = prefix = inputfile = proto_type = None
	mode = link = simhash_type = count = None
	for opt, arg in opts:
		if opt == "-h":
			print help_msg
			sys.exit()
		elif opt in ("-f", "--function"):
			function = arg
			has_function = True
		elif opt in ("-p", "--prefix"):
			prefix = arg
		elif opt in ("-o", "--outfile"):
			outfile = arg
		elif opt in ("-i", "--ifile"):
			inputfile = arg
			hasinputfile = True
		elif opt in ("-t", "--proto_type"):
			proto_type = arg
		elif opt in ("-m", "--mode"):
			mode = arg
		elif opt in ("-l", "--link"):
			link = arg
		elif opt in ("-s", "--simhash_type"):
			simhash_type = arg
		elif opt in ("-a", "--avg_dist"):
			avg_dist = True
		elif opt in ("-c", "--count"):
			count = arg
		else:
			print help_msg
			sys.exit(2)
	if not has_function:
		print help_msg
		sys.exit()
	if hasinputfile:
		# Name the log file after the input file and the chosen function;
		# this reads `function`, so the has_function check must come first.
		logging.basicConfig(filename=inputfile + "_running_log_" + function,
				level=logging.DEBUG)
		logging.getLogger("global")
	if function == "append_prefix":
		inputfile_list = [line.rstrip('\n') for line in sys.stdin]
		append_prefix(inputfile_list, prefix)
	elif function == "compute_list":
		crawl_log_list = [line.rstrip('\n') for line in sys.stdin]
		compute_list(crawl_log_list, outfile, prefix)
	elif function == "show_proto":
		show_proto(inputfile, proto_type)
	elif function == "intersect_sites":
		observed_sites_list = [line.rstrip('\n') for line in sys.stdin]
		result_sites = intersect_observed_sites(*observed_sites_list)
		write_proto_to_file(result_sites, outfile)
		evaluation_form(outfile, outfile + ".eval", "ObservedSites")
	elif function == "collect_observations":
		if link:
			util.REMOTE_DRIVER = link
		site_list = filter(bool, open(inputfile, 'r').read().split('\n'))
		site_set = set(site_list)
		outdir = outfile
		collect_site_for_plot(site_set, outdir, mode)
	elif function == "plot_simhash":
		if not outfile:
			outfile = inputfile + ".plot_cluster"
		plot_simhash(inputfile, outfile, simhash_type, proto_type)
	elif function == "plot_sim_distance":
		if not outfile:
			outfile = inputfile + ".plot_sim_distance"
		plot_sim_distance(inputfile, outfile, simhash_type, proto_type,
				avg_dist)
	elif function == "get_domains":
		observed_sites_list = [line.rstrip('\n') for line in sys.stdin]
		get_domains(observed_sites_list, outfile)
	elif function == "get_domain_scores":
		domains = filter(bool, open(inputfile, 'r').read().split('\n'))
		domain_scores(domains, outfile)
	elif function == "domain_filter":
		"""
		Three steps for computed sites.
		1. filter known benign
		2. de-duplicate
		3. sample $count number of sites
		"""
		bar_points = 60
		observed_sites_list = filter(bool, open(inputfile, 'r').read().split('\n'))
		for filename in observed_sites_list:
			get_bad(bar_points, filename, filename + ".filt")
	elif function == "dedup":
		text_filenames = filter(bool, open(inputfile, 'r').read().split('\n'))
		count = 0  # running total of deduplicated sites; -c is not used here
		for filename in text_filenames:
			# Warn unless the file looks like a user text-observation file.
			if (('text' not in filename) or ('google' in filename) or
					('dom' in filename)):
				response = interact_query("The input file doesn't seem to be "
						"valid! Press [Yes/No] to continue or exit!")
				if not response:
					sys.exit(0)
			count += dedup(filename)

		logger = logging.getLogger("global")
		logger.info("total sites after dedup: {0}".format(count))
	elif function == "sample":
		text_filenames = filter(bool, open(inputfile, 'r').read().split('\n'))
		sample(text_filenames, outfile, int(count))
		evaluation_form(outfile + '.user.sample.text', outfile +
				".user.sample.text.eval", "ObservedSites")
		evaluation_form(outfile + '.google.sample.text', outfile +
				".google.sample.text.eval", "ObservedSites")
	elif function == "merge_sites":
		observed_sites_names = [line.rstrip('\n') for line in sys.stdin]
		observed_sites = merge_observed_sites(observed_sites_names)
		logger = logging.getLogger("global")
		logger.info("total sites after merge: {0}".format(len(observed_sites.site)))
		write_proto_to_file(observed_sites, outfile)
	elif function == "merge_user_sites":
		"""
		-i input_file
		"""
		filenames = filter(bool, open(inputfile, 'r').read().split('\n'))
		text_filenames = [filename + '.text' for filename in filenames]
		dom_filenames = [filename + '.dom' for filename in filenames]
		text_observed_sites = merge_observed_sites(text_filenames)
		logger = logging.getLogger("global")
		logger.info("total sites after merge: {0}".format(len(text_observed_sites.site)))
		write_proto_to_file(text_observed_sites, inputfile + '.text')
		dom_observed_sites = merge_observed_sites(dom_filenames)
		logger.info("total sites after merge: {0}".format(len(dom_observed_sites.site)))
		write_proto_to_file(dom_observed_sites, inputfile + '.dom')
	elif function == "get_learned_eval":
		"""
		-l learned_file -i detected_file
		"""
		learned_file = link
		observed_file = inputfile
		result_sites = get_learned_eval(learned_file, observed_file)
		write_proto_to_file(result_sites, outfile)
		evaluation_form(outfile, outfile + ".eval", "LearnedSites")
	elif function == "export_db":
		"""
		-i table_name -o outfile
		"""
		export_db_to_file(inputfile, outfile)
		export_db_to_file(inputfile, outfile + ".noise", ["PageBroken"])
	elif function == "de_noise":
		"""
		remove noise: index.html not found, feature count = 0
		"""
		if "learn" in inputfile:
			response = interact_query("The input file seems to \
					be learned sites, we only support observed \
					sites! Press [Yes/No] to continue or exit!")
			if not response:
				sys.exit(0)

		logger = logging.getLogger("global")
		logger.info("processing {0}".format(inputfile))
		de_noise_config = CD.DeNoiseConfig()
		de_noise_config.zero_feature = True
		original = CD.ObservedSites()
		read_proto_from_file(original, inputfile)
		observed_sites = de_noise(original, de_noise_config)
		logger.info("before de-noise {0}".format(len(original.site)))
		logger.info("after de-noise: {0}".format(len(observed_sites.site)))
		outfile = outfile if outfile else inputfile
		write_proto_to_file(observed_sites, outfile)
	elif function == "update_groundtruth":
		"""
		This function is too specific. It is to add more malicious
		examples to the collected groundtruth.
		"""
		filenames = filter(bool, open(inputfile, 'r').read().split('\n'))
		if len(filenames) == 15:
			original_expected = filenames[0]
			original_u_text = filenames[1]
			original_u_dom = filenames[2]
			original_g_text = filenames[3]
			original_g_dom = filenames[4]
			# Observed sites may share the same URL. In this branch all
			# additional examples are added, so the -c <count> option is unused.
			add_expected = filenames[5]
			add_u_text = filenames[6]
			add_u_dom = filenames[7]
			add_g_text = filenames[8]
			add_g_dom = filenames[9]
			# outfile
			out_expected = filenames[10]
			out_u_text = filenames[11]
			out_u_dom = filenames[12]
			out_g_text = filenames[13]
			out_g_dom = filenames[14]
			# in this case we will add all
			update_groundtruth(original_expected,
					original_u_text, original_u_dom,
					original_g_text, original_g_dom,
					add_expected, add_u_text, add_u_dom,
					add_g_text, add_g_dom,
					out_expected, out_u_text, out_u_dom,
					out_g_text, out_g_dom)
		elif len(filenames) == 12:
			original_expected = filenames[0]
			original_u_text = filenames[1]
			original_u_dom = filenames[2]
			original_g_text = filenames[3]
			original_g_dom = filenames[4]
			# observed site may have same URL.
			add_count = int(count)
			add_expected = filenames[5]
			add_all = filenames[6]
			out_expected = filenames[7]
			out_u_text = filenames[8]
			out_u_dom = filenames[9]
			out_g_text = filenames[10]
			out_g_dom = filenames[11]

			update_groundtruth_redundant(add_count, original_expected,
					original_u_text, original_u_dom,
					original_g_text, original_g_dom,
					add_expected, add_all,
					out_expected, out_u_text, out_u_dom,
					out_g_text, out_g_dom)
		else:
			raise Exception("Cannot handle input list of length {0}; "
					"expected 12 or 15 filenames".format(len(filenames)))
	else:
		print help_msg
		sys.exit(2)
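
The listing stops inside main(), so the module entry point is not shown; a minimal, assumed completion for running the script directly:

if __name__ == "__main__":
	main(sys.argv[1:])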