Code example #1
def plot_sim_distance(inputfile, outfile, simhash_type, proto_type,
		avg_dist=True):
	simhash_type = get_simhash_type(simhash_type, True)
	sites = getattr(CD, proto_type)()
	read_proto_from_file(sites, inputfile)
	out_f = open(outfile, "w")
	if proto_type == "LearnedSites":
		for learned_site in sites.site:
			out_f.write(learned_site.name + "," + str(len(learned_site.pattern)) + "\n")
			for pattern in learned_site.pattern:
				dist_list = simhash_vector_distance(pattern.item,
						avg_dist)
				out_f.write("pattern\n" + "\n".join([str(d) for d in
					dist_list]) + "\n")
		out_f.close()
	elif proto_type == "ObservedSites":
		for observed_site in sites.site:
			out_f.write(observed_site.name + "," + str(len(observed_site.observation)) + "\n")
			simhash_item_vector = aggregate_simhash(observed_site, simhash_type)
			dist_list = simhash_vector_distance(simhash_item_vector,
					avg_dist)
			out_f.write("\n".join([str(d) for d in dist_list]) + "\n")
		out_f.close()
	else:
		raise Exception("Wrong proto! Only LearnedSites and ObservedSites can be used!")
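simhash_vector_distance is not shown in this example. A minimal sketch of what such a pairwise distance could look like, assuming the simhash values are 64-bit fingerprints compared by Hamming distance (this helper is hypothetical, not the project's implementation):

def pairwise_hamming_distances(items):
	# Hypothetical helper: pairwise Hamming distance between simhash
	# fingerprints, i.e. the popcount of their XOR. `items` are assumed to
	# carry a .simhash integer field, like pattern.item above.
	values = [item.simhash for item in items]
	return [bin(values[i] ^ values[j]).count("1")
			for i in range(len(values))
			for j in range(i + 1, len(values))]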
Code example #2
def get_bad(bar_points, filename, outfilename):
	observed_sites = CD.ObservedSites()
	read_proto_from_file(observed_sites, filename)
	domain_set = set()

	for site in observed_sites.site:
		for observation in site.observation:
			url_domain = top_domain(observation.landing_url)
			domain_set.add(url_domain)

	domain_list = list(domain_set)
	bad_domains = get_domain_reputation(domain_list, bar_points)
	bad_observed_sites = CD.ObservedSites()
	bad_observed_sites.config.CopyFrom(observed_sites.config)

	for site in observed_sites.site:
		observation_list = list()
		for observation in site.observation:
			if top_domain(observation.landing_url) in bad_domains:
				observation_list.append(observation)
		if len(observation_list) == 0:
			continue
		bad_site = bad_observed_sites.site.add()
		bad_site.name = site.name
		for observation in observation_list:
			to_add = bad_site.observation.add()
			to_add.CopyFrom(observation)

	write_proto_to_file(bad_observed_sites, outfilename)
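A hedged usage sketch for get_bad; the 60-point bar mirrors the domain_filter branch of main() further below, and the file names are placeholders:

# Hypothetical invocation: keep only observations whose landing domain has
# a reputation score below the 60-point bar.
get_bad(60, "crawl_log.text", "crawl_log.text.filt")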
Code example #3
def plot_simhash(inputfile, outfile, simhash_type, proto_type):
	simhash_type = get_simhash_type(simhash_type)
	sites = getattr(CD, proto_type)()
	read_proto_from_file(sites, inputfile)
	out_f = open(outfile, "w")
	if proto_type == "LearnedSites":
		for site in sites.site:
			observation_size = 0
			for pattern in site.pattern:
				for item in pattern.item:
					observation_size += item.count
			out_f.write(site.name + "," + str(observation_size) + "\n")
			for pattern in site.pattern:
				for item in pattern.item:
					item_str = "%0.16x" % item.simhash
					item_str_array = [item_str for i in range(item.count)]
					out_f.write("\n".join(item_str_array) + "\n")
		out_f.close()
	elif proto_type == "ObservedSites":
		for site in sites.site:
			out_f.write(site.name + "," + str(len(site.observation)) + "\n")
			for observation in site.observation:
				simhash_str = "%0.16x" % getattr(observation, simhash_type)
				out_f.write(simhash_str + "\n")
		out_f.close()
	else:
		raise Exception("Wrong proto! Only LearnedSites and ObservedSites can be used!")
Code example #4
def get_domains(observed_sites_list, outfile):
	domain_set = set()
	for filename in observed_sites_list:
		observed_sites = CD.ObservedSites()
		read_proto_from_file(observed_sites, filename)
		for site in observed_sites.site:
			for observation in site.observation:
				url_domain = top_domain(observation.landing_url)
				domain_set.add(url_domain)
	open(outfile, 'w').write("\n".join(domain_set))
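A hedged usage sketch (the file names are placeholders): collect the top domains of all landing URLs from two observed-sites files into one output list.

# Hypothetical invocation of get_domains.
get_domains(["sites_a.text", "sites_b.text"], "domains.txt")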
Code example #5
def test_intersect_observed_sites():
	observed_sites_list = ["../../data/abusive_words_9_category.computed/test.user.dom.cloaking",
			"../../data/abusive_words_9_category.computed/test.user.text.cloaking"]
	result = None
	for filename in observed_sites_list:
		observed_sites = CD.ObservedSites()
		read_proto_from_file(observed_sites, filename)
		files = sites_file_path_set(observed_sites)
		# intersect across files; guard on None so an empty intersection is kept
		result = files if result is None else result & files
	result_sites = intersect_observed_sites(*observed_sites_list)
	new_set = sites_file_path_set(result_sites)
	assert_equal(result, new_set)
Code example #6
def test_generate_test():
	filename = "../../data/US_web_search_list.Chrome.20141110-185317.selenium.crawl/crawl_log"
	generate_test(filename)
	text_test = filename + ".text.test"
	text_mismatch = filename + ".text.mismatch"
	dom_test = filename + ".dom.test"
	dom_mismatch = filename + ".dom.mismatch"
	text_test_sites = CD.ObservedSites()
	text_mismatch_sites = CD.ObservedSites()
	dom_test_sites = CD.ObservedSites()
	dom_mismatch_sites = CD.ObservedSites()
	read_proto_from_file(text_test_sites, text_test)
	read_proto_from_file(text_mismatch_sites, text_mismatch)
	read_proto_from_file(dom_test_sites, dom_test)
	read_proto_from_file(dom_mismatch_sites, dom_mismatch)
	assert_equal(len(text_test_sites.site), 5000)
	assert_equal(len(text_mismatch_sites.site), 1000)
	assert_equal(len(dom_test_sites.site), 5000)
	assert_equal(len(dom_mismatch_sites.site), 1000)
	text_test_set = set()
	text_mismatch_set = set()
	dom_test_set = set()
	dom_mismatch_set = set()
	for site in text_test_sites.site:
		text_test_set.add(site.name)
	for site in text_mismatch_sites.site:
		text_mismatch_set.add(site.name)
	for site in dom_test_sites.site:
		dom_test_set.add(site.name)
	for site in dom_mismatch_sites.site:
		dom_mismatch_set.add(site.name)
	assert_equal(text_test_set, dom_test_set)
	assert_equal(text_mismatch_set, dom_mismatch_set)
Code example #7
def check_equal(first_file, second_file):
	first_observed_sites = CD.ObservedSites()
	read_proto_from_file(first_observed_sites, first_file)
	second_observed_sites = CD.ObservedSites()
	read_proto_from_file(second_observed_sites, second_file)
	first_observed_sites_map = dict()
	for observed_site in first_observed_sites.site:
		first_observed_sites_map[observed_site.name] = observed_site
	for observed_site in second_observed_sites.site:
		if not observed_site.name in first_observed_sites_map:
			return False
		if not observed_site == first_observed_sites_map[observed_site.name]:
			return False
	return True
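Note that check_equal only verifies that every site in second_file matches a site of the same name in first_file; extra sites in first_file go undetected. A sketch of a symmetric variant, under the same proto assumptions:

def check_equal_both_ways(first_file, second_file):
	# Hypothetical variant: additionally require both files to contain the
	# same number of sites, so neither side can have unmatched entries.
	first = CD.ObservedSites()
	read_proto_from_file(first, first_file)
	second = CD.ObservedSites()
	read_proto_from_file(second, second_file)
	if len(first.site) != len(second.site):
		return False
	return check_equal(first_file, second_file)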
Code example #8
def load_split_observed_sites(filename):
	if not os.path.exists(filename):
		count = 0
		split_files = list()
		while True:
			split_file = filename.replace('list', 'list_' +
					str(count))
			if not os.path.exists(split_file):
				break
			split_files.append(split_file)
			count += 1
		observed_sites = merge_observed_sites(split_files)
	else:
		observed_sites = CD.ObservedSites()
		read_proto_from_file(observed_sites, filename)
	return observed_sites
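Usage note: when the unsplit file is missing, the loader looks for split parts named by replacing 'list' with 'list_0', 'list_1', ... and stops at the first gap. For example (hypothetical path):

# Merges foo.list_0, foo.list_1, ... when foo.list itself does not exist.
observed_sites = load_split_observed_sites("some_dir/foo.list")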
Code example #9
File: util.py Project: slitayem/cloaking-detection-1
def evaluation_form(sites_filename, out_filename, proto):
	sites = getattr(CD, proto)()
	read_proto_from_file(sites, sites_filename)
	out_f = open(out_filename, "w")
	if proto == "LearnedSites":
		for site in sites.site:
			for pattern in site.pattern:
				out_f.write(site.name + "\n" + \
						pattern.item[0].sample_file_path + "\n")
		out_f.close()
	elif proto == "ObservedSites":
		for site in sites.site:
			for observation in site.observation:
				out_f.write(site.name + "\n" + observation.file_path + "\n")
		out_f.close()
	else:
		raise Exception("Wrong proto! Only LearnedSites and ObservedSites can be used!")
Code example #10
	def crawl(self):
		has_written = False
		for user_agent in self.user_agents:
			user_agent_md5 = hex_md5(user_agent)
			self.crawl_config.user_agent = user_agent
			self.crawl_config.user_agent_md5_dir = self.base_dir + user_agent_md5 + '/'
			# specify which type of browser to use
			set_browser_type(self.crawl_config)
			mkdir_if_not_exist(self.crawl_config.user_agent_md5_dir)
			# md5 - user agent mapping logs
			md5_UA_f = open(self.md5_UA_filename, 'a')  # user agent
			md5_UA_f.write(user_agent_md5 + ":" + user_agent + "\n")
			md5_UA_f.close()
			# crawl web pages
			url_fetcher = UrlFetcher(self.crawl_config)
			thread_computer = ThreadComputer(url_fetcher, 'fetch_url', self.urls)
			url_fetcher.quit()
			# Write log for current user agent
			current_log = CD.CrawlLog()
			current_log_filename = self.crawl_config.user_agent_md5_dir + 'crawl_log'
			current_search = CD.CrawlSearchTerm()
			for p, s in thread_computer.result:
				result = current_search.result.add()
				result.CopyFrom(s)
				result_search = current_log.result_search.add()
				result_search.CopyFrom(current_search)
			write_proto_to_file(current_log, current_log_filename)
			# Write global crawl_log
			crawl_log = CD.CrawlLog()
			if has_written:
				read_proto_from_file(crawl_log, self.crawl_log_filename)
			else:
				has_written = True
			for r_s in current_log.result_search:
				result_search = crawl_log.result_search.add()
				result_search.CopyFrom(r_s)
			"""
			for s in current_log.result:
				result = crawl_log.result.add()
				result.CopyFrom(s)
			"""
			write_proto_to_file(crawl_log, self.crawl_log_filename)
Code example #11
def get_learned_eval(learned_file, observed_file):
	learned_sites = CD.LearnedSites()
	read_proto_from_file(learned_sites, learned_file)
	observed_sites = CD.ObservedSites()
	read_proto_from_file(observed_sites, observed_file)
	observed_sites_list = list()
	for observed_site in observed_sites.site:
		observed_sites_list.append(observed_site.name)
	learned_sites_map = dict()
	for learned_site in learned_sites.site:
		learned_sites_map[learned_site.name] = learned_site
	result_sites = CD.LearnedSites()
	for site_name in observed_sites_list:
		if site_name not in learned_sites_map:
			print "Detected cloaking: {0} not in learned sites, \
					Strange!".format(site_name)
			continue
		result_site = result_sites.site.add()
		result_site.CopyFrom(learned_sites_map[site_name])
	return result_sites
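A hedged usage sketch, mirroring the get_learned_eval branch of main() below (file names are placeholders):

# Hypothetical: keep only the learned patterns for sites that were detected.
result_sites = get_learned_eval("sites.learned", "sites.detected")
write_proto_to_file(result_sites, "sites.detected.learned")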
Code example #12
def revisit(crawl_log_file_list, word_file, n):
	"""
	visit landing urls in crawl_log_file n times
	@parameter
	crawl_log_file_list: list of filenames of crawl_log
	word_file: file containing words in crawl_log_file, used for creating base_dir
	n: number of times to visit
	"""
	# google_UA is not used in search and crawl. Used in later visit.
	google_UA = "AdsBot-Google (+http://www.google.com/adsbot.html)"
	google_suffix = 'google.crawl/'
	for i in range(int(n)):
		# the time label is set for each iteration of visit
		now_suffix = datetime.now().strftime(".%Y%m%d-%H%M%S")
		for crawl_log_file in crawl_log_file_list:
			# compute base_dir and start logging
			base_dir = '.'.join([word_file, google_suffix])
			mkdir_if_not_exist(base_dir)
			logging.basicConfig(filename=base_dir+'running_log'+now_suffix, level=logging.DEBUG)
			logging.getLogger("global")

			# set crawl_config
			crawl_config = CD.CrawlConfig()
			crawl_config.maximum_threads = 6
			crawl_config.user_agent = google_UA
			crawl_config.user_agent_md5_dir = base_dir + hex_md5(crawl_config.user_agent) \
					+ now_suffix + '/'
			crawl_config.browser_type = CD.CrawlConfig.CHROME

			google_crawl_log = crawl_log_file.split('/')[-1] + '.google'
			crawl_config.log_filename = google_crawl_log + now_suffix
			revisit = Visit(crawl_config)
			crawl_log = CD.CrawlLog()
			read_proto_from_file(crawl_log, crawl_log_file)
			landing_url_set = crawl_log_attr_set(crawl_log, "landing_url")
			revisit.visit_landing_url(landing_url_set)
			revisit.write_crawl_log(False)
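crawl_log_attr_set is not shown; presumably it collects one attribute from every result in a CrawlLog, roughly as sketched below (the proto structure is inferred from the crawl() example above, so treat this as an assumption):

def crawl_log_attr_set_sketch(crawl_log, attr):
	# Hypothetical stand-in for crawl_log_attr_set: gather e.g. "landing_url"
	# from every result of every result_search entry.
	values = set()
	for result_search in crawl_log.result_search:
		for result in result_search.result:
			values.add(getattr(result, attr))
	return values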
Code example #13
def update_groundtruth(original_expected,
		original_u_text, original_u_dom, original_g_text, original_g_dom, 
		add_expected, add_u_text, add_u_dom, add_g_text, add_g_dom, 
		out_expected, out_u_text, out_u_dom, out_g_text, out_g_dom):
	in_e = CD.ObservedSites()
	read_proto_from_file(in_e, original_expected)
	in_u_t = CD.ObservedSites()
	read_proto_from_file(in_u_t, original_u_text)
	in_u_d = CD.ObservedSites()
	read_proto_from_file(in_u_d, original_u_dom)
	in_g_t = CD.ObservedSites()
	read_proto_from_file(in_g_t, original_g_text)
	in_g_d = CD.ObservedSites()
	read_proto_from_file(in_g_d, original_g_dom)

	# the add google files are lists (of split files)
	add_e = CD.ObservedSites()
	read_proto_from_file(add_e, add_expected)
	#add_u_t = CD.ObservedSites()
	#read_proto_from_file(add_u_t, add_u_text)
	#add_u_d = CD.ObservedSites()
	#read_proto_from_file(add_u_d, add_u_dom)
	#add_g_t = merge_observed_sites(add_g_text)
	#add_g_d = merge_observed_sites(add_g_dom)
	in_e_set = sites_name_set(in_e)
	add_e_set = sites_name_set(add_e)
	diff_e_set = add_e_set - in_e_set
	logger = logging.getLogger("global")
	logger.info("size of original set: {0}, size of add set: {1}, size of diff set: {2}".format(
		len(in_e_set), len(add_e_set), len(diff_e_set)))
	logger.info("diff set is")	
	logger.info(diff_e_set)

	_output_sample_sites(diff_e_set, [add_expected], add_expected + ".temp")
	_output_sample_sites(diff_e_set, [add_u_text], add_u_text + ".temp")
	_output_sample_sites(diff_e_set, [add_u_dom], add_u_dom + ".temp")
	add_g_text_fs = filter(bool, open(add_g_text, 'r').read().split('\n'))
	add_g_dom_fs = filter(bool, open(add_g_dom, 'r').read().split('\n'))
	_output_sample_sites(diff_e_set, add_g_text_fs, add_g_text + ".temp")
	_output_sample_sites(diff_e_set, add_g_dom_fs, add_g_dom + ".temp")
	out_expected_sites = merge_observed_sites([original_expected,
		add_expected + ".temp"])
	out_u_t_sites = merge_observed_sites([original_u_text, add_u_text + ".temp"])
	out_u_d_sites = merge_observed_sites([original_u_dom, add_u_dom + ".temp"])
	out_g_t_sites = merge_observed_sites([original_g_text, add_g_text + ".temp"])
	out_g_d_sites = merge_observed_sites([original_g_dom, add_g_dom + ".temp"])
	out_u_t_sites.config.CopyFrom(in_u_t.config)
	out_u_d_sites.config.CopyFrom(in_u_d.config)
	out_g_t_sites.config.CopyFrom(in_g_t.config)
	out_g_d_sites.config.CopyFrom(in_g_d.config)
	write_proto_to_file(out_expected_sites, out_expected)
	write_proto_to_file(out_u_t_sites, out_u_text)
	write_proto_to_file(out_u_d_sites, out_u_dom)
	write_proto_to_file(out_g_t_sites, out_g_text)
	write_proto_to_file(out_g_d_sites, out_g_dom)
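sites_name_set is another helper that is not shown; it presumably just collects the site names, roughly (hypothetical sketch):

def sites_name_set_sketch(observed_sites):
	# Hypothetical stand-in for sites_name_set: the set of site names in an
	# ObservedSites proto.
	return set(site.name for site in observed_sites.site)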
Code example #14
def dedup(text_file):
	"""
	1. dom_file, google_text_file, google_dom_file are deducted from text_file
	2. google files can be split. we first check whether unsplit exisits, if
	not we merge all the split ones.
	3. The observed sites are output to correponding filename + '.dedup'

	@parameter
	text_file: text observed sites file
	@return
	number of websites after deduplicate
	"""
	dom_file = text_file.replace('text', 'dom')
	user_text_observed_sites = CD.ObservedSites()
	read_proto_from_file(user_text_observed_sites, text_file)
	logger = logging.getLogger("global")
	logger.info("processing {0}".format(text_file))
	logger.info("before dedup: {0}".format(len(user_text_observed_sites.site)))
	user_dom_observed_sites = CD.ObservedSites()
	read_proto_from_file(user_dom_observed_sites, dom_file)
	google_text_file = text_file.replace('user', 'google')
	google_text_observed_sites = load_split_observed_sites(google_text_file)
	google_dom_file = dom_file.replace('user', 'google')
	google_dom_observed_sites = load_split_observed_sites(google_dom_file)

	user_text_dict, user_text_sites_dict = build_site_simhash_dict(user_text_observed_sites)
	user_dom_dict, user_dom_sites_dict = build_site_simhash_dict(user_dom_observed_sites)
	google_text_dict, google_text_sites_dict = build_site_simhash_dict(google_text_observed_sites)
	google_dom_dict, google_dom_sites_dict = build_site_simhash_dict(google_dom_observed_sites)

	# how to define exact match
	user_text_remained = CD.ObservedSites()
	user_dom_remained = CD.ObservedSites()
	google_text_remained = CD.ObservedSites()
	google_dom_remained = CD.ObservedSites()
	text_failure = set([0])
	failure_count = 0
	# if the feature set is empty, then this is the hash value.
	text_zero = set([18446744073709551615])
	zero_count = 0
	google_failure_count = 0
	google_zero_count = 0
	for site_name in user_text_dict:
		if ((not site_name in google_text_dict) or
				(not site_name in google_dom_dict)):
			continue
		if (user_text_dict[site_name] == text_failure):
			failure_count += 1
			continue
		elif (user_text_dict[site_name] == text_zero):
			zero_count += 1
			continue
		elif (google_text_dict[site_name] == text_failure):
			google_failure_count += 1
			continue
		elif (google_text_dict[site_name] == text_zero):
			google_zero_count += 1
			continue
		text_common = user_text_dict[site_name] & google_text_dict[site_name] 
		dom_common = user_dom_dict[site_name] & google_dom_dict[site_name]
		if (text_common == user_text_dict[site_name] and 
				dom_common == user_dom_dict[site_name]):
			continue
		else:
			_add_observed_site(user_text_remained, user_text_sites_dict, site_name)
			_add_observed_site(user_dom_remained, user_dom_sites_dict, site_name)
			_add_observed_site(google_text_remained, google_text_sites_dict, site_name)
			_add_observed_site(google_dom_remained, google_dom_sites_dict, site_name)

	user_text_remained.config.CopyFrom(user_text_observed_sites.config)
	user_dom_remained.config.CopyFrom(user_dom_observed_sites.config)
	google_text_remained.config.CopyFrom(google_text_observed_sites.config)
	google_dom_remained.config.CopyFrom(google_dom_observed_sites.config)
	write_proto_to_file(user_text_remained, text_file + ".dedup")
	write_proto_to_file(user_dom_remained, dom_file + ".dedup")
	write_proto_to_file(google_text_remained, google_text_file + ".dedup")
	write_proto_to_file(google_dom_remained, google_dom_file + ".dedup")
	logger.info("after dedup: {0}".format(len(user_text_remained.site)))
	logger.info("failure count: {0}, zero feature count: {1}".format(failure_count, zero_count))
	logger.info("google failure count: {0}, google zero feature count: {1}".format(google_failure_count, google_zero_count))
	return len(user_text_remained.site)
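A note on the sentinels used above: text_failure = {0} appears to mark failed observations, and text_zero = {18446744073709551615} is 2**64 - 1 (all 64 bits set), the simhash assigned to an empty feature set. A quick check of that constant:

# 18446744073709551615 is the all-ones 64-bit value.
assert 18446744073709551615 == 2 ** 64 - 1 == 0xFFFFFFFFFFFFFFFF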
Code example #15
def main(argv):
	has_function = False
	help_msg = """data_util.py -f <function> [-p <prefix>][-p <prefix> -o
	<outfile>][-i <inputfile> -t <proto_type>][-o <outfile>][-i <site_list>
	-l <server_link> -o <outdir> -m <mode>][-i <inputfile> -o <outfile> -s
	<simhash_type> -t <proto_type>][-i <inputfile> -o <outfile> -s
	<simhash_type> -t <proto_type> -a] [-o <outfile>] [-i <inputfile> -o
	<outfile>] [-i <inputfile>] [-i <text_filt>] [-i <inputfile> -c <count>
	-o <outfile>] [-o <outfile>] [-i <inputfile> -l <learnedfile> -o <outfile>], valid functions are
	append_prefix, compute_list, show_proto, intersect_sites,
	collect_observations, plot_simhash, plot_sim_distance, get_domains,
	get_domain_scores, domain_filter, dedup, sample, merge_sites,
	get_learned_eval, [-i <table_name> -o <outfile>] export_db
	[-i <inputfile> -o <outfile>] de_noise
	[-i <inputfile> -c <count>] update_groundtruth
	[-i <user observation list, suffix removed>] merge_user_sites"""
	try:
		opts, args = getopt.getopt(argv, "hf:p:o:t:i:m:l:s:ac:",
				["function=", "prefix=", "outfile=",
					"proto_type=", "ifile=", "mode=",
					"link=", "simhash_type=", "avg_dist",
					"count"])
	except getopt.GetoptError:
		print help_msg
		sys.exit(2)
	hasinputfile = False
	outfile = None
	avg_dist = False
	link = None  # avoid a NameError when -l/--link is not supplied
	for opt, arg in opts:
		if opt == "-h":
			print help_msg
			sys.exit()
		elif opt in ("-f", "--function"):
			function = arg
			has_function = True
		elif opt in ("-p", "--prefix"):
			prefix = arg
		elif opt in ("-o", "--outfile"):
			outfile = arg
		elif opt in ("-i", "--ifile"):
			inputfile = arg
			hasinputfile = True
		elif opt in ("-t", "--proto_type"):
			proto_type = arg
		elif opt in ("-m", "--mode"):
			mode = arg
		elif opt in ("-l", "--link"):
			link = arg
		elif opt in ("-s", "--simhash_type"):
			simhash_type = arg
		elif opt in ("-a", "--avg_dist"):
			avg_dist = True
		elif opt in ("-c", "--count"):
			count = arg
		else:
			print help_msg
			sys.exit(2)
	if hasinputfile:
		logging.basicConfig(filename= inputfile + "_running_log_" + function, level=logging.DEBUG)
		logging.getLogger("global")
	if not has_function:
		print help_msg
		sys.exit()
	if function == "append_prefix":
		inputfile_list = [line[:-1] for line in sys.stdin]
		append_prefix(inputfile_list, prefix)
	elif function == "compute_list":
		crawl_log_list = [line[:-1] for line in sys.stdin]
		compute_list(crawl_log_list, outfile, prefix)
	elif function == "show_proto":
		show_proto(inputfile, proto_type)
	elif function == "intersect_sites":
		observed_sites_list = [line[:-1] for line in sys.stdin]
		result_sites = intersect_observed_sites(*observed_sites_list)
		write_proto_to_file(result_sites, outfile)
		evaluation_form(outfile, outfile + ".eval", "ObservedSites")
	elif function == "collect_observations":
		if link:
			util.REMOTE_DRIVER = link
		site_list = filter(bool, open(inputfile, 'r').read().split('\n'))
		site_set = set(site_list)
		outdir = outfile
		collect_site_for_plot(site_set, outdir, mode)
	elif function == "plot_simhash":
		if not outfile:
			outfile = inputfile + ".plot_cluster"
		plot_simhash(inputfile, outfile, simhash_type, proto_type)
	elif function == "plot_sim_distance":
		if not outfile:
			outfile = inputfile + ".plot_sim_distance"
		plot_sim_distance(inputfile, outfile, simhash_type, proto_type,
				avg_dist)
	elif function == "get_domains":
		observed_sites_list = [line[:-1] for line in sys.stdin]
		get_domains(observed_sites_list, outfile)
	elif function == "get_domain_scores":
		domains = filter(bool, open(inputfile, 'r').read().split('\n'))
		result = domain_scores(domains, outfile)
	elif function == "domain_filter":
		"""
		Three steps for computed sites.
		1. filter known benign
		2. de-duplicate
		3. sample $count number of sites
		"""
		bar_points = 60
		observed_sites_list = filter(bool, open(inputfile, 'r').read().split('\n'))
		for filename in observed_sites_list:
			get_bad(bar_points, filename, filename + ".filt")
	elif function == "dedup":
		text_filenames = filter(bool, open(inputfile, 'r').read().split('\n'))
		count = 0
		for filename in text_filenames:
			if ((not 'text' in filename) or ('google' in filename) or
					('dom' in filename)):
				response = interact_query("The input file doesn't seem to \
						be valid! Press [Yes/No] to continue or exit!")
				if not response:
					sys.exit(0)
			count += dedup(filename)

		logger = logging.getLogger("global")
		logger.info("total sites after dedup: {0}".format(count))
	elif function == "sample":
		text_filenames = filter(bool, open(inputfile, 'r').read().split('\n'))
		sample(text_filenames, outfile, int(count))
		evaluation_form(outfile + '.user.sample.text', outfile +
				".user.sample.text.eval", "ObservedSites")
		evaluation_form(outfile + '.google.sample.text', outfile +
				".google.sample.text.eval", "ObservedSites")
	elif function == "merge_sites":
		observed_sites_names = [line[:-1] for line in sys.stdin]
		observed_sites = merge_observed_sites(observed_sites_names)
		logger = logging.getLogger("global")
		logger.info("total sites after merge: {0}".format(len(observed_sites.site)))
		write_proto_to_file(observed_sites, outfile)
	elif function == "merge_user_sites":
		"""
		-i input_file
		"""
		filenames = filter(bool, open(inputfile, 'r').read().split('\n'))
		text_filenames = [filename + '.text' for filename in filenames]
		dom_filenames = [filename + '.dom' for filename in filenames]
		text_observed_sites = merge_observed_sites(text_filenames)
		logger = logging.getLogger("global")
		logger.info("total sites after merge: {0}".format(len(text_observed_sites.site)))
		write_proto_to_file(text_observed_sites, inputfile + '.text')
		dom_observed_sites = merge_observed_sites(dom_filenames)
		logger.info("total sites after merge: {0}".format(len(dom_observed_sites.site)))
		write_proto_to_file(dom_observed_sites, inputfile + '.dom')
	elif function == "get_learned_eval":
		"""
		-l learned_file -i detected_file
		"""
		learned_file = link
		observed_file = inputfile
		result_sites = get_learned_eval(learned_file, observed_file)
		write_proto_to_file(result_sites, outfile)
		evaluation_form(outfile, outfile + ".eval", "LearnedSites")
	elif function == "export_db":
		"""
		-i table_name -o outfile
		"""
		export_db_to_file(inputfile, outfile)
		export_db_to_file(inputfile, outfile + ".noise", ["PageBroken"])
	elif function == "de_noise":
		"""
		remove noise: index.html not found, feature count = 0
		"""
		if "learn" in inputfile:
			response = interact_query("The input file seems to \
					be learned sites, we only support observed \
					sites! Press [Yes/No] to continue or exit!")
			if not response:
				sys.exit(0)

		logger = logging.getLogger("global")
		logger.info("processing {0}".format(inputfile))
		de_noise_config = CD.DeNoiseConfig()
		de_noise_config.zero_feature = True
		original = CD.ObservedSites()
		read_proto_from_file(original, inputfile)
		observed_sites = de_noise(original, de_noise_config)
		logger.info("before de-noise {0}".format(len(original.site)))
		logger.info("after de-noise: {0}".format(len(observed_sites.site)))
		outfile = outfile if outfile else inputfile
		write_proto_to_file(observed_sites, outfile)
	elif function == "update_groundtruth":
		"""
		This function is too specific. It is to add more malicious
		examples to the collected groundtruth.
		"""
		filenames = filter(bool, open(inputfile, 'r').read().split('\n'))
		if len(filenames) == 15:
			original_expected = filenames[0]
			original_u_text = filenames[1]
			original_u_dom = filenames[2]
			original_g_text = filenames[3]
			original_g_dom = filenames[4]
			# observed site may have same URL.
			add_count = count
			add_expected = filenames[5]
			add_u_text = filenames[6]
			add_u_dom = filenames[7]
			add_g_text = filenames[8]
			add_g_dom = filenames[9]
			# outfile
			out_expected = filenames[10]
			out_u_text = filenames[11]
			out_u_dom = filenames[12]
			out_g_text = filenames[13]
			out_g_dom = filenames[14]
			# in this case we will add all
			update_groundtruth(original_expected,
					original_u_text, original_u_dom,
					original_g_text, original_g_dom,
					add_expected, add_u_text, add_u_dom,
					add_g_text, add_g_dom,
					out_expected, out_u_text, out_u_dom,
					out_g_text, out_g_dom)
		elif len(filenames) == 12:
			original_expected = filenames[0]
			original_u_text = filenames[1]
			original_u_dom = filenames[2]
			original_g_text = filenames[3]
			original_g_dom = filenames[4]
			# observed site may have same URL.
			add_count = int(count)
			add_expected = filenames[5]
			add_all = filenames[6]
			'''
			add_u_text = filenames[6]
			add_u_dom = filenames[7]
			add_g_text = filenames[8]
			add_g_dom = filenames[9]
			# outfile
			out_expected = filenames[10]
			out_u_text = filenames[11]
			out_u_dom = filenames[12]
			out_g_text = filenames[13]
			out_g_dom = filenames[14]
			'''
			out_expected = filenames[7]
			out_u_text = filenames[8]
			out_u_dom = filenames[9]
			out_g_text = filenames[10]
			out_g_dom = filenames[11]

			update_groundtruth_redundant(add_count, original_expected,
					original_u_text, original_u_dom,
					original_g_text, original_g_dom,
					add_expected, add_all,
					out_expected, out_u_text, out_u_dom,
					out_g_text, out_g_dom)
		else:
			raise Exception("Cannot handle now!")
	else:
		print help_msg
		sys.exit(2)
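Hedged command-line examples, based only on the help message and option parsing above; the paths and the -s value are placeholders:

python data_util.py -f plot_simhash -i sites.text -o sites.text.plot_cluster -s text -t ObservedSites
cat observed_sites.list | python data_util.py -f merge_sites -o merged.sites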
Code example #16
def update_groundtruth_redundant(count, original_expected,
		original_u_text, original_u_dom,
		original_g_text, original_g_dom,
		add_expected, add_all,
		out_expected, out_u_text, out_u_dom,
		out_g_text, out_g_dom):
	valid_instance(count, int)
	in_e = CD.ObservedSites()
	read_proto_from_file(in_e, original_expected)
	in_u_t = CD.ObservedSites()
	read_proto_from_file(in_u_t, original_u_text)
	in_u_d = CD.ObservedSites()
	read_proto_from_file(in_u_d, original_u_dom)
	in_g_t = CD.ObservedSites()
	read_proto_from_file(in_g_t, original_g_text)
	in_g_d = CD.ObservedSites()
	read_proto_from_file(in_g_d, original_g_dom)
	add_e = CD.ObservedSites()
	read_proto_from_file(add_e, add_expected)

	in_e_set = sites_name_set(in_e)
	add_e_set = sites_name_set(add_e)
	diff_e_set = add_e_set - in_e_set
	logger = logging.getLogger("global")
	logger.info("size of original set: {0}, size of add set: {1}, size of diff set: {2}".format(
		len(in_e_set), len(add_e_set), len(diff_e_set)))
	logger.info("diff set is")	
	logger.info(diff_e_set)
	diff_e_list = list(diff_e_set)
	logger.info(len(diff_e_list))
	random.shuffle(diff_e_list)
	diff_e_sample = diff_e_list[:count]
	

	"""
	get the sites that are in "de-deduplicated" examples and add them
	this is necessary, because there are sites, that are cloaking, but
	remove in de-dup phase. Doesn't know why.
	"""
	add_u_text_fs = filter(bool, open(add_all, 'r').read().split('\n'))
	diff_e_sample = set(_output_sample_sites(diff_e_sample, add_u_text_fs, add_all
		+ ".u.text.temp"))

	# use the updated diff expected set, to generate the new data
	_output_sample_sites(diff_e_sample, [add_expected], add_expected + ".temp")
	add_u_dom_fs = _replace_list_by(add_u_text_fs, 'text', 'dom')
	_output_sample_sites(diff_e_sample, add_u_dom_fs, add_all + ".u.dom.temp")
	add_g_text_fs = _replace_list_by(add_u_text_fs, 'user', 'google')
	_output_sample_sites(diff_e_sample, add_g_text_fs, add_all + ".g.text.temp")
	add_g_dom_fs = _replace_list_by(add_u_dom_fs, 'user', 'google')
	_output_sample_sites(diff_e_sample, add_g_dom_fs, add_all + ".g.dom.temp")
	out_expected_sites = merge_observed_sites([original_expected, add_expected + ".temp"])
	out_u_t_sites = merge_observed_sites([original_u_text, add_all + ".u.text.temp"])
	out_u_d_sites = merge_observed_sites([original_u_dom, add_all + ".u.dom.temp"])
	out_g_t_sites = merge_observed_sites([original_g_text, add_all +
		".g.text.temp"])
	out_g_d_sites = merge_observed_sites([original_g_dom, add_all +
		".g.dom.temp"])
	out_u_t_sites.config.CopyFrom(in_u_t.config)
	out_u_d_sites.config.CopyFrom(in_u_d.config)
	out_g_t_sites.config.CopyFrom(in_g_t.config)
	out_g_d_sites.config.CopyFrom(in_g_d.config)
	write_proto_to_file(out_expected_sites, out_expected)
	write_proto_to_file(out_u_t_sites, out_u_text)
	write_proto_to_file(out_u_d_sites, out_u_dom)
	write_proto_to_file(out_g_t_sites, out_g_text)
	write_proto_to_file(out_g_d_sites, out_g_dom)
Code example #17
def search_and_revisit(word_file, n, threads=6, ad_only=False):
	"""
	This function does the following things.
	1. Search each word in word file.
	2. Grab the top 200 returned results and corresponding ads
	3. Visit all the results and ads with "chrome user agent", repeat n times
	4. Visit all the landing pages in step 3 with "google ads bot user agent"

	@parameter
	word_file: the filename containing the words to search
	n: repeat step 3 for n times
	ad_only: Only retrieve the advertisements. In this case, we only view the first 5 pages.

	@output
	Following are output of this function
	Running log:
	[WORD_FILE].selenium.crawl/running_log.[SEARCH_TIME]
	"chrome user agent" result is:
	[WORD_FILE].selenium.crawl/ad_crawl_log.[SEARCH_TIME].[WORD_MD5]
	[WORD_FILE].selenium.crawl/search_crawl_log.[SEARCH_TIME].[WORD_MD5]
	[WORD_FILE].selenium.crawl/[WORD_MD5]/[UA_MD5].[SEARCH_TIME]/[URL_MD5]/index.html
	"google ads bot user agent" result is:
	[WORD_FILE].selenium.crawl/ad_crawl_log.[SEARCH_TIME].[WORD_MD5].google
	[WORD_FILE].selenium.crawl/search_crawl_log.[SEARCH_TIME].[WORD_MD5].google
	[WORD_FILE].selenium.crawl/[WORD_MD5]/[UA_MD5].[SEARCH_TIME].revisit.[REVISIT_TIME]/[URL_MD5]/index.html
	"""
	valid_instance(threads, int)
	# prepare search and visit
	user_UA = "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/" \
			"537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36"
	user_suffix = "selenium.crawl/"
	search_now_suffix = datetime.now().strftime(".%Y%m%d-%H%M%S")
	word_md5_delimiter = "WORD_MD5"

	# compute base_dir and start logging
	base_dir = '.'.join([word_file, user_suffix])
	mkdir_if_not_exist(base_dir)
	logging.basicConfig(filename=base_dir+'running_log'+search_now_suffix, level=logging.DEBUG)
	logging.getLogger("global")

	# set search and visit crawl_config
	search_config = CD.CrawlConfig()
	search_config.maximum_threads = threads
	search_config.user_agent = user_UA
	# number of top search results to be inspected
	if ad_only:
		search_config.count = 50
	search_config.browser_type = CD.CrawlConfig.CHROME

	ad_crawl_config = CD.CrawlConfig()
	ad_crawl_config.CopyFrom(search_config)
	ad_crawl_config.result_type = CD.AD
	ad_crawl_config.crawl_log_dir = base_dir
	ad_log_filename_prefix = 'ad_crawl_log' + search_now_suffix
	ad_dir_prefix = base_dir + word_md5_delimiter + "/" + \
			hex_md5(ad_crawl_config.user_agent) + search_now_suffix + '/'
	search_crawl_config = CD.CrawlConfig()
	search_crawl_config.CopyFrom(search_config)
	search_crawl_config.result_type = CD.SEARCH
	search_crawl_config.crawl_log_dir = base_dir
	search_log_filename_prefix = 'search_crawl_log' + search_now_suffix
	search_dir_prefix = base_dir + word_md5_delimiter + "/" + \
			hex_md5(search_crawl_config.user_agent) + search_now_suffix + '/'

	# print crawl_config.user_agent
	words = SearchTerm(word_file)
	search = Search(search_config)
	ad_visit = Visit(ad_crawl_config, 1)
	search_visit = Visit(search_crawl_config, 1)

	# prepare the revisit
	google_ad_UA = "AdsBot-Google (+http://www.google.com/adsbot.html)"
	google_search_UA = "Googlebot/2.1 (+http://www.google.com/bot.html)"

	# set revisit crawl_config
	revisit_crawl_config = CD.CrawlConfig()
	revisit_crawl_config.maximum_threads = threads
	revisit_crawl_config.browser_type = CD.CrawlConfig.CHROME
	# base directory uses search_now_suffix to correlate these two
	revisit_crawl_config.crawl_log_dir = base_dir

	# search, visit and revisit each word
	for word in words.get_word_list():
		print "Processing {0} word: {1}".format(words.current(), word)
		# update word_md5 related directories
		print word
		word_md5 = hex_md5(word)
		ad_crawl_config.log_filename = ad_log_filename_prefix + "." + word_md5
		ad_crawl_config.user_agent_md5_dir = word_md5.join(
				ad_dir_prefix.split(word_md5_delimiter))
		search_crawl_config.log_filename = search_log_filename_prefix + "." + word_md5
		search_crawl_config.user_agent_md5_dir = word_md5.join(
				search_dir_prefix.split(word_md5_delimiter))
		ad_visit.update_crawl_config(ad_crawl_config)
		search_visit.update_crawl_config(search_crawl_config)
		
		# search and crawl
		right_click = not ad_only
		ad_set, search_set = search.search(word, right_click)
		ad_crawl_log_filename = ad_visit.visit(ad_set, word)
		if ad_only:
			search_crawl_log_filename = None
		else:
			search_crawl_log_filename = search_visit.visit(search_set, word)

		# revisit
		crawl_log_file_list = list()
		if ad_crawl_log_filename:
			crawl_log_file_list.append(ad_crawl_log_filename)
		if search_crawl_log_filename:
			crawl_log_file_list.append(search_crawl_log_filename)
		for crawl_log_file in crawl_log_file_list:
			if crawl_log_file == ad_crawl_log_filename:
				revisit_crawl_config.user_agent = google_ad_UA
			else:
				revisit_crawl_config.user_agent = google_search_UA
			revisit_dir_prefix = base_dir + word_md5_delimiter + "/" + \
					hex_md5(revisit_crawl_config.user_agent) + search_now_suffix
			revisit_crawl_config.log_filename = crawl_log_file.split('/')[-1] + '.google'
			revisit = Visit(revisit_crawl_config)
			crawl_log = CD.CrawlLog()
			read_proto_from_file(crawl_log, crawl_log_file)
			revisit.visit_landing_url_n_times(crawl_log, int(n), revisit_dir_prefix,
					word_md5, word_md5_delimiter)
		words.next()
		"""
Code example #18
File: util.py Project: slitayem/cloaking-detection-1
def generate_test(observed_sites_filename, test_size=5000, positive_size=1000):
	text_observed_sites_filename = observed_sites_filename + ".text"
	dom_observed_sites_filename = observed_sites_filename + ".dom"
	if not (os.path.exists(dom_observed_sites_filename) and os.path.exists(text_observed_sites_filename)):
		raise Exception("Computed observed sites file doesn't exist!")

	# select for text simhash first
	computed_observed_sites_filename = text_observed_sites_filename
	observed_sites = CD.ObservedSites()
	read_proto_from_file(observed_sites, computed_observed_sites_filename)
	observed_site_list = list()
	for observed_site in observed_sites.site:
		observed_site_list.append(observed_site)
	random.shuffle(observed_site_list)
	# test_size is the number of sites; the actual number of observations may be larger.
	test_sites = CD.ObservedSites()
	mismatch_sites = CD.ObservedSites()
	test_sites.config.CopyFrom(observed_sites.config)
	mismatch_sites.config.CopyFrom(observed_sites.config)

	test_list = observed_site_list[0:test_size]
	mismatch_list = test_list[0:positive_size]
	# original_label_list and mismatch_label_mapping are used in the dom selection below.
	original_label_list = [observed_site.name for observed_site in test_list]
	mismatch_label_mapping = dict()
	for observed_site in mismatch_list:
		# the observed_site objects in test_list are also changed (shared references).
		current_label = observed_site.name
		mismatch_label = random.sample(observed_site_list, 1)[0].name
		while (top_domain(current_label) == top_domain(mismatch_label)):
			mismatch_label = random.sample(observed_site_list, 1)[0].name
		observed_site.name = mismatch_label
		mismatch_site = mismatch_sites.site.add()
		mismatch_site.CopyFrom(observed_site)
		mismatch_label_mapping[current_label] = mismatch_label
	for observed_site in test_list:
		test_site = test_sites.site.add()
		test_site.CopyFrom(observed_site)
	mismatch_sites_filename = computed_observed_sites_filename + ".mismatch"
	test_sites_filename = computed_observed_sites_filename + ".test"
	write_proto_to_file(mismatch_sites, mismatch_sites_filename)
	write_proto_to_file(test_sites, test_sites_filename)

	# select for dom simhash now
	computed_observed_sites_filename = dom_observed_sites_filename
	observed_sites = CD.ObservedSites()
	read_proto_from_file(observed_sites, computed_observed_sites_filename)
	observed_sites_map = dict()
	for observed_site in observed_sites.site:
		observed_sites_map[observed_site.name] = observed_site
	test_sites = CD.ObservedSites()
	mismatch_sites = CD.ObservedSites()
	test_sites.config.CopyFrom(observed_sites.config)
	mismatch_sites.config.CopyFrom(observed_sites.config)

	test_list = list()
	for label in original_label_list:
		test_list.append(observed_sites_map[label])
	for label in mismatch_label_mapping:
		observed_sites_map[label].name = mismatch_label_mapping[label]
		mismatch_site = mismatch_sites.site.add()
		mismatch_site.CopyFrom(observed_sites_map[label])
	for observed_site in test_list:
		test_site = test_sites.site.add()
		test_site.CopyFrom(observed_site)
	mismatch_sites_filename = computed_observed_sites_filename + ".mismatch"
	test_sites_filename = computed_observed_sites_filename + ".test"
	write_proto_to_file(mismatch_sites, mismatch_sites_filename)
	write_proto_to_file(test_sites, test_sites_filename)