Esempio n. 1
0
def update_groundtruth(original_expected,
		original_u_text, original_u_dom, original_g_text, original_g_dom, 
		add_expected, add_u_text, add_u_dom, add_g_text, add_g_dom, 
		out_expected, out_u_text, out_u_dom, out_g_text, out_g_dom):
	"""Extend an existing ground-truth set with sites found only in the add set.

	The site names present in add_expected but absent from original_expected
	form the "diff set". For each of the five add inputs, the diff set is
	passed to _output_sample_sites, which writes to "<input name>.temp".
	Each temp file is then merged with the matching original file and the
	result is written to the matching out_* path.

	Args:
		original_expected, original_u_text, original_u_dom, original_g_text,
			original_g_dom: filenames of the existing ObservedSites protos.
		add_expected, add_u_text, add_u_dom: filenames of the ObservedSites
			protos to add from.
		add_g_text, add_g_dom: text files holding one filename per line
			(blank lines are ignored); the listed files are the inputs.
		out_expected, out_u_text, out_u_dom, out_g_text, out_g_dom:
			filenames the merged results are written to.

	Note: the "*.temp" files are created next to the add inputs and are not
	removed by this function.
	"""
	in_e = CD.ObservedSites()
	read_proto_from_file(in_e, original_expected)
	in_u_t = CD.ObservedSites()
	read_proto_from_file(in_u_t, original_u_text)
	in_u_d = CD.ObservedSites()
	read_proto_from_file(in_u_d, original_u_dom)
	in_g_t = CD.ObservedSites()
	read_proto_from_file(in_g_t, original_g_text)
	in_g_d = CD.ObservedSites()
	read_proto_from_file(in_g_d, original_g_dom)

	add_e = CD.ObservedSites()
	read_proto_from_file(add_e, add_expected)

	# Only sites that are new relative to the original expected set are added.
	in_e_set = sites_name_set(in_e)
	add_e_set = sites_name_set(add_e)
	diff_e_set = add_e_set - in_e_set
	logger = logging.getLogger("global")
	logger.info("size of original set: {0}, size of add set: {1}, size of diff set: {2}".format(
		len(in_e_set), len(add_e_set), len(diff_e_set)))
	logger.info("diff set is")
	logger.info(diff_e_set)

	_output_sample_sites(diff_e_set, [add_expected], add_expected + ".temp")
	_output_sample_sites(diff_e_set, [add_u_text], add_u_text + ".temp")
	_output_sample_sites(diff_e_set, [add_u_dom], add_u_dom + ".temp")

	# The google inputs are list files. Read them inside a context manager so
	# the handles are closed, and build real lists (not lazy filter objects)
	# so the result is a list on both Python 2 and Python 3.
	with open(add_g_text, 'r') as list_file:
		add_g_text_fs = [name for name in list_file.read().split('\n') if name]
	with open(add_g_dom, 'r') as list_file:
		add_g_dom_fs = [name for name in list_file.read().split('\n') if name]
	_output_sample_sites(diff_e_set, add_g_text_fs, add_g_text + ".temp")
	_output_sample_sites(diff_e_set, add_g_dom_fs, add_g_dom + ".temp")

	out_expected_sites = merge_observed_sites([original_expected,
		add_expected + ".temp"])
	out_u_t_sites = merge_observed_sites([original_u_text, add_u_text + ".temp"])
	out_u_d_sites = merge_observed_sites([original_u_dom, add_u_dom + ".temp"])
	out_g_t_sites = merge_observed_sites([original_g_text, add_g_text + ".temp"])
	out_g_d_sites = merge_observed_sites([original_g_dom, add_g_dom + ".temp"])

	# The merged text/dom outputs take the config of their original input.
	out_u_t_sites.config.CopyFrom(in_u_t.config)
	out_u_d_sites.config.CopyFrom(in_u_d.config)
	out_g_t_sites.config.CopyFrom(in_g_t.config)
	out_g_d_sites.config.CopyFrom(in_g_d.config)

	write_proto_to_file(out_expected_sites, out_expected)
	write_proto_to_file(out_u_t_sites, out_u_text)
	write_proto_to_file(out_u_d_sites, out_u_dom)
	write_proto_to_file(out_g_t_sites, out_g_text)
	write_proto_to_file(out_g_d_sites, out_g_dom)
Esempio n. 2
0
def update_groundtruth_redundant(count, original_expected,
		original_u_text, original_u_dom,
		original_g_text, original_g_dom,
		add_expected, add_all,
		out_expected, out_u_text, out_u_dom,
		out_g_text, out_g_dom):
	"""Extend a ground-truth set with a random sample of newly expected sites.

	The site names present in add_expected but absent from original_expected
	are shuffled and the first `count` of them are taken as the sample. The
	sample is then replaced by whatever _output_sample_sites returns for the
	user-text files, and that updated sample drives the remaining outputs.
	Each generated temp file is merged with the matching original file and
	written to the matching out_* path.

	Args:
		count: maximum number of new sites to sample; checked with
			valid_instance(count, int).
		original_expected, original_u_text, original_u_dom, original_g_text,
			original_g_dom: filenames of the existing ObservedSites protos.
		add_expected: filename of the ObservedSites proto to add from.
		add_all: text file holding one filename per line (blank lines are
			ignored). The dom and google file lists are derived from these
			entries via _replace_list_by ('text' -> 'dom',
			'user' -> 'google').
		out_expected, out_u_text, out_u_dom, out_g_text, out_g_dom:
			filenames the merged results are written to.

	Note: temp files (add_expected + ".temp" and add_all + ".u.text.temp",
	".u.dom.temp", ".g.text.temp", ".g.dom.temp") are not removed by this
	function.
	"""
	valid_instance(count, int)
	in_e = CD.ObservedSites()
	read_proto_from_file(in_e, original_expected)
	in_u_t = CD.ObservedSites()
	read_proto_from_file(in_u_t, original_u_text)
	in_u_d = CD.ObservedSites()
	read_proto_from_file(in_u_d, original_u_dom)
	in_g_t = CD.ObservedSites()
	read_proto_from_file(in_g_t, original_g_text)
	in_g_d = CD.ObservedSites()
	read_proto_from_file(in_g_d, original_g_dom)
	add_e = CD.ObservedSites()
	read_proto_from_file(add_e, add_expected)

	in_e_set = sites_name_set(in_e)
	add_e_set = sites_name_set(add_e)
	diff_e_set = add_e_set - in_e_set
	logger = logging.getLogger("global")
	logger.info("size of original set: {0}, size of add set: {1}, size of diff set: {2}".format(
		len(in_e_set), len(add_e_set), len(diff_e_set)))
	logger.info("diff set is")
	logger.info(diff_e_set)

	# Random sample of at most `count` new sites.
	diff_e_list = list(diff_e_set)
	logger.info(len(diff_e_list))
	random.shuffle(diff_e_list)
	diff_e_sample = diff_e_list[:count]

	# Get the sites that are in the "de-duplicated" examples and add them.
	# This is necessary because there are sites that are cloaking but were
	# removed in the de-dup phase (reason unknown).
	#
	# The file list is reused three times below, so it must be a real list:
	# on Python 3 a lazy filter object would be exhausted after its first
	# use and the derived dom/google lists would come out empty. The context
	# manager also makes sure the list file is closed.
	with open(add_all, 'r') as list_file:
		add_u_text_fs = [name for name in list_file.read().split('\n') if name]
	diff_e_sample = set(_output_sample_sites(diff_e_sample, add_u_text_fs, add_all
		+ ".u.text.temp"))

	# Use the updated diff expected set to generate the new data.
	_output_sample_sites(diff_e_sample, [add_expected], add_expected + ".temp")
	add_u_dom_fs = _replace_list_by(add_u_text_fs, 'text', 'dom')
	_output_sample_sites(diff_e_sample, add_u_dom_fs, add_all + ".u.dom.temp")
	add_g_text_fs = _replace_list_by(add_u_text_fs, 'user', 'google')
	_output_sample_sites(diff_e_sample, add_g_text_fs, add_all + ".g.text.temp")
	add_g_dom_fs = _replace_list_by(add_u_dom_fs, 'user', 'google')
	_output_sample_sites(diff_e_sample, add_g_dom_fs, add_all + ".g.dom.temp")

	out_expected_sites = merge_observed_sites([original_expected, add_expected + ".temp"])
	out_u_t_sites = merge_observed_sites([original_u_text, add_all + ".u.text.temp"])
	out_u_d_sites = merge_observed_sites([original_u_dom, add_all + ".u.dom.temp"])
	out_g_t_sites = merge_observed_sites([original_g_text, add_all +
		".g.text.temp"])
	out_g_d_sites = merge_observed_sites([original_g_dom, add_all +
		".g.dom.temp"])

	# The merged text/dom outputs take the config of their original input.
	out_u_t_sites.config.CopyFrom(in_u_t.config)
	out_u_d_sites.config.CopyFrom(in_u_d.config)
	out_g_t_sites.config.CopyFrom(in_g_t.config)
	out_g_d_sites.config.CopyFrom(in_g_d.config)

	write_proto_to_file(out_expected_sites, out_expected)
	write_proto_to_file(out_u_t_sites, out_u_text)
	write_proto_to_file(out_u_d_sites, out_u_dom)
	write_proto_to_file(out_g_t_sites, out_g_text)
	write_proto_to_file(out_g_d_sites, out_g_dom)