def plot_ROC(to_detect, to_learn, expected, noise, outfile, coefficient=None, radius=None):
	"""Sweep the test diameter and append k-fold mean (TPR, FPR) points to outfile.

	For every integer radius r in [radius[0], radius[1]) the Estimator is scored
	on each stratified test fold and two lines are appended to outfile: the
	parameter triple "train_inconsistent,test_inconsistent,test_diameter" and
	the fold-averaged "tpr,fpr" pair.

	Args:
		to_detect: filename of a serialized CD.ObservedSites proto to test.
		to_learn: filename of the CD.ObservedSites proto to learn from.
		expected: filename of the ground-truth (cloaking) CD.ObservedSites.
		noise: filename of noisy sites removed from both detect and learn sets.
		outfile: results file, opened in append mode.
		coefficient: pair (train_inconsistent, test_inconsistent); must not be
			None despite the default — it is indexed unconditionally.
		radius: pair (start, end) bounding the test_diameter sweep; must not be
			None despite the default.
	"""
	to_learn_sites = CD.ObservedSites()
	read_proto_from_file(to_learn_sites, to_learn)
	expected_sites = CD.ObservedSites()
	read_proto_from_file(expected_sites, expected)
	print("number of expected_sites")
	print(len(expected_sites.site))
	to_detect_sites = CD.ObservedSites()
	read_proto_from_file(to_detect_sites, to_detect)
	noise_sites = CD.ObservedSites()
	read_proto_from_file(noise_sites, noise)
	filt_to_detect_sites = remove_noise(to_detect_sites, noise_sites)
	filt_to_learn_sites = remove_noise(to_learn_sites, noise_sites)
	expected_set = sites_name_set(expected_sites)

	# Stratified K-fold over site names; label 1 marks an expected cloaking site.
	kfold = 5
	total_site_name_list = list()
	to_detect_sites_map = dict()
	for site in filt_to_detect_sites.site:
		total_site_name_list.append(site.name)
		to_detect_sites_map[site.name] = site
	to_learn_sites_map = dict()
	for site in filt_to_learn_sites.site:
		to_learn_sites_map[site.name] = site

	# Align detect/learn samples by site name and build the label vector.
	to_detect_sites_list = list()
	to_learn_sites_list = list()
	Y = list()
	for name in total_site_name_list:
		to_detect_sites_list.append(to_detect_sites_map[name])
		to_learn_sites_list.append(to_learn_sites_map[name])
		Y.append(1 if name in expected_set else 0)

	to_detect_sites_list = np.array(to_detect_sites_list)
	to_learn_sites_list = np.array(to_learn_sites_list)
	Y = np.array(Y)
	skf = StratifiedKFold(Y, kfold)

	# Scan the parameters and output the FPR and TPR.
	params = dict()
	params['train_inconsistent'] = coefficient[0]
	params['test_inconsistent'] = coefficient[1]
	outf = open(outfile, 'a')
	try:
		for r in np.arange(radius[0], radius[1], 1):
			# BUG FIX: fold arrays were hard-coded to length 5; tie them to kfold.
			# (np.float is a removed alias of the builtin float.)
			tpr = np.zeros(kfold, dtype=float)
			fpr = np.zeros(kfold, dtype=float)
			params['test_diameter'] = r
			count = 0
			for train, test in skf:
				print("train size: {0}, test size: {1}".format(len(train),
						len(test)))
				# Only the held-out fold is needed here; the estimator is
				# scored directly with fixed params (no fitting on train).
				learn_test = to_learn_sites_list[test]
				detect_test = to_detect_sites_list[test]
				y_test = Y[test]

				estimator = Estimator(filt_to_detect_sites.config)
				estimator.score(learn_test, detect_test, y_test, params)
				tpr[count] = estimator.rate[0]  # true positive rate
				fpr[count] = estimator.rate[1]  # false positive rate
				count += 1

			res_str = ",".join([ str(params["train_inconsistent"]),
					str(params["test_inconsistent"]),
					str(params["test_diameter"]) ]) + "\n"
			res_str += ",".join([ str(tpr.mean()), str(fpr.mean()) ]) + "\n"
			outf.write(res_str)
	finally:
		# BUG FIX: outf was opened but never closed, leaking the handle and
		# risking buffered results being lost.
		outf.close()
def integrated_plot_ROC(text_input, text_train, dom_input, dom_train, expected, noise, outfile,
		c_text_train, c_text_test, c_dom_train, c_dom_test,
		r_text_start, r_text_end, r_dom_start, r_dom_end):
	"""Sweep text and DOM test diameters jointly and append ROC points to a file.

	Runs a two-stage detection: a text Estimator is scored first, then a DOM
	Estimator is scored on the text stage's cloaking candidates. For every
	(text diameter, dom diameter) pair the fold-averaged DOM-stage TPR/FPR is
	appended to "<outfile>_text_r_<r_text_start>" as two lines: the DOM
	parameter triple, then "tpr,fpr".

	Args:
		text_input/text_train: filenames of text-feature CD.ObservedSites
			protos to detect on / learn from.
		dom_input/dom_train: filenames of DOM-feature CD.ObservedSites protos.
		expected: filename of the ground-truth (cloaking) CD.ObservedSites.
		noise: filename of noisy sites removed from all four inputs.
		outfile: prefix of the results file, opened in append mode.
		c_text_train, c_text_test: text train/test inconsistency coefficients.
		c_dom_train, c_dom_test: DOM train/test inconsistency coefficients.
		r_text_start, r_text_end: text test_diameter sweep bounds (step 1).
		r_dom_start, r_dom_end: DOM test_diameter sweep bounds (step 1).
	"""
	text_to_learn_sites = CD.ObservedSites()
	read_proto_from_file(text_to_learn_sites, text_train)
	dom_to_learn_sites = CD.ObservedSites()
	read_proto_from_file(dom_to_learn_sites, dom_train)

	expected_sites = CD.ObservedSites()
	read_proto_from_file(expected_sites, expected)
	print("number of expected_sites")
	print(len(expected_sites.site))
	text_to_detect_sites = CD.ObservedSites()
	read_proto_from_file(text_to_detect_sites, text_input)
	dom_to_detect_sites = CD.ObservedSites()
	read_proto_from_file(dom_to_detect_sites, dom_input)
	noise_sites = CD.ObservedSites()
	read_proto_from_file(noise_sites, noise)
	filt_text_to_detect_sites = remove_noise(text_to_detect_sites, noise_sites)
	filt_text_to_learn_sites = remove_noise(text_to_learn_sites, noise_sites)
	filt_dom_to_detect_sites = remove_noise(dom_to_detect_sites, noise_sites)
	filt_dom_to_learn_sites = remove_noise(dom_to_learn_sites, noise_sites)

	expected_set = sites_name_set(expected_sites)
	# Stratified K-fold over site names; label 1 marks an expected cloaking site.
	kfold = 5
	total_site_name_list = list()
	text_to_detect_sites_map = dict()
	for site in filt_text_to_detect_sites.site:
		total_site_name_list.append(site.name)
		text_to_detect_sites_map[site.name] = site
	text_to_learn_sites_map = dict()
	for site in filt_text_to_learn_sites.site:
		text_to_learn_sites_map[site.name] = site

	dom_to_detect_sites_map = dict()
	for site in filt_dom_to_detect_sites.site:
		dom_to_detect_sites_map[site.name] = site
	dom_to_learn_sites_map = dict()
	for site in filt_dom_to_learn_sites.site:
		dom_to_learn_sites_map[site.name] = site

	# Align all four sample streams by site name and build the label vector.
	text_to_detect_sites_list = list()
	text_to_learn_sites_list = list()
	dom_to_detect_sites_list = list()
	dom_to_learn_sites_list = list()

	Y = list()
	for name in total_site_name_list:
		text_to_detect_sites_list.append(text_to_detect_sites_map[name])
		text_to_learn_sites_list.append(text_to_learn_sites_map[name])
		dom_to_detect_sites_list.append(dom_to_detect_sites_map[name])
		dom_to_learn_sites_list.append(dom_to_learn_sites_map[name])
		Y.append(1 if name in expected_set else 0)

	text_to_detect_sites_list = np.array(text_to_detect_sites_list)
	text_to_learn_sites_list = np.array(text_to_learn_sites_list)
	dom_to_detect_sites_list = np.array(dom_to_detect_sites_list)
	dom_to_learn_sites_list = np.array(dom_to_learn_sites_list)

	Y = np.array(Y)
	skf = StratifiedKFold(Y, kfold)

	# Scan the parameters and output the FPR and TPR.
	text_params = dict()
	text_params['train_inconsistent'] = c_text_train
	text_params['test_inconsistent'] = c_text_test
	dom_params = dict()
	dom_params['train_inconsistent'] = c_dom_train
	dom_params['test_inconsistent'] = c_dom_test

	outf = open(outfile + "_text_r_" + str(r_text_start), 'a')
	try:
		for t_r in np.arange(r_text_start, r_text_end, 1):
			text_params['test_diameter'] = t_r
			for d_r in np.arange(r_dom_start, r_dom_end, 1):
				dom_params['test_diameter'] = d_r
				# BUG FIX: fold arrays were hard-coded to length 5; tie them to
				# kfold. (np.float is a removed alias of the builtin float.)
				tpr = np.zeros(kfold, dtype=float)
				fpr = np.zeros(kfold, dtype=float)
				count = 0
				for train, test in skf:
					print("train size: {0}, test size: {1}".format(len(train),
							len(test)))
					# Only the held-out fold is used; estimators are scored
					# directly with fixed params (no fitting on train).
					text_learn_test = text_to_learn_sites_list[test]
					text_detect_test = text_to_detect_sites_list[test]
					dom_learn_test = dom_to_learn_sites_list[test]
					dom_detect_test = dom_to_detect_sites_list[test]
					y_test = Y[test]
					text_estimator = Estimator(filt_text_to_detect_sites.config)
					text_estimator.score(text_learn_test, text_detect_test, y_test, text_params)

					# Second stage: the DOM estimator only re-examines the
					# sites the text stage flagged as cloaking.
					dom_estimator = Estimator(simhash_config = filt_dom_to_detect_sites.config,
							cloaking_sites = text_estimator.cloaking_sites)
					dom_estimator.score(dom_learn_test, dom_detect_test, y_test, dom_params)
					tpr[count] = dom_estimator.rate[0]  # true positive rate
					fpr[count] = dom_estimator.rate[1]  # false positive rate
					count += 1

				res_str = ",".join([ str(dom_params["train_inconsistent"]),
						str(dom_params["test_inconsistent"]),
						str(dom_params["test_diameter"]) ]) + "\n"
				res_str += ",".join([ str(tpr.mean()), str(fpr.mean()) ]) + "\n"
				outf.write(res_str)
	finally:
		# BUG FIX: outf was opened but never closed, leaking the handle and
		# risking buffered results being lost.
		outf.close()
def cross_validation(to_detect, to_learn, expected, noise, outfile, coefficient=None, radius=None):
	"""Stratified 5-fold cross validation of the cloaking Estimator.

	Loads the detect/learn/expected/noise observation protos, filters noise,
	labels each site by membership in the expected set, then fits an Estimator
	on every training fold and scores it on the held-out fold. Prints the
	per-fold best parameters and scores.

	Args:
		to_detect/to_learn/expected/noise: filenames of serialized
			CD.ObservedSites protos.
		outfile: result file path handed through to the Estimator.
		coefficient, radius: parameter-search settings forwarded to the
			Estimator constructor (indexed there, so in practice required).
	"""
	learn_observed = CD.ObservedSites()
	read_proto_from_file(learn_observed, to_learn)
	truth_observed = CD.ObservedSites()
	read_proto_from_file(truth_observed, expected)
	print("number of expected_sites")
	print(len(truth_observed.site))
	detect_observed = CD.ObservedSites()
	read_proto_from_file(detect_observed, to_detect)
	noisy_observed = CD.ObservedSites()
	read_proto_from_file(noisy_observed, noise)
	detect_clean = remove_noise(detect_observed, noisy_observed)
	learn_clean = remove_noise(learn_observed, noisy_observed)
	truth_names = sites_name_set(truth_observed)

	# Stratified K-fold over site names; label 1 marks an expected site.
	kfold = 5
	names = [site.name for site in detect_clean.site]
	detect_by_name = dict((site.name, site) for site in detect_clean.site)
	learn_by_name = dict((site.name, site) for site in learn_clean.site)

	# Align detect/learn samples by site name and build the label vector.
	detect_samples = np.array([detect_by_name[n] for n in names])
	learn_samples = np.array([learn_by_name[n] for n in names])
	Y = np.array([1 if n in truth_names else 0 for n in names])

	skf = StratifiedKFold(Y, kfold)
	scores = list()
	best_params = list()
	for train_idx, test_idx in skf:
		print("train size: {0}, test size: {1}".format(len(train_idx),
				len(test_idx)))
		# Fit on the training fold, then score on the held-out fold.
		estimator = Estimator(detect_clean.config, outfile, coefficient, radius)
		estimator.fit(learn_samples[train_idx], detect_samples[train_idx], Y[train_idx])
		best_params.append(estimator.best_params)
		scores.append(estimator.score(learn_samples[test_idx],
				detect_samples[test_idx], Y[test_idx]))
		# filt_cloaking_sites = remove_noise(both_detected, noise_sites)
		# cross_val_score(svc, X_digits, y_digits, cv=kfold, n_jobs=-1)
	print("best_parameters")
	print(best_params)
	print(scores)