Example #1
0
def main2():
	"""Draw a box plot from a pre-computed results file.

	Command line:
		sys.argv[1] -- path of the results file to read.
		sys.argv[2] -- value interpolated into the plot title ("#parents").

	File layout, as parsed below:
		* a line starting with "CASE" has the form "CASE <count>" and opens a
		  new group labelled "Size <count>"; a count of 0 stops the parsing;
		* every other non-blank line holds space-separated floats; the first
		  len(boxNames) of them are used, one per box.

	Raises:
		ValueError: a line cannot be parsed as described above.
		IndexError: a data line has fewer values than there are boxes, or a
			command-line argument is missing.
	"""
	boxNames = ["Gauss", "Gauss(f)", "Naive"]
	# The colour list is longer than boxNames; it is passed to make_boxplot()
	# unchanged.
	boxColors = ['darkkhaki', '#aaaa00', 'indianred', 'forestgreen', 'royalblue']

	def new_group():
		# One empty sample list per box.
		return [[] for _ in boxNames]

	data = []
	labels = []
	subdatas = new_group()
	# The context manager closes the file even when parsing raises.
	with open(sys.argv[1]) as f:
		for raw_line in f:
			# Strip only the line terminator: slicing off the last character
			# would eat a digit when the final line has no trailing newline.
			line = raw_line.rstrip("\r\n")
			if not line.strip():
				continue
			if line.startswith("CASE"):
				_, count = line.split(' ')
				# Flush the group collected so far (if any) before starting
				# the next one.
				if subdatas[0]:
					data.extend(subdatas)
				subdatas = new_group()

				if int(count) == 0:
					break
				labels.append("Size %s" % (count))
			else:
				results = [float(field) for field in line.split(' ')]
				for j, samples in enumerate(subdatas):
					samples.append(results[j])

	# A file that ends without the "CASE 0" terminator still has a pending
	# group whose label was already recorded; keep data and labels in step.
	# After a "CASE 0" break the pending group is empty and nothing happens.
	if subdatas[0]:
		data.extend(subdatas)

	make_boxplot(data, labels, boxColors, boxNames, title="Weighted Euclidian Distance (as defined for CPT) for #parents = %s"%(sys.argv[2]))
def main():
	"""Compare four Noisy-MAX learners on generated data and box-plot the errors.

	For every dataset size in `nums` the experiment is repeated `reps` times:
	the external generator writes a dataset sampled from `network` into
	`dataset_file`, then each learner (GJ, GJFit, Smile, naive) is run on that
	file as a shell command.  Each learner's printed output is parsed into a
	dict and compared with the reference parameters using `dist_func`.  The
	distances are collected per learner and handed to make_boxplot().

	The per-repetition generator seeds come from a random.Random seeded with
	`network_generator_seed`, so a run is repeatable for a given seed.

	NOTE(review): SOURCE contains a second function named `main` after this
	one; if both live in the same module the later definition replaces this
	one -- confirm which variant is meant to run.
	"""
	def eucl_dist(A, B):
		# Euclidean distance between two equal-length sequences whose entries
		# must all lie in [0, 1].
		assert not any(((a < 0.0 or a > 1.0) for a in A)), "Error in set A: %s"%(A)
		assert not any(((b < 0.0 or b > 1.0) for b in B)), "Error in set B: %s"%(B)
		assert len(A) == len(B), "sets differ in size %d vs %d" % (len(A), len(B))

		return sqrt(sum((A[i] - B[i])**2 for i in range(len(A))))

	def kl_dist(P, Q):
		# Sum of P[i] * ln(P[i] / Q[i]).  Entries of Q below 10e-7 (i.e. 1e-6)
		# are replaced by that floor so the division cannot hit zero.
		assert len(P) == len(Q)
		suma = 0.0
		for i in range(len(P)):
			q = Q[i]
			if Q[i] < 10e-7:
				q = 10e-7
			suma += P[i]*log((P[i]/q),e)
		return suma

	def hellinger_dist(P, Q):
		# sqrt(sum((sqrt(P[i]) - sqrt(Q[i]))**2)) / sqrt(2)
		assert len(P) == len(Q)
		assert all((Q[i]>=0 for i in range(len(Q)))), "%s"%(Q)
		suma = sum(( (sqrt(P[i]) - sqrt(Q[i]))**2 for i in range(len(P)) ))
		return sqrt(suma) * 0.7071067811865475  # * 1./sqrt(2)

	# Distance used for the plot; swap in hellinger_dist or kl_dist to change it.
	#dist_func = hellinger_dist
	dist_func = eucl_dist

	cpp_generator = "./NoisyMAXSmile/generator"
	cpp_learner = "./NoisyMAXSmile/smile_learner"
	py_learner = "./noisyMAX.py"
	naive_learner = "./naiveMAX.py"

	def exec_command(command):
		# Run `command` through the shell and return everything it wrote to
		# stdout, minus the final character (the trailing newline).
		p = os.popen(command, "r")
		try:
			output = p.read()
		finally:
			# Close explicitly so the pipe is released right away instead of
			# whenever the object happens to be collected.
			p.close()
		return output[:-1]

	def makedict(text):
		# Parse learner output into {case: value}.  The first line is skipped;
		# on every other line the last space-separated field is the value and
		# the remaining fields, re-joined with spaces, form the key.
		r_dict = {}
		for line in text.split('\n')[1:]:
			cut = line.split(' ')
			val = float(cut[-1])
			case = " ".join(cut[:-1])
			r_dict[case] = val
		return r_dict

	def distance(A, B, dist_f):
		# Compare only the entries whose key ends with "True"; every such key
		# of A must also be present in B (KeyError otherwise).
		valA = []
		valB = []
		for key, value in A.items():
			if key.endswith("True"):
				valA.append(value)
				valB.append(B[key])
		return dist_f(valA, valB)

	# Previous configuration (CancerOR network, 5 nodes):
	#real_or = [ [0.61, 0, 0.25, 0, 0.15, 0, 0.04, 0, 0.01,],
	#		[0.39, 1, 0.75, 0, 0.85, 0, 0.96, 0, 0.99,], ]
	#real_or_dim = 5
	#network = "./src/CancerOR.xdsl"
	#network_cpt = "./src/CancerOR_CPT.xdsl"

	# Reference parameters the learners are scored against.
	real_or = [ [0.61, 0, 0.25, 0, 0.15, 0, 0.04, 0, 0.35, 0, 0.89, 0, 0.01,],
			[0.39, 1, 0.75, 1, 0.85, 1, 0.96, 1, 0.65, 1, 0.11, 1, 0.99,], ]
	real_or_dim = 7
	network = "./src/OR_6.xdsl"
	network_cpt = "./src/OR_6_CPT.xdsl"

	print(real_or)
	parent_dims = [2]*(real_or_dim - 1)
	# Node names handed to CPT: P1..P6 followed by C1.
	node_labels = ["P%d"%(k) for k in range(1, real_or_dim)] + ['C1',]
	real_or_cpt = CPT(real_or, parent_dims=parent_dims, network_type=CPT.TYPE_NOISY_MAX, labels=node_labels, states = [["True","False"]]*real_or_dim).max_based()
	real_out = makedict(real_or_cpt.print_raw())

	# Dataset sizes (number of generated records), one box group per size.
	#nums = range(800,5001,500)
	#nums = [100, 1000, 10000, 100000]
	#nums = [2000, 2500]
	#nums = [500, 1000 , 1500, 2000, 2500]
	nums = [1000, 3000, 10000, 50000, 100000]
	labels = [ "%d records"%(num, )  for num in nums]

	reps = 100
	# Scratch file: the generator overwrites it before each round of learners.
	dataset_file = "tmp/tmp_data.txt"

	# Seeds used for earlier runs, kept as a record:
	#network_generator_seed = 3333  # used for 1k-100k GJ vs naive
	#network_generator_seed = 123456  # used for 500 - 2500 GJ vs naive
	#network_generator_seed = 44444  # used for 500 - 2500 GJ vs smile
	#network_generator_seed = 444445  # used for 1k -100k GJ vs GJFit
	#network_generator_seed = 5555  # used for 500-2500 all
	#network_generator_seed = 3355  # used for 1k-100k all
	#network_generator_seed = 3357  # used for 1k-100k all
	network_generator_seed = 123412333

	rnd = random.Random()
	rnd.seed(network_generator_seed)
	data = []
	for i in nums:
		subdata_gj = []
		subdata_gjf = []
		subdata_genie = []
		subdata_naive = []
		for t in range(reps):
			seed = rnd.randint(1,10**6)
			gen_command = "%s %d %s %d > %s" % (cpp_generator, i, network, seed, dataset_file)
			learn_command_genie = "%s %s %s Smile" %(cpp_learner, dataset_file, network_cpt)
			learn_command_GJ = "python %s %s GJ" %(py_learner, dataset_file)
			learn_command_GJFit = "python %s %s GJFit" %(py_learner, dataset_file)
			learn_command_naive = "python %s %s" %(naive_learner, dataset_file)

			# Single-argument print(...) produces the same text on Python 2 and 3.
			print("s = %d, t = %d" % (i, t))
			# The generator must finish before any learner reads dataset_file.
			exec_command(gen_command)

			GJ_out = makedict(exec_command(learn_command_GJ))
			d_r_gj = distance(real_out, GJ_out, dist_func)
			print("GJ %s" % (d_r_gj,))

			GJFit_out = makedict(exec_command(learn_command_GJFit))
			d_r_gjf = distance(real_out, GJFit_out, dist_func)
			print("GJFit %s" % (d_r_gjf,))

			print(learn_command_genie)
			command_out = exec_command(learn_command_genie)
			genie_out = makedict(command_out)
			d_r_g = distance(real_out, genie_out, dist_func)
			print("Genie %s" % (d_r_g,))

			naive_out = makedict(exec_command(learn_command_naive))
			d_r_naive = distance(real_out, naive_out, dist_func)
			print("Naive %s" % (d_r_naive,))

			subdata_gj.append(d_r_gj)
			subdata_gjf.append(d_r_gjf)
			subdata_genie.append(d_r_g)
			subdata_naive.append(d_r_naive)

		# Per size, the four series are appended in the same order as boxNames.
		data.append(subdata_gj)
		data.append(subdata_gjf)
		data.append(subdata_genie)
		data.append(subdata_naive)

	boxNames = ["Gauss-Jordan", "Gauss-Jordan (confidence variant)", "Genie", "Naive"]
	boxColors = ['darkkhaki', 'indianred', 'forestgreen', 'royalblue' ]

	make_boxplot(data, labels, boxColors, boxNames, title="Euclidian distance for network %s"%(network,))
def main():
	"""Compare four Noisy-MAX learners on generated data and box-plot the errors.

	For every dataset size in `nums` the experiment is repeated `reps` times:
	the external generator writes a dataset sampled from `network` into
	`dataset_file`, then each learner (GJ, GJFit, Smile, naive) is run on that
	file as a shell command.  Each learner's printed output is parsed into a
	dict and compared with the reference parameters (built from `real_or` via
	max_based2()) using `dist_func`.  The distances are collected per learner
	and handed to make_boxplot().

	The per-repetition generator seeds come from a random.Random seeded with
	`network_generator_seed`, so a run is repeatable for a given seed.

	NOTE(review): SOURCE contains an earlier function that is also named
	`main`; if both live in the same module this later definition is the one
	that stays bound -- confirm that is intended.
	"""
	def eucl_dist(A, B):
		# Euclidean distance between two equal-length sequences.  The length
		# check mirrors the sibling distance helpers; without it a shorter B
		# would surface as a bare IndexError.
		assert len(A) == len(B), "sets differ in size %d vs %d" % (len(A), len(B))
		return sqrt(sum((A[i] - B[i])**2 for i in range(len(A)) ))

	def kl_dist(P, Q):
		# Sum of P[i] * ln(P[i] / Q[i]).  Entries of Q below 10e-7 (i.e. 1e-6)
		# are replaced by that floor so the division cannot hit zero.
		assert len(P) == len(Q)
		suma = 0.0
		for i in range(len(P)):
			q = Q[i]
			if Q[i] < 10e-7:
				q = 10e-7
			suma += P[i]*log((P[i]/q),e)
		return suma

	def hellinger_dist(P, Q):
		# sqrt(sum((sqrt(P[i]) - sqrt(Q[i]))**2)) / sqrt(2)
		assert len(P) == len(Q)
		assert all((Q[i]>=0 for i in range(len(Q)))), "%s"%(Q)
		suma = sum(( (sqrt(P[i]) - sqrt(Q[i]))**2 for i in range(len(P)) ))
		return sqrt(suma) * 0.7071067811865475  # * 1./sqrt(2)

	# Reference parameters the learners are scored against.
	real_or = (0.61, 0.25, 0.15, 0.04, 0.01,)

	# Distance used for the plot; swap in hellinger_dist or kl_dist to change it.
	#dist_func = hellinger_dist
	dist_func = eucl_dist

	cpp_generator = "./NoisyMAXSmile/generator"
	cpp_learner = "./NoisyMAXSmile/smile_learner"
	py_learner = "./noisyMAX.py"
	naive_learner = "./naiveMAX.py"
	#network = "./src/CancerOR.xdsl"
	#network_cpt = "./src/CancerOR_CPT.xdsl"

	network = "./src/CancerOR2.xdsl"
	network_cpt = "./src/CancerOR2_CPT.xdsl"

	def exec_command(command):
		# Run `command` through the shell and return everything it wrote to
		# stdout, minus the final character (the trailing newline).
		p = os.popen(command, "r")
		try:
			output = p.read()
		finally:
			# Close explicitly so the pipe is released right away instead of
			# whenever the object happens to be collected.
			p.close()
		return output[:-1]

	def makedict(text):
		# Parse learner output into {case: value}: on every line the last
		# space-separated field is the value and the remaining fields,
		# re-joined with spaces, form the key.
		r_dict = {}
		for line in text.split('\n'):
			cut = line.split(' ')
			val = float(cut[-1])
			case = " ".join(cut[:-1])
			r_dict[case] = val
		return r_dict

	def distance(A, B, dist_f):
		# Compare only the entries whose key ends with "True"; every such key
		# of A must also be present in B (KeyError otherwise).
		valA = []
		valB = []
		for key, value in A.items():
			if key.endswith("True"):
				valA.append(value)
				valB.append(B[key])
		return dist_f(valA, valB)

	real_out = makedict(max_based2(real_or))

	# Dataset sizes (number of generated records), one box group per size.
	#nums = range(800,5001,500)
	#nums = [500, 1000 , 1500, 2000, 2500]
	nums = [1000, 3000, 10000, 50000, 100000]
	labels = [ "%d records"%(num, )  for num in nums]

	reps = 100
	# Scratch file: the generator overwrites it before each round of learners.
	dataset_file = "tmp/tmp_data.txt"

	# Seeds used for earlier runs, kept as a record:
	#network_generator_seed = 3333  # used for 1k-100k GJ vs naive
	#network_generator_seed = 123456  # used for 500 - 2500 GJ vs naive
	#network_generator_seed = 44444  # used for 500 - 2500 GJ vs smile
	#network_generator_seed = 444445  # used for 1k -100k GJ vs GJFit
	#network_generator_seed = 5555  # used for 500-2500 all
	network_generator_seed = 3355  # used for 1k-100k all

	rnd = random.Random()
	rnd.seed(network_generator_seed)
	data = []
	for i in nums:
		subdata_gj = []
		subdata_gjf = []
		subdata_smile = []
		subdata_naive = []
		for t in range(reps):
			seed = rnd.randint(1,10**6)
			gen_command = "%s %d %s %d > %s" % (cpp_generator, i, network, seed, dataset_file)
			learn_command_smile = "%s %s %s Smile" %(cpp_learner, dataset_file, network_cpt)
			learn_command_GJ = "python %s %s GJ" %(py_learner, dataset_file)
			learn_command_GJFit = "python %s %s GJFit" %(py_learner, dataset_file)
			learn_command_naive = "python %s %s" %(naive_learner, dataset_file)

			# Single-argument print(...) produces the same text on Python 2 and 3.
			print("s = %d, t = %d" % (i, t))
			# The generator must finish before any learner reads dataset_file.
			exec_command(gen_command)

			GJ_out = makedict(exec_command(learn_command_GJ))
			d_r_gj = distance(real_out, GJ_out, dist_func)
			print("GJ %s" % (d_r_gj,))

			GJFit_out = makedict(exec_command(learn_command_GJFit))
			d_r_gjf = distance(real_out, GJFit_out, dist_func)
			print("GJFit %s" % (d_r_gjf,))

			smile_out = makedict(exec_command(learn_command_smile))
			d_r_g = distance(real_out, smile_out, dist_func)
			print("SMILE %s" % (d_r_g,))

			naive_out = makedict(exec_command(learn_command_naive))
			d_r_naive = distance(real_out, naive_out, dist_func)
			print("Naive %s" % (d_r_naive,))

			subdata_gj.append(d_r_gj)
			subdata_gjf.append(d_r_gjf)
			subdata_smile.append(d_r_g)
			subdata_naive.append(d_r_naive)

		# Per size, the four series are appended in the same order as boxNames.
		data.append(subdata_gj)
		data.append(subdata_gjf)
		data.append(subdata_smile)
		data.append(subdata_naive)

	boxNames = ["Gauss-Jordan", "Gauss-Jordan (confidence variant)", "SMILE", "Naive"]
	boxColors = ['darkkhaki', 'indianred', 'forestgreen', 'royalblue' ]

	make_boxplot(data, labels, boxColors, boxNames )