Example #1
0
	def _calc_statistics_(pvals,exp_quantiles,exp_median=0.5,exp_pvals=None):
		"""
		Summarise one set of p-values against an expected distribution.

		pvals -- the p-values to summarise.
		exp_quantiles -- expected quantiles, compared against the 1000
			quantiles of pvals.
		exp_median -- expected median, handed to analyzePhenotype._calcMedian_.
		exp_pvals -- expected p-values, handed to the KS and log-slope helpers.

		Returns the tuple (m, a, ks_stat, ks_pvalue, s): the median statistic,
		the area between the observed and expected quantile curves, the "D"
		and "p.value" entries of the KS result, and the log-slope estimate
		minus 1.0.
		"""
		median_stat = analyzePhenotype._calcMedian_(pvals, exp_median)
		ks_result = analyzePhenotype._calcKS_(pvals, exp_pvals)
		slope_stat = analyzePhenotype._estLogSlope_(pvals, exp_pvals) - 1.0
		d_stat, d_pvalue = ks_result["D"], ks_result["p.value"]
		#The expected quantiles are supplied by the caller, only the observed
		#ones are calculated here.
		obs_quantiles = analyzePhenotype._getQuantiles_(pvals, 1000)
		area_stat = analyzePhenotype._estAreaBetweenCurves_(obs_quantiles, exp_quantiles)

		return (median_stat, area_stat, d_stat, d_pvalue, slope_stat)
Example #2
0
	def _calc_statistics_(pvals,exp_quantiles,exp_median=0.5,exp_pvals=None):
		"""
		Calculate the (m, a, ks_stat, ks_pvalue, s) statistics of pvals.

		m is the result of analyzePhenotype._calcMedian_(pvals, exp_median),
		a the area between the 1000 quantiles of pvals and exp_quantiles,
		ks_stat and ks_pvalue the "D" and "p.value" entries returned by
		analyzePhenotype._calcKS_(pvals, exp_pvals), and s the value of
		analyzePhenotype._estLogSlope_(pvals, exp_pvals) minus 1.0.
		"""
		med = analyzePhenotype._calcMedian_(pvals, exp_median)
		ks = analyzePhenotype._calcKS_(pvals, exp_pvals)
		slope = analyzePhenotype._estLogSlope_(pvals, exp_pvals) - 1.0
		ks_d = ks["D"]
		ks_p = ks["p.value"]
		#Observed quantiles; the expected ones come in as an argument.
		observed = analyzePhenotype._getQuantiles_(pvals, 1000)
		area = analyzePhenotype._estAreaBetweenCurves_(observed, exp_quantiles)

		return (med, area, ks_d, ks_p, slope)
Example #3
0
def _perm_test_(all_snps,phenVals,numPerm,outputFile,filter=0.1,test_type = "KW",savePermutations=False,useSameSnps=False):
	"""
	Permutation test on the p-value distribution of an association scan.

	The association test is run once on the phenotype values as given and
	then numPerm times after shuffling them.  The p-values of all the
	permutations are pooled to obtain the expected quantiles and the expected
	median.  Four statistics (median, area, KS and slope) are calculated for
	the observed scan and for every permutation, and an empirical p-value is
	estimated for each statistic from the permuted values.

	Arguments:
	all_snps -- sequence of SNPs (handed to random.sample when filter < 1.0).
	phenVals -- mutable sequence of phenotype values.  NOTE: it is shuffled
		in place with random.shuffle, so the caller's object is left permuted.
	numPerm -- number of permutations, must be at least 1.
	outputFile -- prefix of the names of all the files written.
	filter -- fraction of all_snps to use.  If < 1.0 a fresh random subset is
		drawn for the observed scan and again for every permutation,
		otherwise all SNPs are used.
	test_type -- "KW" (util.kruskal_wallis) or "Fisher" (run_fet).
	savePermutations -- if true, the permuted p-values are written to
		outputFile+".perm.pvals", one comma separated line per permutation.
	useSameSnps -- currently not used by this function.

	Files written:
	outputFile+".perm.stat.txt" -- the statistics of every permutation, the
		observed statistics and the estimated p-values.
	outputFile+".perm.stat.txt.perm.m.png" (and .a.png, .ks.png, .s.png) --
		histograms of the permuted and observed statistics.  The plot names
		are built on the name of the statistics file.

	Raises ValueError if test_type is not supported or if numPerm < 1.
	"""
	#Fail early, before any (slow) test is run or any file is written.
	if test_type not in ("KW","Fisher"):
		raise ValueError('Unsupported test_type: %r (expected "KW" or "Fisher")' % (test_type,))
	if numPerm < 1:
		raise ValueError("numPerm must be at least 1, got %r" % (numPerm,))

	def _calc_statistics_(pvals,exp_quantiles,exp_median=0.5,exp_pvals=None):
		#Returns the (median, area, KS D, KS p-value, slope-1) statistics of pvals.
		m = analyzePhenotype._calcMedian_(pvals,exp_median)
		ks_res = analyzePhenotype._calcKS_(pvals,exp_pvals)
		s = analyzePhenotype._estLogSlope_(pvals,exp_pvals)-1.0
		ks_stat = ks_res["D"]
		ks_pvalue = ks_res["p.value"]
		quantiles = analyzePhenotype._getQuantiles_(pvals, 1000)
		a = analyzePhenotype._estAreaBetweenCurves_(quantiles,exp_quantiles)
		return (m,a,ks_stat,ks_pvalue,s)

	def _sample_snps_():
		#Draws a random subset holding the fraction 'filter' of all the SNPs.
		sampled = random.sample(all_snps,int(len(all_snps)*filter))
		print("Number of SNPs: %s" % len(sampled))
		return sampled

	def _run_test_(snps):
		#Runs the chosen test on the current (possibly shuffled) phenotype values and times it.
		if test_type=="KW":
			print("running KW")
			t1 = time.time()
			res = util.kruskal_wallis(snps,phenVals)["ps"]
		else:
			print("running Fisher's exact test")
			t1 = time.time()
			res = run_fet(snps,phenVals)
		t2 = time.time()
		print("Took %s seconds." % (t2-t1))
		return res

	def _frac_at_least_(true_val,perm_vals):
		#Fraction of the permuted values that are at least as large as the observed one.
		frac = 0.0
		for perm_val in perm_vals:
			if true_val <= perm_val:
				frac += 1.0/numPerm
		return frac

	if filter <1.0:
		snps = _sample_snps_()
	else:
		snps = all_snps

	#P-values of the observed data.
	true_pvals = _run_test_(snps)

	#P-values of the permuted data.
	perm_pvalues_list = []
	for i in range(0,numPerm):
		if filter <1.0:
			snps = _sample_snps_()
		print(i)
		random.shuffle(phenVals) #Permute phenotype (in place)
		perm_pvalues_list.append(_run_test_(snps))

	#The pooled permuted p-values define the expected distribution.
	print("Combining p-values")
	all_pvals = []
	for pvals in perm_pvalues_list:
		all_pvals.extend(pvals)
	print("%s permuted pvals in all" % len(all_pvals))
	quantiles = analyzePhenotype._getQuantiles_(all_pvals, 1000)
	print("len(quantiles): %s" % len(quantiles))
	exp_median = (quantiles[499]+quantiles[500])/2.0

	(true_m,true_a,true_ks_stat,true_ks_pvalue,true_s) = _calc_statistics_(true_pvals,quantiles,exp_median,all_pvals)

	m_list = []
	a_list = []
	ks_stat_list = []
	s_list = []
	for pvals in perm_pvalues_list:
		(m,a,ks_stat,ks_pvalue,s) = _calc_statistics_(pvals,quantiles,exp_median,all_pvals)
		m_list.append(m)
		a_list.append(a)
		s_list.append(s)
		ks_stat_list.append(ks_stat)

	#The pooled p-values can be large, release them before writing and plotting.
	del all_pvals,quantiles

	if savePermutations:
		permOutputFile = outputFile+".perm.pvals"
		print("Writing to %s" % permOutputFile)
		f = open(permOutputFile,"w")
		try:
			for pvals in perm_pvalues_list:
				f.write(",".join(map(str,pvals))+"\n")
		finally:
			f.close()
		print("Done writing to %s" % permOutputFile)

	#Output results
	statFile = outputFile+".perm.stat.txt"
	f = open(statFile,"w")
	try:
		f.write("Perm_nr, median, area, ks_stat, s_stat \n")
		for i in range(0,numPerm):
			str_l = map(str,[i, m_list[i],a_list[i],ks_stat_list[i],s_list[i]])
			f.write(", ".join(str_l)+"\n")

		f.write("\n"+"Observed values: "+str((true_m,true_a,true_ks_stat,true_s))+"\n")

		pvals = [
			#M stat: absolute values are compared (two sided, assuming symm. dist.)
			_frac_at_least_(abs(true_m),[abs(m) for m in m_list]),
			#A stat (one tailed)
			_frac_at_least_(true_a,a_list),
			#KS stat (one tailed)
			_frac_at_least_(true_ks_stat,ks_stat_list),
			#S stat: absolute log of (s+1) is compared
			_frac_at_least_(abs(math.log(true_s+1.0)),[abs(math.log(s+1.0)) for s in s_list]),
		]

		#An estimate of exactly zero is replaced by half the resolution of the test.
		for i in range(0,len(pvals)):
			if pvals[i] == 0.0:
				pvals[i] = 0.5*(1.0/numPerm)

		f.write("\n"+"Estimated p-values: "+",".join(map(str,pvals))+"\n")
	finally:
		f.close()

	#Plot results
	def _getBinning_(n_bins,min_val,max_val):
		#n_bins+2 edges of width delta, starting half a bin below min_val.
		delta = (max_val-min_val)/n_bins
		start_val = min_val-delta*0.5
		bins = [start_val+delta*i for i in range(0,n_bins+2)]
		return (bins,delta)

	n_bins = 20+int(4*(math.log(numPerm)))
	#The backend must be chosen before pyplot is imported.
	import matplotlib
	matplotlib.use('Agg')
	import matplotlib.pyplot as plt

	def _plot_hist_(perm_vals,true_val,pngFile):
		#Histogram of the permuted values with the observed value drawn on top.
		min_val = min(min(perm_vals),true_val)
		max_val = max(max(perm_vals),true_val)
		(bins,delta) = _getBinning_(n_bins,min_val,max_val)
		print(str((bins,delta)))
		fig = plt.figure(figsize=(10,7))
		try:
			plt.hist(perm_vals+[true_val], bins = bins)
			plt.hist([true_val], bins = bins)
			#No legend is drawn, the histograms carry no labels.
			plt.savefig(pngFile, format = "png")
			plt.clf()
		finally:
			#Close the figure so that repeated calls do not accumulate open figures.
			plt.close(fig)

	#The plot names are built on the name of the statistics file.
	_plot_hist_(m_list,true_m,statFile+".perm.m.png")
	_plot_hist_(a_list,true_a,statFile+".perm.a.png")
	_plot_hist_(ks_stat_list,true_ks_stat,statFile+".perm.ks.png")
	_plot_hist_(s_list,true_s,statFile+".perm.s.png")
Example #4
0
def _perm_test_(all_snps,phenVals,numPerm,outputFile,filter=0.1,test_type = "KW",savePermutations=False,useSameSnps=False):
	"""
	Permutation test on the p-value distribution of an association scan.

	The association test is run once on the phenotype values as given and
	then numPerm times after shuffling them.  The p-values of all the
	permutations are pooled to obtain the expected quantiles and the expected
	median.  Four statistics (median, area, KS and slope) are calculated for
	the observed scan and for every permutation, and an empirical p-value is
	estimated for each statistic from the permuted values.

	Arguments:
	all_snps -- sequence of SNPs (handed to random.sample when filter < 1.0).
	phenVals -- mutable sequence of phenotype values.  NOTE: it is shuffled
		in place with random.shuffle, so the caller's object is left permuted.
	numPerm -- number of permutations, must be at least 1.
	outputFile -- prefix of the names of all the files written.
	filter -- fraction of all_snps to use.  If < 1.0 a fresh random subset is
		drawn for the observed scan and again for every permutation,
		otherwise all SNPs are used.
	test_type -- "KW" (util.kruskal_wallis) or "Fisher" (run_fet).
	savePermutations -- if true, the permuted p-values are written to
		outputFile+".perm.pvals", one comma separated line per permutation.
		The permutation file is opened and closed only in this case.
	useSameSnps -- currently not used by this function.

	Files written:
	outputFile+".perm.stat.txt" -- the statistics of every permutation, the
		observed statistics and the estimated p-values.
	outputFile+".perm.stat.txt.perm.m.png" (and .a.png, .ks.png, .s.png) --
		histograms of the permuted and observed statistics.  The plot names
		are built on the name of the statistics file.

	Raises ValueError if test_type is not supported or if numPerm < 1.
	"""
	#Fail early, before any (slow) test is run or any file is written.
	if test_type not in ("KW","Fisher"):
		raise ValueError('Unsupported test_type: %r (expected "KW" or "Fisher")' % (test_type,))
	if numPerm < 1:
		raise ValueError("numPerm must be at least 1, got %r" % (numPerm,))

	def _calc_statistics_(pvals,exp_quantiles,exp_median=0.5,exp_pvals=None):
		#Returns the (median, area, KS D, KS p-value, slope-1) statistics of pvals.
		m = analyzePhenotype._calcMedian_(pvals,exp_median)
		ks_res = analyzePhenotype._calcKS_(pvals,exp_pvals)
		s = analyzePhenotype._estLogSlope_(pvals,exp_pvals)-1.0
		ks_stat = ks_res["D"]
		ks_pvalue = ks_res["p.value"]
		quantiles = analyzePhenotype._getQuantiles_(pvals, 1000)
		a = analyzePhenotype._estAreaBetweenCurves_(quantiles,exp_quantiles)
		return (m,a,ks_stat,ks_pvalue,s)

	def _sample_snps_():
		#Draws a random subset holding the fraction 'filter' of all the SNPs.
		sampled = random.sample(all_snps,int(len(all_snps)*filter))
		print("Number of SNPs: %s" % len(sampled))
		return sampled

	def _run_test_(snps):
		#Runs the chosen test on the current (possibly shuffled) phenotype values and times it.
		if test_type=="KW":
			print("running KW")
			t1 = time.time()
			res = util.kruskal_wallis(snps,phenVals)["ps"]
		else:
			print("running Fisher's exact test")
			t1 = time.time()
			res = run_fet(snps,phenVals)
		t2 = time.time()
		print("Took %s seconds." % (t2-t1))
		return res

	def _frac_at_least_(true_val,perm_vals):
		#Fraction of the permuted values that are at least as large as the observed one.
		frac = 0.0
		for perm_val in perm_vals:
			if true_val <= perm_val:
				frac += 1.0/numPerm
		return frac

	if filter <1.0:
		snps = _sample_snps_()
	else:
		snps = all_snps

	#P-values of the observed data.
	true_pvals = _run_test_(snps)

	#P-values of the permuted data.
	perm_pvalues_list = []
	for i in range(0,numPerm):
		if filter <1.0:
			snps = _sample_snps_()
		print(i)
		random.shuffle(phenVals) #Permute phenotype (in place)
		perm_pvalues_list.append(_run_test_(snps))

	#The pooled permuted p-values define the expected distribution.
	print("Combining p-values")
	all_pvals = []
	for pvals in perm_pvalues_list:
		all_pvals.extend(pvals)
	print("%s permuted pvals in all" % len(all_pvals))
	quantiles = analyzePhenotype._getQuantiles_(all_pvals, 1000)
	print("len(quantiles): %s" % len(quantiles))
	exp_median = (quantiles[499]+quantiles[500])/2.0

	(true_m,true_a,true_ks_stat,true_ks_pvalue,true_s) = _calc_statistics_(true_pvals,quantiles,exp_median,all_pvals)

	m_list = []
	a_list = []
	ks_stat_list = []
	s_list = []
	for pvals in perm_pvalues_list:
		(m,a,ks_stat,ks_pvalue,s) = _calc_statistics_(pvals,quantiles,exp_median,all_pvals)
		m_list.append(m)
		a_list.append(a)
		s_list.append(s)
		ks_stat_list.append(ks_stat)

	#The pooled p-values can be large, release them before writing and plotting.
	del all_pvals,quantiles

	if savePermutations:
		permOutputFile = outputFile+".perm.pvals"
		print("Writing to %s" % permOutputFile)
		f = open(permOutputFile,"w")
		try:
			for pvals in perm_pvalues_list:
				f.write(",".join(map(str,pvals))+"\n")
		finally:
			#Closed here, inside the branch that opened it: no file exists
			#to close when the permutations are not saved.
			f.close()
		print("Done writing to %s" % permOutputFile)

	#Output results
	statFile = outputFile+".perm.stat.txt"
	f = open(statFile,"w")
	try:
		f.write("Perm_nr, median, area, ks_stat, s_stat \n")
		for i in range(0,numPerm):
			str_l = map(str,[i, m_list[i],a_list[i],ks_stat_list[i],s_list[i]])
			f.write(", ".join(str_l)+"\n")

		f.write("\n"+"Observed values: "+str((true_m,true_a,true_ks_stat,true_s))+"\n")

		pvals = [
			#M stat: absolute values are compared (two sided, assuming symm. dist.)
			_frac_at_least_(abs(true_m),[abs(m) for m in m_list]),
			#A stat (one tailed)
			_frac_at_least_(true_a,a_list),
			#KS stat (one tailed)
			_frac_at_least_(true_ks_stat,ks_stat_list),
			#S stat: absolute log of (s+1) is compared
			_frac_at_least_(abs(math.log(true_s+1.0)),[abs(math.log(s+1.0)) for s in s_list]),
		]

		#An estimate of exactly zero is replaced by half the resolution of the test.
		for i in range(0,len(pvals)):
			if pvals[i] == 0.0:
				pvals[i] = 0.5*(1.0/numPerm)

		f.write("\n"+"Estimated p-values: "+",".join(map(str,pvals))+"\n")
	finally:
		f.close()

	#Plot results
	def _getBinning_(n_bins,min_val,max_val):
		#n_bins+2 edges of width delta, starting half a bin below min_val.
		delta = (max_val-min_val)/n_bins
		start_val = min_val-delta*0.5
		bins = [start_val+delta*i for i in range(0,n_bins+2)]
		return (bins,delta)

	n_bins = 20+int(4*(math.log(numPerm)))
	#The backend must be chosen before pyplot is imported.
	import matplotlib
	matplotlib.use('Agg')
	import matplotlib.pyplot as plt

	def _plot_hist_(perm_vals,true_val,pngFile):
		#Histogram of the permuted values with the observed value drawn on top.
		min_val = min(min(perm_vals),true_val)
		max_val = max(max(perm_vals),true_val)
		(bins,delta) = _getBinning_(n_bins,min_val,max_val)
		print(str((bins,delta)))
		fig = plt.figure(figsize=(10,7))
		try:
			plt.hist(perm_vals+[true_val], bins = bins)
			plt.hist([true_val], bins = bins)
			#No legend is drawn, the histograms carry no labels.
			plt.savefig(pngFile, format = "png")
			plt.clf()
		finally:
			#Close the figure so that repeated calls do not accumulate open figures.
			plt.close(fig)

	#The plot names are built on the name of the statistics file.
	_plot_hist_(m_list,true_m,statFile+".perm.m.png")
	_plot_hist_(a_list,true_a,statFile+".perm.a.png")
	_plot_hist_(ks_stat_list,true_ks_stat,statFile+".perm.ks.png")
	_plot_hist_(s_list,true_s,statFile+".perm.s.png")