Example #1
 def test_small(self):
     x = [1,2,3,3,4]
     y = [3,2,6,1,6,1,4,1]
     with warnings.catch_warnings(record=True):  # Ties preclude use ...
         W, pval = stats.ansari(x,y)
     assert_almost_equal(W,23.5,11)
     assert_almost_equal(pval,0.13499256881897437,11)
Example #2
 def test_result_attributes(self):
     x = [1, 2, 3, 3, 4]
     y = [3, 2, 6, 1, 6, 1, 4, 1]
     with warnings.catch_warnings(record=True):  # Ties preclude use ...
         res = stats.ansari(x, y)
     attributes = ('statistic', 'pvalue')
     check_named_results(res, attributes)
Example #3
 def test_small(self):
     x = [1, 2, 3, 3, 4]
     y = [3, 2, 6, 1, 6, 1, 4, 1]
     with warnings.catch_warnings(record=True):  # Ties preclude use ...
         W, pval = stats.ansari(x, y)
     assert_almost_equal(W, 23.5, 11)
     assert_almost_equal(pval, 0.13499256881897437, 11)
Example #4
def nonparametric_check_for_d_similarity(df1, df2, alpha=0.01):
    common_features = set(df1.columns) & set(df2.columns)
    features_stats = []
    for col in common_features:
        # H0=same central parameter
        delta_test, delta_pvalue = stats.mannwhitneyu(df1[col], df2[col])
        if delta_pvalue > alpha:
            delta = 'Same central parameter'
        else:
            delta = 'Different central parameter'
        # H0=equality of the scale parameters
        scale1_test, scale1_pval = stats.ansari(df1[col], df2[col])
        if scale1_pval > alpha:
            scale1 = 'Same scale AnsariTest'
        else:
            scale1 = 'Different scale AnsariTest'
        # H0=equality of the scale parameters
        scale2_test, scale2_pval = stats.mood(df1[col], df2[col])
        if scale2_pval > alpha:
            scale2 = 'Same scale MoodTest'
        else:
            scale2 = 'Different scale MoodTest'
        features_stats.append([col, delta_pvalue, delta, scale1_pval, scale1, scale2_pval, scale2])
    features_stats = pd.DataFrame(features_stats)
    features_stats.columns = ['col_name', 'delta_pval', 'delta_status', \
                              'scale1_pval', 'scale1_status', 'scale2_pval', 'scale2_status']
    return features_stats
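A minimal driver for the helper above, on hypothetical data; it assumes the `pandas as pd` and `scipy.stats as stats` imports the snippet relies on, and that the function is defined in the same module:

import numpy as np
import pandas as pd
from scipy import stats

rng = np.random.default_rng(0)
df_a = pd.DataFrame({"height": rng.normal(170, 10, 200),
                     "weight": rng.normal(70, 5, 200)})
df_b = pd.DataFrame({"height": rng.normal(170, 20, 200),   # same center, wider scale
                     "weight": rng.normal(75, 5, 200)})     # shifted center, same scale

report = nonparametric_check_for_d_similarity(df_a, df_b, alpha=0.01)
print(report[["col_name", "delta_status", "scale1_status", "scale2_status"]])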
Example #6
 def test_approx(self):
     ramsay = np.array((111, 107, 100, 99, 102, 106, 109, 108, 104, 99, 101,
                        96, 97, 102, 107, 113, 116, 113, 110, 98))
     parekh = np.array((107, 108, 106, 98, 105, 103, 110, 105, 104, 100, 96,
                        108, 103, 104, 114, 114, 113, 108, 106, 99))
     W, pval = stats.ansari(ramsay, parekh)
     assert_almost_equal(W, 185.5, 11)
     assert_almost_equal(pval, 0.18145819972867083, 11)
Example #7
 def test_approx(self):
     ramsay = np.array((111, 107, 100, 99, 102, 106, 109, 108, 104, 99,
                        101, 96, 97, 102, 107, 113, 116, 113, 110, 98))
     parekh = np.array((107, 108, 106, 98, 105, 103, 110, 105, 104,
                        100, 96, 108, 103, 104, 114, 114, 113, 108, 106, 99))
     W, pval = stats.ansari(ramsay, parekh)
     assert_almost_equal(W,185.5,11)
     assert_almost_equal(pval,0.18145819972867083,11)
Example #8
    def test_approx(self):
        ramsay = np.array((111, 107, 100, 99, 102, 106, 109, 108, 104, 99,
                           101, 96, 97, 102, 107, 113, 116, 113, 110, 98))
        parekh = np.array((107, 108, 106, 98, 105, 103, 110, 105, 104,
                           100, 96, 108, 103, 104, 114, 114, 113, 108, 106, 99))

        with warnings.catch_warnings():
            warnings.filterwarnings('ignore',
                        message="Ties preclude use of exact statistic.")
            W, pval = stats.ansari(ramsay, parekh)

        assert_almost_equal(W,185.5,11)
        assert_almost_equal(pval,0.18145819972867083,11)
Example #9
    def test_approx(self):
        ramsay = np.array((111, 107, 100, 99, 102, 106, 109, 108, 104, 99, 101,
                           96, 97, 102, 107, 113, 116, 113, 110, 98))
        parekh = np.array((107, 108, 106, 98, 105, 103, 110, 105, 104, 100, 96,
                           108, 103, 104, 114, 114, 113, 108, 106, 99))

        with warnings.catch_warnings():
            warnings.filterwarnings(
                'ignore', message="Ties preclude use of exact statistic.")
            W, pval = stats.ansari(ramsay, parekh)

        assert_almost_equal(W, 185.5, 11)
        assert_almost_equal(pval, 0.18145819972867083, 11)
Example #10
    def test_approx(self):
        ramsay = np.array((111, 107, 100, 99, 102, 106, 109, 108, 104, 99, 101,
                           96, 97, 102, 107, 113, 116, 113, 110, 98))
        parekh = np.array((107, 108, 106, 98, 105, 103, 110, 105, 104, 100, 96,
                           108, 103, 104, 114, 114, 113, 108, 106, 99))

        warn_ctx = WarningManager()
        warn_ctx.__enter__()
        try:
            warnings.filterwarnings(
                'ignore', message="Ties preclude use of exact statistic.")
            W, pval = stats.ansari(ramsay, parekh)
        finally:
            warn_ctx.__exit__()

        assert_almost_equal(W, 185.5, 11)
        assert_almost_equal(pval, 0.18145819972867083, 11)
Example #11
    def test_approx(self):
        ramsay = np.array((111, 107, 100, 99, 102, 106, 109, 108, 104, 99,
                           101, 96, 97, 102, 107, 113, 116, 113, 110, 98))
        parekh = np.array((107, 108, 106, 98, 105, 103, 110, 105, 104,
                           100, 96, 108, 103, 104, 114, 114, 113, 108, 106, 99))

        warn_ctx = WarningManager()
        warn_ctx.__enter__()
        try:
            warnings.filterwarnings('ignore',
                        message="Ties preclude use of exact statistic.")
            W, pval = stats.ansari(ramsay, parekh)
        finally:
            warn_ctx.__exit__()

        assert_almost_equal(W,185.5,11)
        assert_almost_equal(pval,0.18145819972867083,11)
Example #12
def vector_hypotheses(a, b):

    dict_stat = {}
    dict_pval = {}
    pea = pearsonr(a, b)
    dict_stat["pearsonr"], dict_pval["pearsonr"] = pea[0], pea[1]
    ran = ranksums(a, b)
    dict_stat["ranksums"], dict_pval["ranksums"] = ran[0], ran[1]
    moo = mood(a, b)
    dict_stat["mood"], dict_pval["mood"] = moo[0], moo[1]
    fli = fligner(a, b)
    dict_stat["fligner"], dict_pval["fligner"] = fli[0], fli[1]
    ans = ansari(a, b)
    dict_stat["ansari"], dict_pval["ansari"] = ans[0], ans[1]
    bar = bartlett(a, b)
    dict_stat["bartlett"], dict_pval["bartlett"] = bar[0], bar[1]
    lev = levene(a, b)
    dict_stat["levene"], dict_pval["levene"] = lev[0], lev[1]
    man = mannwhitneyu(a, b)
    dict_stat["mannwhitneyu"], dict_pval["mannwhitneyu"] = man[0], man[1]
    return dict_stat, dict_pval
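A quick sanity check of `vector_hypotheses` on synthetic data, assuming the bare-name test functions it uses come from `scipy.stats` (as the snippet implies):

import numpy as np
from scipy.stats import (pearsonr, ranksums, mood, fligner, ansari,
                         bartlett, levene, mannwhitneyu)

rng = np.random.default_rng(42)
a = rng.normal(0.0, 1.0, 100)
b = rng.normal(0.0, 2.0, 100)   # same location, different scale

stat, pval = vector_hypotheses(a, b)
for name in ("ansari", "mood", "levene"):
    print(name, round(stat[name], 4), round(pval[name], 4))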
Example #13
def plotExpBox_Main(inputFiles, headers, valcols, outputFile, sep, startRow,
                    showIndPoints, mark, markMean, showMean, notch, whisker,
                    outliers, plotPvalueCluster, outputClusterPrefix,
                    methodCluster, xlegendrotation, xlabe, ylabe, figsz, titl,
                    showSampleSizes, trimToMinSize, relabels, logb,
                    plotHistogramToFile, plotMedianForGroups, botta, showViolin,
                    showBox, firstColAnnot, plotTrend, showLegend, makePzfxFile,
                    makeBinMatrix, writeDataSummaryStat, summaryStatRange,
                    minuslog10pvalue, minNDataToKeep, vfacecolor, valpha,
                    outXYZPvalues, dividePlots):

	#if plotPvalueCluster:
		#if pvalue cluster is needed:
	#	from Bio.Cluster.cluster import *
	#	from Bio.Cluster import *
		#endif


	
	#the real deal!
	plotData=[]	
	xtickLabels=[]
	
	trendData={}
	annot={}
	
	minSize=-1

	for inputFile,header,cols in zip(inputFiles,headers,valcols):
		fin=generic_istream(inputFile)
		
		startIdx=len(plotData)
		
		if firstColAnnot:
			colAnnot=cols[0]
			cols=cols[1:]
			annotThisFile=[]
			annot[startIdx]=annotThisFile
		else:
			colAnnot=-1
			annotThisFile=None
			
		for col in cols:
			plotData.append([])
			xtickLabels.append(header[col])

		colIndices=range(startIdx,startIdx+len(cols))
		
		if plotTrend:
			#print >> stderr,"plotTrend"
			trendDataThisFile=[]
			trendData[startIdx]=trendDataThisFile
		else:
			trendDataThisFile=None
			
			
		lino=0
		for lin in fin:
			lino+=1
			if lino<startRow:
				continue		
			fields=lin.rstrip("\r\n").split(sep)
			
			if plotTrend:
				#print >> stderr,"a"
				trendDataThisLine=[]
			else:
				trendDataThisLine=None
			
			allDataOKThisLine=True
			
			if colAnnot>=0:
				annotThisFile.append(fields[colAnnot])
			
			for idx,col in zip(colIndices,cols):
				try:
					value=float(fields[col])
					if logb!=0:
						if value==0.0:
							raise ValueError
						value=log(value)/logb							
					plotData[idx].append(value)
					
					if plotTrend:
						trendDataThisLine.append(value)
						#print >> stderr,"value:",value
					
				except:
					allDataOKThisLine=False	
				
			if plotTrend:
				if allDataOKThisLine:
					trendDataThisFile.append(trendDataThisLine)
				else:
					trendDataThisFile.append(None)
			
		fin.close()
	
		
		if minSize==-1:
			minSize=len(plotData[idx]) #or startIDX?
		else:
			minSize=min([minSize,len(plotData[idx])])
		

	if trimToMinSize:
		print >> stderr,"trimming to min size =",minSize
		trimData(plotData,minSize)

	if len(relabels)>0:
		#if len(relabels)!=len(xtickLabels):
		#	print >> stderr,"relabels doesn't have the same length as original label vectors",xtickLabels,"=>",relabels
		#	exit()
		print >> stderr,xtickLabels
		print >> stderr,relabels
		for i,relabel in zip(range(0,len(relabels)),relabels):
			xtickLabels[i]=relabel
		
	
	for i in range(0,len(plotMedianForGroups)):
		plotMedianForGroups[i]=getCol0ListFromCol1ListStringAdv(xtickLabels,plotMedianForGroups[i])
			
	
	#drawing medians:
	medianToDraw=[]
	for mediangrouper in plotMedianForGroups:
		curD=[]		
		for c in mediangrouper:
			curD.extend(plotData[c])
		medianToDraw.append(median(curD))


	for c in range(len(plotData)-1,-1,-1):
		if len(plotData[c])<minNDataToKeep:
			print >> stderr,xtickLabels[c],"discarded because has only",len(plotData[c]),"data points <",minNDataToKeep
			del plotData[c]
			del xtickLabels[c]

	if not skipStat:
		print >> stdout,"student t-test (1 sample; mean=0)"
		print >> stdout,"sample","mean","p-val","median"
	
		if writeDataSummaryStat:
			fDSS=open(writeDataSummaryStat,"w")
			print >> fDSS,"sample\tmean\tvar\tsd\tmin\tmax\tN\tNInRange["+str(summaryStatRange[0])+","+str(summaryStatRange[1])+"]\t%NInRange\tNbelowRange\t%Nbelow\tNAboveRange\t%NAbove"
			
		for x in range(0,len(plotData)):
			#print >> stderr, len(plotData[x])
			try:
				print >> stdout, xtickLabels[x],mean(plotData[x]),ttest_1samp(plotData[x],0)[1],median(plotData[x])
			except:
				print >> stdout, xtickLabels[x],mean(plotData[x]),"NA",median(plotData[x])
			
			if writeDataSummaryStat:
				sumData,N,NIN,NBelow,NAbove=filterDataInRangeInclusive(plotData[x],summaryStatRange[0],summaryStatRange[1])
				
				if NIN>1:
					#print >> stderr,"sumData=",sumData
					#print >> stderr,mean
					mea=mean2(sumData)
					DDOF=1
					sd=std(sumData,ddof=DDOF)
					var=sd*sd
					mi=min(sumData)
					ma=max(sumData)
				else:
					mea="NA"
					sd="NA"
					var="NA"
					mi="NA"
					ma="NA"
				
			
					
				print >> fDSS,xtickLabels[x]+"\t"+str(mea)+"\t"+str(var)+"\t"+str(sd)+"\t"+str(mi)+"\t"+str(ma)+"\t"+str(N)+"\t"+str(NIN)+"\t"+str(float(NIN)*100/N)+"\t"+str(NBelow)+"\t"+str(float(NBelow)*100/N)+"\t"+str(NAbove)+"\t"+str(float(NAbove)*100/N)
			
	
		pvalueM=[]
		
		if writeDataSummaryStat:
			fDSS.close()
		
		print >> stdout,""
		
		print >> stdout,"student t-test (2 samples)"
		print >> stdout,"p-val",
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
		print >> stdout,""
	
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					try:
						pvalue=ttest_ind(plotData[x],plotData[y])[1]
					except:
						pvalue=1.0
					
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
					
					print >> stdout, str(pvalue),
					pvalueRow.append(pvalue)
			print >> stdout,"";	
	
		
		print >> stdout,""
	
		
	
	
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster)
	
	
			
		pvalueM=[]
	
		print >> stdout,"welch t-test"
		print >> stdout,"p-val",
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
						
				else:
					try:
						pvalue=welchs_approximate_ttest_arr(plotData[x],plotData[y])[3]
					except:
						pvalue=1.0
	
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
					
					print >> stdout, str(pvalue),
					pvalueRow.append(pvalue)
			print >> stdout,"";
	
		if outXYZPvalues:
			writeXYZPvalues(outXYZPvalues+"_Welch.xyz",xtickLabels,pvalueM)
	
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster)
	
		
		print >> stdout,""
		print >> stdout,"non-parametric (Mann-Whitney U)" #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)"
		print >> stdout,"p-val",
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=mannwhitneyu(plotData[x],plotData[y])[1]*2				
					except:
						pvalue=1.0
	
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
					print >> stdout,pvalue, #mann-whiteney need to mul by 2 (one tail to two tail)
					pvalueRow.append(pvalue)
					#else:
					#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
			print >> stdout,"";	
	
		if outXYZPvalues:
			writeXYZPvalues(outXYZPvalues+"_U.xyz",xtickLabels,pvalueM)
		
	
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster)
		
		#####now the variance tests
		
		print >> stdout,""
		print >> stdout,"Ansari-Bradley Two-sample Test for difference in scale parameters " 
		print >> stdout,"p-val",
		
		
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=ansari(plotData[x],plotData[y])[1]		
					except:
						pvalue="NA"
	
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
						#pvalue=1.0
					print >> stdout,pvalue,
					pvalueRow.append(pvalue)
					#else:
					#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
			print >> stdout,"";	
		
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_Ansari_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_Ansari",xtickLabels,pvalueM,methodCluster)	
		
		
		#####
	
		#####now the variance tests
		
		print >> stdout,""
		print >> stdout,"Fligner's Two-sample Test for equal variance (non-parametrics)" 
		print >> stdout,"p-val",
		
		
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=fligner(plotData[x],plotData[y])[1]		
					except:
						pvalue="NA"
						#pvalue=1.0
						
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
					print >> stdout,pvalue,
					pvalueRow.append(pvalue)
					#else:
					#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
			print >> stdout,"";	
		
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_fligner_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_fligner",xtickLabels,pvalueM,methodCluster)	
		
		
		#####
	
		#####now the variance tests
		
		print >> stdout,""
		print >> stdout,"Levene's Two-sample Test for equal variance" 
		print >> stdout,"p-val",
		
		
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=levene(plotData[x],plotData[y])[1]		
					except:
						pvalue="NA"
						#pvalue=1.0
						
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
					print >> stdout,pvalue,
					pvalueRow.append(pvalue)
					#else:
					#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
			print >> stdout,"";	
		
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_levene_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_levene",xtickLabels,pvalueM,methodCluster)	
		
		
		#####
	
		#####now the variance tests
		
		print >> stdout,""
		print >> stdout,"Bartlett's Two-sample Test for equal variance (for normal distributions)" 
		print >> stdout,"p-val",
		
		
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=bartlett(plotData[x],plotData[y])[1]		
					except:
						pvalue="NA"
						#pvalue=1.0
	
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
					print >> stdout,pvalue,
					pvalueRow.append(pvalue)
					#else:
					#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
			print >> stdout,"";	
		
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_bartlett_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_bartlett",xtickLabels,pvalueM,methodCluster)	
		
		
		#####

	figure(figsize=figsz)
	subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)
	
	if len(titl)==0:
		titl=outputFile


	plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes,showViolin,showBox,annot,trendData,showLegend,makePzfxFile,makeBinMatrix,dividePlots)
	
	#ylim([0,200])
	for m in medianToDraw:
		axhline(y=m,linestyle=':',color='gray')

	savefig(outputFile,bbox_inches="tight")

	if len(plotHistogramToFile)>0:
		drawHistogram(plotHistogramToFile,plotData,xtickLabels)
		drawDensigram(plotHistogramToFile+".density.png",plotData,xtickLabels)
Example #14
def custom(a, b):
    v, p = stats.ansari(a, b)
    return p
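As used above, `custom` simply exposes the Ansari-Bradley p-value. A hypothetical call, assuming `from scipy import stats` as in the snippet and NumPy for the sample data:

import numpy as np
from scipy import stats

rng = np.random.default_rng(1)
x = rng.normal(0, 1, 50)
y = rng.normal(0, 3, 50)
print(custom(x, y))   # small p-value -> evidence of different scale parameters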
Example #15
 def test_exact(self):
     W,pval = stats.ansari([1,2,3,4],[15,5,20,8,10,12])
     assert_almost_equal(W,10.0,11)
     assert_almost_equal(pval,0.533333333333333333,7)
Example #16
 def test_small(self):
     x = [1,2,3,3,4]
     y = [3,2,6,1,6,1,4,1]
     W, pval = stats.ansari(x,y)
     assert_almost_equal(W,23.5,11)
     assert_almost_equal(pval,0.13499256881897437,11)
Example #17
def PH_Walker(y, walkerRule='prop', walkerParams=np.array([])):
    """

    PH_Walker simulates a hypothetical walker moving through the time domain

    the hypothetical particle (or 'walker') moves in response to values of the time series at each point

    Outputs from this operation are summaries of the walker's motion, and comparisons of it to the original time series

    :param y: the input time series
    :param walkerRule: the kinematic rule by which the walker moves in response to the time series over time
            (i) 'prop': the walker narrows the gap between its value and that of the time series by a given proportion p

            (ii) 'biasprop': the walker is biased to move more in one direction; when it is being pushed up by the time
            series, it narrows the gap by a proportion p_{up}, and when it is being pushed down by the
            time series it narrows the gap by a (potentially different) proportion p_{down}. walkerParams = [pup,pdown]

            (iii) 'momentum': the walker moves as if it has mass m and inertia
             from the previous time step and the time series acts
             as a force altering its motion in a classical
             Newtonian dynamics framework. [walkerParams = m], the mass.

             (iv) 'runningvar': the walker moves with inertia as above, but
             its values are also adjusted so as to match the local
             variance of time series by a multiplicative factor.
             walkerParams = [m,wl], where m is the inertial mass and wl
             is the window length.

    :param walkerParams: the parameters for the specified walker, explained above

    :return: a dictionary including the mean, spread, maximum, minimum, and autocorrelation of
            the walker's trajectory, the number of crossings between the walker and the
            original time series, the ratio or difference of some basic summary statistics
            between the original time series and the walker, an Ansari-Bradley test
            comparing the distributions of the walker and original time series, and
            various statistics summarizing properties of the residuals between the
            walker's trajectory and the original time series.

    """

    # ----------------------------------------------------------------------------------------------------------------------------------
    # PRELIMINARIES
    #----------------------------------------------------------------------------------------------------------------------------------

    N = len(y)

    #----------------------------------------------------------------------------------------------------------------------------------
    # CHECK INPUTS
    #----------------------------------------------------------------------------------------------------------------------------------

    if (len(walkerParams) == 0):

        if walkerRule == 'prop':
            walkerParams = np.array([0.5])
        if walkerRule == 'biasprop':
            walkerParams = np.array([0.1, 0.2])
        if walkerRule == 'momentum':
            walkerParams = np.array([2])
        if walkerRule == 'runningvar':
            walkerParams = [1.5, 50]

    #----------------------------------------------------------------------------------------------------------------------------------
    # (1) WALK
    #----------------------------------------------------------------------------------------------------------------------------------

    w = np.zeros(N)

    if walkerRule == 'prop':

        # walker starts at zero and narrows the gap between its position
        # and the time series value at that point by the proportion given
        # in walkerParams, to give the value at the subsequent time step

        p = walkerParams
        w[0] = 0

        for i in range(1, N):
            w[i] = w[i - 1] + p * (y[i - 1] - w[i - 1])

    elif walkerRule == 'biasprop':
        # walker is biased in one or the other direction (i.e., prefers to
        # go up, or down). Requires a vector of inputs: [p_up, p_down]

        pup = walkerParams[0]
        pdown = walkerParams[1]

        w[0] = 0

        for i in range(1, N):
            if y[i] > y[i - 1]:
                w[i] = w[i - 1] + pup * (y[i - 1] - w[i - 1])
            else:
                w[i] = w[i - 1] + pdown * (y[i - 1] - w[i - 1])

    elif walkerRule == 'momentum':
        # walker moves as if it had inertia from the previous time step,
        # i.e., it 'wants' to move the same amount; the time series acts as
        # a force changing its motion

        m = walkerParams[0]  # inertial mass

        w[0] = y[0]
        w[1] = y[1]

        for i in range(2, N):
            w_inert = w[i - 1] + (w[i - 1] - w[i - 2])
            w[i] = w_inert + (y[i] - w_inert) / m  # dissipative term
            #equation of motion (s-s_0 = ut + F/m*t^2)
            #where the 'force' is F is the change in the original time series at the point

    elif walkerRule == 'runningvar':

        m = walkerParams[0]
        wl = walkerParams[1]

        w[0] = y[0]
        w[1] = y[1]

        for i in range(2, N):
            w_inert = w[i - 1] + (w[i - 1] - w[i - 2])
            w_mom = w_inert + (y[i] -
                               w_inert) / m  #dissipative term from time series

            if i > wl:
                w[i] = w_mom * (np.std(y[(i - wl):i])) / np.std(w[(i - wl):i])

            else:
                w[i] = w_mom

    else:

        print("Error: Unknown method: " + walkerRule +
              " for simulating walker on the time series")

    #----------------------------------------------------------------------------------------------------------------------------------
    # (2) STATISTICS ON THE WALK
    #----------------------------------------------------------------------------------------------------------------------------------

    out = {}  # dictionary for storing variables

    # (i) The walk itself -------------------------------------------------------------------------------------------

    out['w_mean'] = np.mean(w)
    out['w_median'] = np.median(w)
    out['w_std'] = np.std(w)
    out['w_ac1'] = ac.CO_AutoCorr(
        w, 1, method='timedomainstat'
    )  # this function call in MATLAB uses method='Fourier', but we don't have that case implemented yet in autoCorr, however this seems to output the same thing
    out['w_ac2'] = ac.CO_AutoCorr(w, 2, method='timedomainstat')
    out['w_tau'] = fz.CO_FirstZero(w, 'ac')
    out['w_min'] = np.min(w)
    out['w_max'] = np.max(w)
    out['propzcross'] = sum(
        np.multiply(w[0:(len(w) - 2)], w[1:(len(w) - 1)]) < 0) / (
            N - 1
        )  # np.multiply performs elementwise multiplication like matlab .*

    # (ii) Differences between the walk and the signal --------------------------------------------------------------

    out['sw_meanabsdiff'] = np.mean(np.abs(y - w))
    out['sw_taudiff'] = fz.CO_FirstZero(y, 'ac') - fz.CO_FirstZero(w, 'ac')
    out['sw_stdrat'] = np.std(w) / np.std(
        y)  # will be the same as w_std for a z-scored signal
    out['sw_ac1rat'] = out['w_ac1'] / ac.CO_AutoCorr(y, 1)
    out['sw_minrat'] = min(w) / min(y)
    out['sw_maxrat'] = max(w) / max(y)
    out['sw_propcross'] = sum(
        np.multiply(w[0:(len(w) - 1)] - y[0:(len(y) - 1)], w[1:(
            len(w))] - y[1:(len(y))]) < 0) / (
                N - 1
            )  #np.multiply performs elementwise multiplication like matlab .*

    ansari = stats.ansari(w, y)
    out['sw_ansarib_pval'] = ansari[1]

    # r = np.linspace( np.min(np.min(y), np.min(w)), np.max(np.max(y), np.max(w)), 200 )
    # dy = stats.gaussian_kde(y, r)

    # (iii) looking at residuals between time series and walker

    res = w - y

    # CLOSEST FUNCTION TO MATLAB RUNSTEST, found in statsmodels.sandbox.stats.runs
    # runstest = runs.runstest_2samp(res, groups=2)
    # out['res_runstest'] = runstest

    out['out.res_acl'] = ac.CO_AutoCorr(res, lag=1)

    return out
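A hedged usage sketch for `PH_Walker`: it depends on the module's own autocorrelation helpers (imported as `ac` and `fz` in the snippet), so this only runs inside that codebase.

import numpy as np

rng = np.random.default_rng(0)
y = rng.normal(size=500)                       # any 1-D time series

out = PH_Walker(y, walkerRule='momentum', walkerParams=np.array([2.0]))
print(out['sw_ansarib_pval'])                  # Ansari-Bradley p-value: walker vs. series
print(out['w_mean'], out['w_std'])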
Example #18
 def test_ansariBradleyTest_approxOdd_xResult(self):
     data_1 = np.arange(1, 101)
     data_2 = np.arange(50, 151)
     x1, p1 = ansari_bradley_test(data_1, data_2, alternative="two-sided")
     x2, p2 = ansari(data_1, data_2)
     assert pytest.approx(x2) == x1
Example #19
 def test_exact(self):
     W, pval = stats.ansari([1, 2, 3, 4], [15, 5, 20, 8, 10, 12])
     assert_almost_equal(W, 10.0, 11)
     assert_almost_equal(pval, 0.533333333333333333, 7)
Example #20
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o",
                        "--outfile",
                        required=True,
                        help="Path to the output file.")
    parser.add_argument("--sample_one_cols",
                        help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_two_cols",
                        help="Input format, like smi, sdf, inchi")
    parser.add_argument(
        "--sample_cols",
        help="Input format, like smi, sdf, inchi,separate arrays using ;",
    )
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument(
        "--mwu_use_continuity",
        action="store_true",
        default=False,
        help=
        "Whether a continuity correction (1/2.) should be taken into account.",
    )
    parser.add_argument(
        "--equal_var",
        action="store_true",
        default=False,
        help=
        "If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.",
    )
    parser.add_argument(
        "--reta",
        action="store_true",
        default=False,
        help="Whether or not to return the internally computed a values.",
    )
    parser.add_argument(
        "--fisher",
        action="store_true",
        default=False,
        help="if true then Fisher definition is used",
    )
    parser.add_argument(
        "--bias",
        action="store_true",
        default=False,
        help=
        "if false,then the calculations are corrected for statistical bias",
    )
    parser.add_argument(
        "--inclusive1",
        action="store_true",
        default=False,
        help="if false,lower_limit will be ignored",
    )
    parser.add_argument(
        "--inclusive2",
        action="store_true",
        default=False,
        help="if false,higher_limit will be ignored",
    )
    parser.add_argument(
        "--inclusive",
        action="store_true",
        default=False,
        help="if false,limit will be ignored",
    )
    parser.add_argument(
        "--printextras",
        action="store_true",
        default=False,
        help=
        "If True, if there are extra points a warning is raised saying how many of those points there are",
    )
    parser.add_argument(
        "--initial_lexsort",
        action="store_true",
        default="False",
        help=
        "Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.",
    )
    parser.add_argument(
        "--correction",
        action="store_true",
        default=False,
        help="continuity correction ",
    )
    parser.add_argument(
        "--axis",
        type=int,
        default=0,
        help=
        "Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)",
    )
    parser.add_argument(
        "--n",
        type=int,
        default=0,
        help=
        "the number of trials. This is ignored if x gives both the number of successes and failures",
    )
    parser.add_argument("--b",
                        type=int,
                        default=0,
                        help="The number of bins to use for the histogram")
    parser.add_argument("--N",
                        type=int,
                        default=0,
                        help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof",
                        type=int,
                        default=0,
                        help="Degrees of freedom correction")
    parser.add_argument(
        "--score",
        type=int,
        default=0,
        help="Score that is compared to the elements in a.",
    )
    parser.add_argument("--m", type=float, default=0.0, help="limits")
    parser.add_argument("--mf", type=float, default=2.0, help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9, help="higher_limit")
    parser.add_argument(
        "--p",
        type=float,
        default=0.5,
        help=
        "The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5",
    )
    parser.add_argument("--alpha", type=float, default=0.9, help="probability")
    parser.add_argument(
        "--new",
        type=float,
        default=0.0,
        help="Value to put in place of values in a outside of bounds",
    )
    parser.add_argument(
        "--proportiontocut",
        type=float,
        default=0.0,
        help="Proportion (in range 0-1) of total data set to trim of each end.",
    )
    parser.add_argument(
        "--lambda_",
        type=float,
        default=1.0,
        help=
        "lambda_ gives the power in the Cressie-Read power divergence statistic",
    )
    parser.add_argument(
        "--imbda",
        type=float,
        default=0,
        help=
        "If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.",
    )
    parser.add_argument(
        "--base",
        type=float,
        default=1.6,
        help="The logarithmic base to use, defaults to e",
    )
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")

    args = parser.parse_args()
    infile = args.infile
    outfile = open(args.outfile, "w+")
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2
    sample0 = 0
    sample1 = 0
    sample2 = 0
    if args.sample_cols is not None:
        sample0 = 1
        barlett_samples = []
        for sample in args.sample_cols.split(";"):
            barlett_samples.append(map(int, sample.split(",")))
    if args.sample_one_cols is not None:
        sample1 = 1
        sample_one_cols = args.sample_one_cols.split(",")
    if args.sample_two_cols is not None:
        sample_two_cols = args.sample_two_cols.split(",")
        sample2 = 1
    for line in open(infile):
        sample_one = []
        sample_two = []
        cols = line.strip().split("\t")
        if sample0 == 1:
            b_samples = columns_to_values(barlett_samples, line)
        if sample1 == 1:
            for index in sample_one_cols:
                sample_one.append(cols[int(index) - 1])
        if sample2 == 1:
            for index in sample_two_cols:
                sample_two.append(cols[int(index) - 1])
        if test_id.strip() == "describe":
            size, min_max, mean, uv, bs, bk = stats.describe(
                map(float, sample_one))
            cols.append(size)
            cols.append(min_max)
            cols.append(mean)
            cols.append(uv)
            cols.append(bs)
            cols.append(bk)
        elif test_id.strip() == "mode":
            vals, counts = stats.mode(map(float, sample_one))
            cols.append(vals)
            cols.append(counts)
        elif test_id.strip() == "nanmean":
            m = stats.nanmean(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "kurtosistest":
            z_value, p_value = stats.kurtosistest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "itemfreq":
            freq = stats.itemfreq(map(float, sample_one))
            for list in freq:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "boxcox_llf":
            IIf = stats.boxcox_llf(imbda, map(float, sample_one))
            cols.append(IIf)
        elif test_id.strip() == "tiecorrect":
            fa = stats.tiecorrect(map(float, sample_one))
            cols.append(fa)
        elif test_id.strip() == "rankdata":
            r = stats.rankdata(map(float, sample_one), method=args.md)
            cols.append(r)
        elif test_id.strip() == "nanstd":
            s = stats.nanstd(map(float, sample_one), bias=args.bias)
            cols.append(s)
        elif test_id.strip() == "anderson":
            A2, critical, sig = stats.anderson(map(float, sample_one),
                                               dist=args.dist)
            cols.append(A2)
            for list in critical:
                cols.append(list)
            cols.append(",")
            for list in sig:
                cols.append(list)
        elif test_id.strip() == "binom_test":
            p_value = stats.binom_test(map(float, sample_one),
                                       n=args.n,
                                       p=args.p)
            cols.append(p_value)
        elif test_id.strip() == "gmean":
            gm = stats.gmean(map(float, sample_one), dtype=args.dtype)
            cols.append(gm)
        elif test_id.strip() == "hmean":
            hm = stats.hmean(map(float, sample_one), dtype=args.dtype)
            cols.append(hm)
        elif test_id.strip() == "kurtosis":
            k = stats.kurtosis(
                map(float, sample_one),
                axis=args.axis,
                fisher=args.fisher,
                bias=args.bias,
            )
            cols.append(k)
        elif test_id.strip() == "moment":
            n_moment = stats.moment(map(float, sample_one), n=args.n)
            cols.append(n_moment)
        elif test_id.strip() == "normaltest":
            k2, p_value = stats.normaltest(map(float, sample_one))
            cols.append(k2)
            cols.append(p_value)
        elif test_id.strip() == "skew":
            skewness = stats.skew(map(float, sample_one), bias=args.bias)
            cols.append(skewness)
        elif test_id.strip() == "skewtest":
            z_value, p_value = stats.skewtest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "sem":
            s = stats.sem(map(float, sample_one), ddof=args.ddof)
            cols.append(s)
        elif test_id.strip() == "zscore":
            z = stats.zscore(map(float, sample_one), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "signaltonoise":
            s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof)
            cols.append(s2n)
        elif test_id.strip() == "percentileofscore":
            p = stats.percentileofscore(map(float, sample_one),
                                        score=args.score,
                                        kind=args.kind)
            cols.append(p)
        elif test_id.strip() == "bayes_mvs":
            c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one),
                                                   alpha=args.alpha)
            cols.append(c_mean)
            cols.append(c_var)
            cols.append(c_std)
        elif test_id.strip() == "sigmaclip":
            c, c_low, c_up = stats.sigmaclip(map(float, sample_one),
                                             low=args.m,
                                             high=args.n)
            cols.append(c)
            cols.append(c_low)
            cols.append(c_up)
        elif test_id.strip() == "kstest":
            d, p_value = stats.kstest(
                map(float, sample_one),
                cdf=args.cdf,
                N=args.N,
                alternative=args.alternative,
                mode=args.mode,
            )
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "chi2_contingency":
            chi2, p, dof, ex = stats.chi2_contingency(
                map(float, sample_one),
                correction=args.correction,
                lambda_=args.lambda_)
            cols.append(chi2)
            cols.append(p)
            cols.append(dof)
            cols.append(ex)
        elif test_id.strip() == "tmean":
            if nf == 0 and mf == 0:
                mean = stats.tmean(map(float, sample_one))
            else:
                mean = stats.tmean(map(float, sample_one), (mf, nf),
                                   (inclusive1, inclusive2))
            cols.append(mean)
        elif test_id.strip() == "tmin":
            if mf == 0:
                min = stats.tmin(map(float, sample_one))
            else:
                min = stats.tmin(map(float, sample_one),
                                 lowerlimit=mf,
                                 inclusive=args.inclusive)
            cols.append(min)
        elif test_id.strip() == "tmax":
            if nf == 0:
                max = stats.tmax(map(float, sample_one))
            else:
                max = stats.tmax(map(float, sample_one),
                                 upperlimit=nf,
                                 inclusive=args.inclusive)
            cols.append(max)
        elif test_id.strip() == "tvar":
            if nf == 0 and mf == 0:
                var = stats.tvar(map(float, sample_one))
            else:
                var = stats.tvar(map(float, sample_one), (mf, nf),
                                 (inclusive1, inclusive2))
            cols.append(var)
        elif test_id.strip() == "tstd":
            if nf == 0 and mf == 0:
                std = stats.tstd(map(float, sample_one))
            else:
                std = stats.tstd(map(float, sample_one), (mf, nf),
                                 (inclusive1, inclusive2))
            cols.append(std)
        elif test_id.strip() == "tsem":
            if nf == 0 and mf == 0:
                s = stats.tsem(map(float, sample_one))
            else:
                s = stats.tsem(map(float, sample_one), (mf, nf),
                               (inclusive1, inclusive2))
            cols.append(s)
        elif test_id.strip() == "scoreatpercentile":
            if nf == 0 and mf == 0:
                s = stats.scoreatpercentile(
                    map(float, sample_one),
                    map(float, sample_two),
                    interpolation_method=args.interpolation,
                )
            else:
                s = stats.scoreatpercentile(
                    map(float, sample_one),
                    map(float, sample_two),
                    (mf, nf),
                    interpolation_method=args.interpolation,
                )
            for list in s:
                cols.append(list)
        elif test_id.strip() == "relfreq":
            if nf == 0 and mf == 0:
                rel, low_range, binsize, ex = stats.relfreq(
                    map(float, sample_one), args.b)
            else:
                rel, low_range, binsize, ex = stats.relfreq(
                    map(float, sample_one), args.b, (mf, nf))
            for list in rel:
                cols.append(list)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "binned_statistic":
            if nf == 0 and mf == 0:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one),
                    map(float, sample_two),
                    statistic=args.statistic,
                    bins=args.b,
                )
            else:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one),
                    map(float, sample_two),
                    statistic=args.statistic,
                    bins=args.b,
                    range=(mf, nf),
                )
            cols.append(st)
            cols.append(b_edge)
            cols.append(b_n)
        elif test_id.strip() == "threshold":
            if nf == 0 and mf == 0:
                o = stats.threshold(map(float, sample_one), newval=args.new)
            else:
                o = stats.threshold(map(float, sample_one),
                                    mf,
                                    nf,
                                    newval=args.new)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trimboth":
            o = stats.trimboth(map(float, sample_one),
                               proportiontocut=args.proportiontocut)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trim1":
            t1 = stats.trim1(
                map(float, sample_one),
                proportiontocut=args.proportiontocut,
                tail=args.tail,
            )
            for list in t1:
                cols.append(list)
        elif test_id.strip() == "histogram":
            if nf == 0 and mf == 0:
                hi, low_range, binsize, ex = stats.histogram(
                    map(float, sample_one), args.b)
            else:
                hi, low_range, binsize, ex = stats.histogram(
                    map(float, sample_one), args.b, (mf, nf))
            cols.append(hi)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "cumfreq":
            if nf == 0 and mf == 0:
                cum, low_range, binsize, ex = stats.cumfreq(
                    map(float, sample_one), args.b)
            else:
                cum, low_range, binsize, ex = stats.cumfreq(
                    map(float, sample_one), args.b, (mf, nf))
            cols.append(cum)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "boxcox_normmax":
            if nf == 0 and mf == 0:
                ma = stats.boxcox_normmax(map(float, sample_one))
            else:
                ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf),
                                          method=args.method)
            cols.append(ma)
        elif test_id.strip() == "boxcox":
            if imbda == 0:
                box, ma, ci = stats.boxcox(map(float, sample_one),
                                           alpha=args.alpha)
                cols.append(box)
                cols.append(ma)
                cols.append(ci)
            else:
                box = stats.boxcox(map(float, sample_one),
                                   imbda,
                                   alpha=args.alpha)
                cols.append(box)
        elif test_id.strip() == "histogram2":
            h2 = stats.histogram2(map(float, sample_one),
                                  map(float, sample_two))
            for list in h2:
                cols.append(list)
        elif test_id.strip() == "ranksums":
            z_statistic, p_value = stats.ranksums(map(float, sample_one),
                                                  map(float, sample_two))
            cols.append(z_statistic)
            cols.append(p_value)
        elif test_id.strip() == "ttest_1samp":
            t, prob = stats.ttest_1samp(map(float, sample_one),
                                        map(float, sample_two))
            for list in t:
                cols.append(list)
            for list in prob:
                cols.append(list)
        elif test_id.strip() == "ansari":
            AB, p_value = stats.ansari(map(float, sample_one),
                                       map(float, sample_two))
            cols.append(AB)
            cols.append(p_value)
        elif test_id.strip() == "linregress":
            slope, intercept, r_value, p_value, stderr = stats.linregress(
                map(float, sample_one), map(float, sample_two))
            cols.append(slope)
            cols.append(intercept)
            cols.append(r_value)
            cols.append(p_value)
            cols.append(stderr)
        elif test_id.strip() == "pearsonr":
            cor, p_value = stats.pearsonr(map(float, sample_one),
                                          map(float, sample_two))
            cols.append(cor)
            cols.append(p_value)
        elif test_id.strip() == "pointbiserialr":
            r, p_value = stats.pointbiserialr(map(float, sample_one),
                                              map(float, sample_two))
            cols.append(r)
            cols.append(p_value)
        elif test_id.strip() == "ks_2samp":
            d, p_value = stats.ks_2samp(map(float, sample_one),
                                        map(float, sample_two))
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "mannwhitneyu":
            mw_stats_u, p_value = stats.mannwhitneyu(
                map(float, sample_one),
                map(float, sample_two),
                use_continuity=args.mwu_use_continuity,
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "zmap":
            z = stats.zmap(map(float, sample_one),
                           map(float, sample_two),
                           ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "ttest_ind":
            mw_stats_u, p_value = stats.ttest_ind(map(float, sample_one),
                                                  map(float, sample_two),
                                                  equal_var=args.equal_var)
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "ttest_rel":
            t, prob = stats.ttest_rel(map(float, sample_one),
                                      map(float, sample_two),
                                      axis=args.axis)
            cols.append(t)
            cols.append(prob)
        elif test_id.strip() == "mood":
            z, p_value = stats.mood(map(float, sample_one),
                                    map(float, sample_two),
                                    axis=args.axis)
            cols.append(z)
            cols.append(p_value)
        elif test_id.strip() == "shapiro":
            W, p_value, a = stats.shapiro(map(float, sample_one),
                                          map(float, sample_two), args.reta)
            cols.append(W)
            cols.append(p_value)
            for list in a:
                cols.append(list)
        elif test_id.strip() == "kendalltau":
            k, p_value = stats.kendalltau(
                map(float, sample_one),
                map(float, sample_two),
                initial_lexsort=args.initial_lexsort,
            )
            cols.append(k)
            cols.append(p_value)
        elif test_id.strip() == "entropy":
            s = stats.entropy(map(float, sample_one),
                              map(float, sample_two),
                              base=args.base)
            cols.append(s)
        elif test_id.strip() == "spearmanr":
            if sample2 == 1:
                rho, p_value = stats.spearmanr(map(float, sample_one),
                                               map(float, sample_two))
            else:
                rho, p_value = stats.spearmanr(map(float, sample_one))
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "wilcoxon":
            if sample2 == 1:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one),
                    map(float, sample_two),
                    zero_method=args.zero_method,
                    correction=args.correction,
                )
            else:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one),
                    zero_method=args.zero_method,
                    correction=args.correction,
                )
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "chisquare":
            if sample2 == 1:
                rho, p_value = stats.chisquare(map(float, sample_one),
                                               map(float, sample_two),
                                               ddof=args.ddof)
            else:
                rho, p_value = stats.chisquare(map(float, sample_one),
                                               ddof=args.ddof)
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "power_divergence":
            if sample2 == 1:
                stat, p_value = stats.power_divergence(
                    map(float, sample_one),
                    map(float, sample_two),
                    ddof=args.ddof,
                    lambda_=args.lambda_,
                )
            else:
                stat, p_value = stats.power_divergence(map(float, sample_one),
                                                       ddof=args.ddof,
                                                       lambda_=args.lambda_)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "theilslopes":
            if sample2 == 1:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one),
                                                     map(float, sample_two),
                                                     alpha=args.alpha)
            else:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one),
                                                     alpha=args.alpha)
            cols.append(mpe)
            cols.append(met)
            cols.append(lo)
            cols.append(up)
        elif test_id.strip() == "combine_pvalues":
            if sample2 == 1:
                stat, p_value = stats.combine_pvalues(
                    map(float, sample_one),
                    method=args.med,
                    weights=map(float, sample_two),
                )
            else:
                stat, p_value = stats.combine_pvalues(map(float, sample_one),
                                                      method=args.med)
            cols.append(stat)
            cols.append(p_value)
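        # The remaining branches run multi-sample tests on b_samples, the arrays parsed from --sample_cols.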
        elif test_id.strip() == "obrientransform":
            ob = stats.obrientransform(*b_samples)
            for list in ob:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "f_oneway":
            f_value, p_value = stats.f_oneway(*b_samples)
            cols.append(f_value)
            cols.append(p_value)
        elif test_id.strip() == "kruskal":
            h, p_value = stats.kruskal(*b_samples)
            cols.append(h)
            cols.append(p_value)
        elif test_id.strip() == "friedmanchisquare":
            fr, p_value = stats.friedmanchisquare(*b_samples)
            cols.append(fr)
            cols.append(p_value)
        elif test_id.strip() == "fligner":
            xsq, p_value = stats.fligner(center=args.center,
                                         proportiontocut=args.proportiontocut,
                                         *b_samples)
            cols.append(xsq)
            cols.append(p_value)
        elif test_id.strip() == "bartlett":
            T, p_value = stats.bartlett(*b_samples)
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "levene":
            w, p_value = stats.levene(center=args.center,
                                      proportiontocut=args.proportiontocut,
                                      *b_samples)
            cols.append(w)
            cols.append(p_value)
        elif test_id.strip() == "median_test":
            stat, p_value, m, table = stats.median_test(
                ties=args.ties,
                correction=args.correction,
                lambda_=args.lambda_,
                *b_samples)
            cols.append(stat)
            cols.append(p_value)
            cols.append(m)
            cols.append(table)
            for list in table:
                elements = ",".join(map(str, list))
                cols.append(elements)
        outfile.write("%s\n" % "\t".join(map(str, cols)))
    outfile.close()
Beispiel #21
0
print("bartlett")
data['bartlett'] = [
    bartlett(x, y)[0] for (x, y) in zip(np.nan_to_num(question1_vectors),
                                        np.nan_to_num(question2_vectors))
]

print("ranksums")
data['ranksums'] = [
    ranksums(x, y)[0] for (x, y) in zip(np.nan_to_num(question1_vectors),
                                        np.nan_to_num(question2_vectors))
]

print("ansari")
data['ansari'] = [
    ansari(x, y)[0] for (x, y) in zip(np.nan_to_num(question1_vectors),
                                      np.nan_to_num(question2_vectors))
]
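# Note: ansari(x, y)[0] keeps only the AB statistic; index [1] would give the p-value
# (the same indexing convention is used for the bartlett, ranksums and fligner features here).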

#==============================================================================
# print("mannwhitneyu")
# data['mannwhitneyu'] = [mannwhitneyu(x, y)[0] for (x, y) in zip(np.nan_to_num(question1_vectors),
#                                                           np.nan_to_num(question2_vectors))]
#==============================================================================

print("fligner")
data['fligner'] = [
    fligner(x, y)[0] for (x, y) in zip(np.nan_to_num(question1_vectors),
                                       np.nan_to_num(question2_vectors))
]
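
For reference, a minimal sketch of the same pair-wise pattern that keeps both the statistic and the p-value instead of only index [0]. scale_test_features is a hypothetical helper (not part of the snippet above); question1_vectors and question2_vectors are assumed, as above, to be equal-length iterables of numeric vectors.

import numpy as np
from scipy.stats import ansari

def scale_test_features(q1_vectors, q2_vectors, test=ansari):
    # Apply a two-sample scipy.stats test pair-wise; each element is a (statistic, p-value) tuple.
    return [test(np.nan_to_num(x), np.nan_to_num(y))
            for x, y in zip(q1_vectors, q2_vectors)]
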
Beispiel #22
0
 def test_result_attributes(self):
     x = [1, 2, 3, 3, 4]
     y = [3, 2, 6, 1, 6, 1, 4, 1]
     res = stats.ansari(x, y)
     attributes = ('statistic', 'pvalue')
     check_named_results(res, attributes)
def main():
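    # Parse the column selections and test_id, then run the chosen scipy.stats routine on every
    # row of the tabular input file and append its results to that row of the output.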
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.")
    parser.add_argument("--sample_one_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_two_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_cols", help="Input format, like smi, sdf, inchi,separate arrays using ;")
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument(
        "--mwu_use_continuity",
        action="store_true",
        default=False,
        help="Whether a continuity correction (1/2.) should be taken into account.",
    )
    parser.add_argument(
        "--equal_var",
        action="store_true",
        default=False,
        help="If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.",
    )
    parser.add_argument(
        "--reta", action="store_true", default=False, help="Whether or not to return the internally computed a values."
    )
    parser.add_argument("--fisher", action="store_true", default=False, help="if true then Fisher definition is used")
    parser.add_argument(
        "--bias",
        action="store_true",
        default=False,
        help="if false,then the calculations are corrected for statistical bias",
    )
    parser.add_argument("--inclusive1", action="store_true", default=False, help="if false,lower_limit will be ignored")
    parser.add_argument(
        "--inclusive2", action="store_true", default=False, help="if false,higher_limit will be ignored"
    )
    parser.add_argument("--inclusive", action="store_true", default=False, help="if false,limit will be ignored")
    parser.add_argument(
        "--printextras",
        action="store_true",
        default=False,
        help="If True, if there are extra points a warning is raised saying how many of those points there are",
    )
    parser.add_argument(
        "--initial_lexsort",
        action="store_true",
        default="False",
        help="Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.",
    )
    parser.add_argument("--correction", action="store_true", default=False, help="continuity correction ")
    parser.add_argument(
        "--axis",
        type=int,
        default=0,
        help="Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)",
    )
    parser.add_argument(
        "--n",
        type=int,
        default=0,
        help="the number of trials. This is ignored if x gives both the number of successes and failures",
    )
    parser.add_argument("--b", type=int, default=0, help="The number of bins to use for the histogram")
    parser.add_argument("--N", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof", type=int, default=0, help="Degrees of freedom correction")
    parser.add_argument("--score", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--m", type=float, default=0.0, help="limits")
    parser.add_argument("--mf", type=float, default=2.0, help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9, help="higher_limit")
    parser.add_argument(
        "--p",
        type=float,
        default=0.5,
        help="The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5",
    )
    parser.add_argument("--alpha", type=float, default=0.9, help="probability")
    parser.add_argument("--new", type=float, default=0.0, help="Value to put in place of values in a outside of bounds")
    parser.add_argument(
        "--proportiontocut",
        type=float,
        default=0.0,
        help="Proportion (in range 0-1) of total data set to trim of each end.",
    )
    parser.add_argument(
        "--lambda_",
        type=float,
        default=1.0,
        help="lambda_ gives the power in the Cressie-Read power divergence statistic",
    )
    parser.add_argument(
        "--imbda",
        type=float,
        default=0,
        help="If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.",
    )
    parser.add_argument("--base", type=float, default=1.6, help="The logarithmic base to use, defaults to e")
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")

    args = parser.parse_args()
    infile = args.infile
    outfile = open(args.outfile, "w+")
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2
    sample0 = 0
    sample1 = 0
    sample2 = 0
    if args.sample_cols is not None:
        sample0 = 1
        bartlett_samples = []
        for sample in args.sample_cols.split(";"):
            bartlett_samples.append(map(int, sample.split(",")))
    if args.sample_one_cols is not None:
        sample1 = 1
        sample_one_cols = args.sample_one_cols.split(",")
    if args.sample_two_cols is not None:
        sample_two_cols = args.sample_two_cols.split(",")
        sample2 = 1
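    # Walk the tabular input line by line, pulling the configured 1-based column indices into
    # sample_one / sample_two (and b_samples for the multi-sample tests).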
    for line in open(infile):
        sample_one = []
        sample_two = []
        cols = line.strip().split("\t")
        if sample0 == 1:
            b_samples = columns_to_values(bartlett_samples, line)
        if sample1 == 1:
            for index in sample_one_cols:
                sample_one.append(cols[int(index) - 1])
        if sample2 == 1:
            for index in sample_two_cols:
                sample_two.append(cols[int(index) - 1])
        if test_id.strip() == "describe":
            size, min_max, mean, uv, bs, bk = stats.describe(map(float, sample_one))
            cols.append(size)
            cols.append(min_max)
            cols.append(mean)
            cols.append(uv)
            cols.append(bs)
            cols.append(bk)
        elif test_id.strip() == "mode":
            vals, counts = stats.mode(map(float, sample_one))
            cols.append(vals)
            cols.append(counts)
        elif test_id.strip() == "nanmean":
            m = stats.nanmean(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "kurtosistest":
            z_value, p_value = stats.kurtosistest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "itemfreq":
            freq = stats.itemfreq(map(float, sample_one))
            for list in freq:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "boxcox_llf":
            IIf = stats.boxcox_llf(imbda, map(float, sample_one))
            cols.append(IIf)
        elif test_id.strip() == "tiecorrect":
            fa = stats.tiecorrect(map(float, sample_one))
            cols.append(fa)
        elif test_id.strip() == "rankdata":
            r = stats.rankdata(map(float, sample_one), method=args.md)
            cols.append(r)
        elif test_id.strip() == "nanstd":
            s = stats.nanstd(map(float, sample_one), bias=args.bias)
            cols.append(s)
        elif test_id.strip() == "anderson":
            A2, critical, sig = stats.anderson(map(float, sample_one), dist=args.dist)
            cols.append(A2)
            for list in critical:
                cols.append(list)
            cols.append(",")
            for list in sig:
                cols.append(list)
        elif test_id.strip() == "binom_test":
            p_value = stats.binom_test(map(float, sample_one), n=args.n, p=args.p)
            cols.append(p_value)
        elif test_id.strip() == "gmean":
            gm = stats.gmean(map(float, sample_one), dtype=args.dtype)
            cols.append(gm)
        elif test_id.strip() == "hmean":
            hm = stats.hmean(map(float, sample_one), dtype=args.dtype)
            cols.append(hm)
        elif test_id.strip() == "kurtosis":
            k = stats.kurtosis(map(float, sample_one), axis=args.axis, fisher=args.fisher, bias=args.bias)
            cols.append(k)
        elif test_id.strip() == "moment":
            n_moment = stats.moment(map(float, sample_one), n=args.n)
            cols.append(n_moment)
        elif test_id.strip() == "normaltest":
            k2, p_value = stats.normaltest(map(float, sample_one))
            cols.append(k2)
            cols.append(p_value)
        elif test_id.strip() == "skew":
            skewness = stats.skew(map(float, sample_one), bias=args.bias)
            cols.append(skewness)
        elif test_id.strip() == "skewtest":
            z_value, p_value = stats.skewtest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "sem":
            s = stats.sem(map(float, sample_one), ddof=args.ddof)
            cols.append(s)
        elif test_id.strip() == "zscore":
            z = stats.zscore(map(float, sample_one), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "signaltonoise":
            s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof)
            cols.append(s2n)
        elif test_id.strip() == "percentileofscore":
            p = stats.percentileofscore(map(float, sample_one), score=args.score, kind=args.kind)
            cols.append(p)
        elif test_id.strip() == "bayes_mvs":
            c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one), alpha=args.alpha)
            cols.append(c_mean)
            cols.append(c_var)
            cols.append(c_std)
        elif test_id.strip() == "sigmaclip":
            c, c_low, c_up = stats.sigmaclip(map(float, sample_one), low=args.m, high=args.n)
            cols.append(c)
            cols.append(c_low)
            cols.append(c_up)
        elif test_id.strip() == "kstest":
            d, p_value = stats.kstest(
                map(float, sample_one), cdf=args.cdf, N=args.N, alternative=args.alternative, mode=args.mode
            )
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "chi2_contingency":
            chi2, p, dof, ex = stats.chi2_contingency(
                map(float, sample_one), correction=args.correction, lambda_=args.lambda_
            )
            cols.append(chi2)
            cols.append(p)
            cols.append(dof)
            cols.append(ex)
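        # For the truncated statistics below, mf and nf act as optional lower/upper limits; when both are 0 the unrestricted variant is called.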
        elif test_id.strip() == "tmean":
            if nf == 0 and mf == 0:
                mean = stats.tmean(map(float, sample_one))
            else:
                mean = stats.tmean(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(mean)
        elif test_id.strip() == "tmin":
            if mf == 0:
                min = stats.tmin(map(float, sample_one))
            else:
                min = stats.tmin(map(float, sample_one), lowerlimit=mf, inclusive=args.inclusive)
            cols.append(min)
        elif test_id.strip() == "tmax":
            if nf == 0:
                max = stats.tmax(map(float, sample_one))
            else:
                max = stats.tmax(map(float, sample_one), upperlimit=nf, inclusive=args.inclusive)
            cols.append(max)
        elif test_id.strip() == "tvar":
            if nf == 0 and mf == 0:
                var = stats.tvar(map(float, sample_one))
            else:
                var = stats.tvar(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(var)
        elif test_id.strip() == "tstd":
            if nf == 0 and mf == 0:
                std = stats.tstd(map(float, sample_one))
            else:
                std = stats.tstd(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(std)
        elif test_id.strip() == "tsem":
            if nf == 0 and mf == 0:
                s = stats.tsem(map(float, sample_one))
            else:
                s = stats.tsem(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(s)
        elif test_id.strip() == "scoreatpercentile":
            if nf == 0 and mf == 0:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), interpolation_method=args.interpolation
                )
            else:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), (mf, nf), interpolation_method=args.interpolation
                )
            for list in s:
                cols.append(list)
        elif test_id.strip() == "relfreq":
            if nf == 0 and mf == 0:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b)
            else:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b, (mf, nf))
            for list in rel:
                cols.append(list)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "binned_statistic":
            if nf == 0 and mf == 0:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one), map(float, sample_two), statistic=args.statistic, bins=args.b
                )
            else:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one),
                    map(float, sample_two),
                    statistic=args.statistic,
                    bins=args.b,
                    range=(mf, nf),
                )
            cols.append(st)
            cols.append(b_edge)
            cols.append(b_n)
        elif test_id.strip() == "threshold":
            if nf == 0 and mf == 0:
                o = stats.threshold(map(float, sample_one), newval=args.new)
            else:
                o = stats.threshold(map(float, sample_one), mf, nf, newval=args.new)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trimboth":
            o = stats.trimboth(map(float, sample_one), proportiontocut=args.proportiontocut)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trim1":
            t1 = stats.trim1(map(float, sample_one), proportiontocut=args.proportiontocut, tail=args.tail)
            for list in t1:
                cols.append(list)
        elif test_id.strip() == "histogram":
            if nf == 0 and mf == 0:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b)
            else:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b, (mf, nf))
            cols.append(hi)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "cumfreq":
            if nf == 0 and mf == 0:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b)
            else:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b, (mf, nf))
            cols.append(cum)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "boxcox_normmax":
            if nf == 0 and mf == 0:
                ma = stats.boxcox_normmax(map(float, sample_one))
            else:
                ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf), method=args.method)
            cols.append(ma)
        elif test_id.strip() == "boxcox":
            if imbda == 0:
                box, ma, ci = stats.boxcox(map(float, sample_one), alpha=args.alpha)
                cols.append(box)
                cols.append(ma)
                cols.append(ci)
            else:
                box = stats.boxcox(map(float, sample_one), imbda, alpha=args.alpha)
                cols.append(box)
        elif test_id.strip() == "histogram2":
            h2 = stats.histogram2(map(float, sample_one), map(float, sample_two))
            for list in h2:
                cols.append(list)
        elif test_id.strip() == "ranksums":
            z_statistic, p_value = stats.ranksums(map(float, sample_one), map(float, sample_two))
            cols.append(z_statistic)
            cols.append(p_value)
        elif test_id.strip() == "ttest_1samp":
            t, prob = stats.ttest_1samp(map(float, sample_one), map(float, sample_two))
            for list in t:
                cols.append(list)
            for list in prob:
                cols.append(list)
        elif test_id.strip() == "ansari":
            AB, p_value = stats.ansari(map(float, sample_one), map(float, sample_two))
            cols.append(AB)
            cols.append(p_value)
        elif test_id.strip() == "linregress":
            slope, intercept, r_value, p_value, stderr = stats.linregress(
                map(float, sample_one), map(float, sample_two)
            )
            cols.append(slope)
            cols.append(intercept)
            cols.append(r_value)
            cols.append(p_value)
            cols.append(stderr)
        elif test_id.strip() == "pearsonr":
            cor, p_value = stats.pearsonr(map(float, sample_one), map(float, sample_two))
            cols.append(cor)
            cols.append(p_value)
        elif test_id.strip() == "pointbiserialr":
            r, p_value = stats.pointbiserialr(map(float, sample_one), map(float, sample_two))
            cols.append(r)
            cols.append(p_value)
        elif test_id.strip() == "ks_2samp":
            d, p_value = stats.ks_2samp(map(float, sample_one), map(float, sample_two))
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "mannwhitneyu":
            mw_stats_u, p_value = stats.mannwhitneyu(
                map(float, sample_one), map(float, sample_two), use_continuity=args.mwu_use_continuity
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "zmap":
            z = stats.zmap(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "ttest_ind":
            mw_stats_u, p_value = stats.ttest_ind(
                map(float, sample_one), map(float, sample_two), equal_var=args.equal_var
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "ttest_rel":
            t, prob = stats.ttest_rel(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(t)
            cols.append(prob)
        elif test_id.strip() == "mood":
            z, p_value = stats.mood(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(z)
            cols.append(p_value)
        elif test_id.strip() == "shapiro":
            W, p_value, a = stats.shapiro(map(float, sample_one), map(float, sample_two), args.reta)
            cols.append(W)
            cols.append(p_value)
            for list in a:
                cols.append(list)
        elif test_id.strip() == "kendalltau":
            k, p_value = stats.kendalltau(
                map(float, sample_one), map(float, sample_two), initial_lexsort=args.initial_lexsort
            )
            cols.append(k)
            cols.append(p_value)
        elif test_id.strip() == "entropy":
            s = stats.entropy(map(float, sample_one), map(float, sample_two), base=args.base)
            cols.append(s)
        elif test_id.strip() == "spearmanr":
            if sample2 == 1:
                rho, p_value = stats.spearmanr(map(float, sample_one), map(float, sample_two))
            else:
                rho, p_value = stats.spearmanr(map(float, sample_one))
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "wilcoxon":
            if sample2 == 1:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one),
                    map(float, sample_two),
                    zero_method=args.zero_method,
                    correction=args.correction,
                )
            else:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one), zero_method=args.zero_method, correction=args.correction
                )
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "chisquare":
            if sample2 == 1:
                rho, p_value = stats.chisquare(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            else:
                rho, p_value = stats.chisquare(map(float, sample_one), ddof=args.ddof)
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "power_divergence":
            if sample2 == 1:
                stat, p_value = stats.power_divergence(
                    map(float, sample_one), map(float, sample_two), ddof=args.ddof, lambda_=args.lambda_
                )
            else:
                stat, p_value = stats.power_divergence(map(float, sample_one), ddof=args.ddof, lambda_=args.lambda_)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "theilslopes":
            if sample2 == 1:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), map(float, sample_two), alpha=args.alpha)
            else:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), alpha=args.alpha)
            cols.append(mpe)
            cols.append(met)
            cols.append(lo)
            cols.append(up)
        elif test_id.strip() == "combine_pvalues":
            if sample2 == 1:
                stat, p_value = stats.combine_pvalues(
                    map(float, sample_one), method=args.med, weights=map(float, sample_two)
                )
            else:
                stat, p_value = stats.combine_pvalues(map(float, sample_one), method=args.med)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "obrientransform":
            ob = stats.obrientransform(*b_samples)
            for list in ob:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "f_oneway":
            f_value, p_value = stats.f_oneway(*b_samples)
            cols.append(f_value)
            cols.append(p_value)
        elif test_id.strip() == "kruskal":
            h, p_value = stats.kruskal(*b_samples)
            cols.append(h)
            cols.append(p_value)
        elif test_id.strip() == "friedmanchisquare":
            fr, p_value = stats.friedmanchisquare(*b_samples)
            cols.append(fr)
            cols.append(p_value)
        elif test_id.strip() == "fligner":
            xsq, p_value = stats.fligner(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(xsq)
            cols.append(p_value)
        elif test_id.strip() == "bartlett":
            T, p_value = stats.bartlett(*b_samples)
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "levene":
            w, p_value = stats.levene(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(w)
            cols.append(p_value)
        elif test_id.strip() == "median_test":
            stat, p_value, m, table = stats.median_test(
                ties=args.ties, correction=args.correction, lambda_=args.lambda_, *b_samples
            )
            cols.append(stat)
            cols.append(p_value)
            cols.append(m)
            cols.append(table)
            for list in table:
                elements = ",".join(map(str, list))
                cols.append(elements)
        outfile.write("%s\n" % "\t".join(map(str, cols)))
    outfile.close()
def plotExpBox_Main(inputFiles, headers, valcols, outputFile, sep, startRow,
                    showIndPoints, mark, markMean, showMean, notch, whisker,
                    outliers, plotPvalueCluster, outputClusterPrefix,
                    methodCluster, xlegendrotation, xlabe, ylabe, figsz, titl,
                    showSampleSizes, trimToMinSize, relabels, logb,
                    plotHistogramToFile, plotMedianForGroups, botta,
                    showViolin, showBox, firstColAnnot, plotTrend, showLegend,
                    makePzfxFile, makeBinMatrix, writeDataSummaryStat,
                    summaryStatRange, minuslog10pvalue, minNDataToKeep,
                    vfacecolor, valpha, outXYZPvalues, dividePlots):

    #if plotPvalueCluster:
    #if pvalue cluster is needed:
    #	from Bio.Cluster.cluster import *
    #	from Bio.Cluster import *
    #endif

    #the real deal!
    plotData = []
    xtickLabels = []

    trendData = {}
    annot = {}

    minSize = -1

    for inputFile, header, cols in zip(inputFiles, headers, valcols):
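        # For each input file: open it, register one plot series per selected column, then read the
        # rows (optionally collecting per-row annotations and trend data).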
        fin = generic_istream(inputFile)

        startIdx = len(plotData)

        if firstColAnnot:
            colAnnot = cols[0]
            cols = cols[1:]
            annotThisFile = []
            annot[startIdx] = annotThisFile
        else:
            colAnnot = -1
            annotThisFile = None

        for col in cols:
            plotData.append([])
            xtickLabels.append(header[col])

        colIndices = range(startIdx, startIdx + len(cols))

        if plotTrend:
            #print >> stderr,"plotTrend"
            trendDataThisFile = []
            trendData[startIdx] = trendDataThisFile
        else:
            trendDataThisFile = None

        lino = 0
        for lin in fin:
            lino += 1
            if lino < startRow:
                continue
            fields = lin.rstrip("\r\n").split(sep)

            if plotTrend:
                #print >> stderr,"a"
                trendDataThisLine = []
            else:
                trendDataThisLine = None

            allDataOKThisLine = True

            if colAnnot >= 0:
                annotThisFile.append(fields[colAnnot])

            for idx, col in zip(colIndices, cols):
                try:
                    value = float(fields[col])
                    if logb != 0:
                        if value == 0.0:
                            raise ValueError
                        value = log(value) / logb
                    plotData[idx].append(value)

                    if plotTrend:
                        trendDataThisLine.append(value)
                        #print >> stderr,"value:",value

                except:
                    allDataOKThisLine = False

            if plotTrend:
                if allDataOKThisLine:
                    trendDataThisFile.append(trendDataThisLine)
                else:
                    trendDataThisFile.append(None)

        fin.close()

        if minSize == -1:
            minSize = len(plotData[idx])  #or startIDX?
        else:
            minSize = min([minSize, len(plotData[idx])])

    if trimToMinSize:
        print >> stderr, "trimming to min size =", minSize
        trimData(plotData, minSize)

    if len(relabels) > 0:
        #if len(relabels)!=len(xtickLabels):
        #	print >> stderr,"relabels doesn't have the same length as original label vectors",xtickLabels,"=>",relabels
        #	exit()
        print >> stderr, xtickLabels
        print >> stderr, relabels
        for i, relabel in zip(range(0, len(relabels)), relabels):
            xtickLabels[i] = relabel

    for i in range(0, len(plotMedianForGroups)):
        plotMedianForGroups[i] = getCol0ListFromCol1ListStringAdv(
            xtickLabels, plotMedianForGroups[i])

    #drawing medians:
    medianToDraw = []
    for mediangrouper in plotMedianForGroups:
        curD = []
        for c in mediangrouper:
            curD.extend(plotData[c])
        medianToDraw.append(median(curD))

    for c in range(len(plotData) - 1, -1, -1):
        if len(plotData[c]) < minNDataToKeep:
            print >> stderr, xtickLabels[c], "discarded because has only", len(
                plotData[c]), "data points <", minNDataToKeep
            del plotData[c]
            del xtickLabels[c]

    if not skipStat:
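        # skipStat is presumably a module-level flag; it is not among this function's parameters.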
        print >> stdout, "student t-test (1 sample; mean=0)"
        print >> stdout, "sample", "mean", "p-val", "median"

        if writeDataSummaryStat:
            fDSS = open(writeDataSummaryStat, "w")
            print >> fDSS, "sample\tmean\tvar\tsd\tmin\tmax\tN\tNInRange[" + str(
                summaryStatRange[0]) + "," + str(
                    summaryStatRange[1]
                ) + "]\t%NInRange\tNbelowRange\t%Nbelow\tNAboveRange\t%NAbove"

        for x in range(0, len(plotData)):
            #print >> stderr, len(plotData[x])
            try:
                print >> stdout, xtickLabels[x], mean(
                    plotData[x]), ttest_1samp(plotData[x],
                                              0)[1], median(plotData[x])
            except:
                print >> stdout, xtickLabels[x], mean(
                    plotData[x]), "NA", median(plotData[x])

            if writeDataSummaryStat:
                sumData, N, NIN, NBelow, NAbove = filterDataInRangeInclusive(
                    plotData[x], summaryStatRange[0], summaryStatRange[1])

                if NIN > 1:
                    #print >> stderr,"sumData=",sumData
                    #print >> stderr,mean
                    mea = mean2(sumData)
                    DDOF = 1
                    sd = std(sumData, ddof=DDOF)
                    var = sd * sd
                    mi = min(sumData)
                    ma = max(sumData)
                else:
                    mea = "NA"
                    sd = "NA"
                    var = "NA"
                    mi = "NA"
                    ma = "NA"

                print >> fDSS, xtickLabels[x] + "\t" + str(mea) + "\t" + str(
                    var) + "\t" + str(sd) + "\t" + str(mi) + "\t" + str(
                        ma) + "\t" + str(N) + "\t" + str(NIN) + "\t" + str(
                            float(NIN) * 100 /
                            N) + "\t" + str(NBelow) + "\t" + str(
                                float(NBelow) * 100 /
                                N) + "\t" + str(NAbove) + "\t" + str(
                                    float(NAbove) * 100 / N)

        pvalueM = []

        if writeDataSummaryStat:
            fDSS.close()

        print >> stdout, ""

        print >> stdout, "student t-test (2 samples)"
        print >> stdout, "p-val",
        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        print >> stdout, ""

        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    try:
                        pvalue = ttest_ind(plotData[x], plotData[y])[1]
                    except:
                        pvalue = 1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, str(pvalue),
                    pvalueRow.append(pvalue)
            print >> stdout, ""

        print >> stdout, ""

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_t_raw", xtickLabels,
                              pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_t", xtickLabels,
                                  pvalueM, methodCluster)

        pvalueM = []

        print >> stdout, "welch t-test"
        print >> stdout, "p-val",
        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])

                else:
                    try:
                        pvalue = welchs_approximate_ttest_arr(
                            plotData[x], plotData[y])[3]
                    except:
                        pvalue = 1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, str(pvalue),
                    pvalueRow.append(pvalue)
            print >> stdout, ""

        if outXYZPvalues:
            writeXYZPvalues(outXYZPvalues + "_Welch.xyz", xtickLabels, pvalueM)

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_Welch_raw", xtickLabels,
                              pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_Welch", xtickLabels,
                                  pvalueM, methodCluster)

        print >> stdout, ""
        print >> stdout, "non-parametric (Mann-Whitney U)"  #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)"
        print >> stdout, "p-val",
        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        pvalueM = []

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    #if max(len(plotData[x]),len(plotData[y]))<=20:
                    try:
                        pvalue = mannwhitneyu(plotData[x], plotData[y])[1] * 2
                    except:
                        pvalue = 1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, pvalue,  # Mann-Whitney U p-values are one-tailed here, so multiply by 2 for a two-tailed test
                    pvalueRow.append(pvalue)
                    #else:
                    #	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
            print >> stdout, ""

        if outXYZPvalues:
            writeXYZPvalues(outXYZPvalues + "_U.xyz", xtickLabels, pvalueM)

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_U_raw", xtickLabels,
                              pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_U", xtickLabels,
                                  pvalueM, methodCluster)

        #####now the variance tests

        print >> stdout, ""
        print >> stdout, "Ansari-Bradley Two-sample Test for difference in scale parameters "
        print >> stdout, "p-val",

        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        pvalueM = []

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    #if max(len(plotData[x]),len(plotData[y]))<=20:
                    try:
                        pvalue = ansari(plotData[x], plotData[y])[1]
                    except:
                        pvalue = "NA"

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                        #pvalue=1.0
                    print >> stdout, pvalue,
                    pvalueRow.append(pvalue)
                    #else:
                    #	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
            print >> stdout, ""

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_Ansari_raw", xtickLabels,
                              pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_Ansari", xtickLabels,
                                  pvalueM, methodCluster)

        #####

        #####now the variance tests

        print >> stdout, ""
        print >> stdout, "Fligner's Two-sample Test for equal variance (non-parametrics)"
        print >> stdout, "p-val",

        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        pvalueM = []

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    #if max(len(plotData[x]),len(plotData[y]))<=20:
                    try:
                        pvalue = fligner(plotData[x], plotData[y])[1]
                    except:
                        pvalue = "NA"
                        #pvalue=1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, pvalue,
                    pvalueRow.append(pvalue)
                    #else:
                    #	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
            print >> stdout, ""

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_fligner_raw",
                              xtickLabels, pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_fligner",
                                  xtickLabels, pvalueM, methodCluster)

        #####

        #####now the variance tests

        print >> stdout, ""
        print >> stdout, "Levene's Two-sample Test for equal variance"
        print >> stdout, "p-val",

        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        pvalueM = []

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    #if max(len(plotData[x]),len(plotData[y]))<=20:
                    try:
                        pvalue = levene(plotData[x], plotData[y])[1]
                    except:
                        pvalue = "NA"
                        #pvalue=1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, pvalue,
                    pvalueRow.append(pvalue)
                    #else:
                    #	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
            print >> stdout, ""

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_levene_raw", xtickLabels,
                              pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_levene", xtickLabels,
                                  pvalueM, methodCluster)

        #####

        #####now the variance tests

        print >> stdout, ""
        print >> stdout, "Bartlett's Two-sample Test for equal variance (for normal distributions)"
        print >> stdout, "p-val",

        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        pvalueM = []

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    #if max(len(plotData[x]),len(plotData[y]))<=20:
                    try:
                        pvalue = bartlett(plotData[x], plotData[y])[1]
                    except:
                        pvalue = "NA"
                        #pvalue=1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, pvalue,
                    pvalueRow.append(pvalue)
                    #else:
                    #	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
            print >> stdout, ""

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_bartlett_raw",
                              xtickLabels, pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_bartlett",
                                  xtickLabels, pvalueM, methodCluster)

        #####

    figure(figsize=figsz)
    subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)

    if len(titl) == 0:
        titl = outputFile

    plotExpBox(plotData, xtickLabels, showIndPoints, mark, markMean, showMean,
               notch, whisker, outliers, xlegendrotation, xlabe, ylabe, titl,
               showSampleSizes, showViolin, showBox, annot, trendData,
               showLegend, makePzfxFile, makeBinMatrix, dividePlots)

    #ylim([0,200])
    for m in medianToDraw:
        axhline(y=m, linestyle=':', color='gray')

    savefig(outputFile, bbox_inches="tight")

    if len(plotHistogramToFile) > 0:
        drawHistogram(plotHistogramToFile, plotData, xtickLabels)
        drawDensigram(plotHistogramToFile + ".density.png", plotData,
                      xtickLabels)
Beispiel #25
0
 def test_small(self):
     x = [1, 2, 3, 3, 4]
     y = [3, 2, 6, 1, 6, 1, 4, 1]
     W, pval = stats.ansari(x, y)
     assert_almost_equal(W, 23.5, 11)
     assert_almost_equal(pval, 0.13499256881897437, 11)
Beispiel #26
0
 def test_result_attributes(self):
     x = [1, 2, 3, 3, 4]
     y = [3, 2, 6, 1, 6, 1, 4, 1]
     res = stats.ansari(x, y)
     attributes = ('statistic', 'pvalue')
     check_named_results(res, attributes)
Beispiel #27
0
def features(parameters: dict, source_id: int):
    print("Start Table: {0}".format(source_id))

    db_distribution = pymysql.connect(parameters['hostname'],
                                      parameters['username'],
                                      parameters['password'],
                                      parameters["database_distribution"],
                                      charset='utf8mb4')
    cursor_distribution = db_distribution.cursor()

    sql = "TRUNCATE `{0}`.`evaluation_{1}`;".format(
        parameters["database_distribution"], source_id)
    cursor_distribution.execute(sql)

    sql_cols = generate_cols(parameters, "DIS")
    sql = "SELECT `IPDDIS`,`COUNTDIS`,{2} FROM `{0}`.`distribution_{1}` " \
          "WHERE `PARAID`='0' and `SOURCEID`='{1}' LIMIT 1;".format(parameters["database_distribution"], source_id, sql_cols)
    cursor_distribution.execute(sql)
    cursor_result = cursor_distribution.fetchone()
    referenceIPDValues = dict_to_dis_list(json_map_to_dict(cursor_result[0]))
    referenceCOUNTValues = dict_to_dis_list(json_map_to_dict(cursor_result[1]))
    referenceWINValues = dict()
    columeBegin = 2
    for index in range(int(parameters["WIN_BEGIN"]),
                       int(parameters["WIN_END"]) + 1,
                       int(parameters["WIN_STEP"])):
        referenceWINValues[index] = dict_to_dis_list(
            json_map_to_dict(cursor_result[columeBegin]))
        columeBegin = columeBegin + 1

    sql_cols = generate_cols(parameters, "PMF")
    sql = "SELECT `IPDPMF`,`COUNTPMF`,{2} FROM `{0}`.`pmf_{1}` " \
          "WHERE `PARAID`='0' and `SOURCEID`='{1}' LIMIT 1;".format(parameters["database_distribution"], source_id, sql_cols)
    cursor_distribution.execute(sql)
    cursor_result = cursor_distribution.fetchone()
    referenceIPDPMF = json_map_to_dict(cursor_result[0])
    referenceCOUNTPMF = json_map_to_dict(cursor_result[1])
    referenceWINPMF = dict()
    columeBegin = 2
    for index in range(int(parameters["WIN_BEGIN"]),
                       int(parameters["WIN_END"]) + 1,
                       int(parameters["WIN_STEP"])):
        referenceWINPMF[index] = json_map_to_dict(cursor_result[columeBegin])
        columeBegin = columeBegin + 1

    sql_cols = generate_cols(parameters, "CDF")
    sql = "SELECT `IPDCDF`,`COUNTCDF`,{2} FROM `{0}`.`cdf_{1}` " \
          "WHERE `PARAID`='0' and `SOURCEID`='{1}' LIMIT 1;".format(parameters["database_distribution"], source_id, sql_cols)
    cursor_distribution.execute(sql)
    cursor_result = cursor_distribution.fetchone()
    referenceIPDCDF = json_map_to_dict(cursor_result[0])
    referenceCOUNTCDF = json_map_to_dict(cursor_result[1])
    referenceWINCDF = dict()
    columeBegin = 2
    for index in range(int(parameters["WIN_BEGIN"]),
                       int(parameters["WIN_END"]) + 1,
                       int(parameters["WIN_STEP"])):
        referenceWINCDF[index] = json_map_to_dict(cursor_result[columeBegin])
        columeBegin = columeBegin + 1

    sql = "SELECT `ID` FROM `{0}`.`distribution_{1}` WHERE" \
          " `ID`!=(SELECT `ID` FROM `{0}`.`distribution_{1}` WHERE `SOURCEID`='{1}' AND `PARAID`='0' LIMIT 1);".format(
        parameters["database_distribution"], source_id)
    cursor_distribution.execute(sql)
    cursor_result = cursor_distribution.fetchall()
    ids = list()
    for id in cursor_result:
        ids.append(int(id[0]))
    print("Table {0} : {1}".format(source_id, ids))

    for id in ids:
        sql_cols = generate_cols(parameters, "DIS")
        sql = "SELECT `SOURCEID`,`PARAID`,`IPDDIS`,`COUNTDIS`,{3} " \
              "FROM `{0}`.`distribution_{1}` WHERE `ID`='{2}';".format(
              parameters["database_distribution"], source_id, id, sql_cols)
        cursor_distribution.execute(sql)
        cursor_result = cursor_distribution.fetchone()
        sourceID = cursor_result[0]
        paraID = cursor_result[1]
        currentIPDValues = dict_to_dis_list(json_map_to_dict(cursor_result[2]))
        currentCOUNTValues = dict_to_dis_list(
            json_map_to_dict(cursor_result[3]))
        currentWINValues = dict()
        columeBegin = 4
        for index in range(int(parameters["WIN_BEGIN"]),
                           int(parameters["WIN_END"]) + 1,
                           int(parameters["WIN_STEP"])):
            currentWINValues[index] = dict_to_dis_list(
                json_map_to_dict(cursor_result[columeBegin]))
            columeBegin = columeBegin + 1

        sql_cols = generate_cols(parameters, "PMF")
        sql = "SELECT `IPDPMF`,`COUNTPMF`,{3} FROM `{0}`.`pmf_{1}` WHERE `PARAID`='{2}' AND `SOURCEID`='{4}';".format(
            parameters["database_distribution"], source_id, paraID, sql_cols,
            sourceID)
        cursor_distribution.execute(sql)
        cursor_result = cursor_distribution.fetchone()
        currentIPDPMF = json_map_to_dict(cursor_result[0])
        currentCOUNTPMF = json_map_to_dict(cursor_result[1])
        currentWINPMF = dict()
        columeBegin = 2
        for index in range(int(parameters["WIN_BEGIN"]),
                           int(parameters["WIN_END"]) + 1,
                           int(parameters["WIN_STEP"])):
            currentWINPMF[index] = json_map_to_dict(cursor_result[columeBegin])
            columeBegin = columeBegin + 1

        sql_cols = generate_cols(parameters, "CDF")
        sql = "SELECT `IPDCDF`,`COUNTCDF`,{3} FROM `{0}`.`cdf_{1}` WHERE `PARAID`='{2}' AND `SOURCEID`='{4}';".format(
            parameters["database_distribution"], source_id, paraID, sql_cols,
            sourceID)
        cursor_distribution.execute(sql)
        cursor_result = cursor_distribution.fetchone()
        currentIPDCDF = json_map_to_dict(cursor_result[0])
        currentCOUNTCDF = json_map_to_dict(cursor_result[1])
        currentWINCDF = dict()
        columeBegin = 2
        for index in range(int(parameters["WIN_BEGIN"]),
                           int(parameters["WIN_END"]) + 1,
                           int(parameters["WIN_STEP"])):
            currentWINCDF[index] = json_map_to_dict(cursor_result[columeBegin])
            columeBegin = columeBegin + 1

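        # Reduce both IPD PMFs to their non-zero entries and align them on a
        # common support so that the KL divergence below is well defined.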
        nonzero_referenceIPDPMF = pmf_to_list(referenceIPDPMF)
        nonzero_currentIPDPMF = pmf_to_list(currentIPDPMF)
        aligned_IPDPMF = align_pmf(nonzero_currentIPDPMF,
                                   nonzero_referenceIPDPMF)

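        # IPD comparison: two-sample KS test, Welch's t-test, KL divergence
        # (clamped to 1.0 when infinite), Mann-Whitney U and Ansari-Bradley
        # tests on the raw values, plus Wasserstein and energy distances on
        # the truncated CDFs.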
        ks_ipd = stats.ks_2samp(referenceIPDValues, currentIPDValues)
        ttest_ipd = stats.ttest_ind(referenceIPDValues,
                                    currentIPDValues,
                                    axis=0,
                                    equal_var=False)
        kld_ipd = stats.entropy(aligned_IPDPMF[0], aligned_IPDPMF[1])
        if math.isinf(kld_ipd):
            kld_ipd = 1.0
        whitney_ipd = stats.mannwhitneyu(referenceIPDValues,
                                         currentIPDValues,
                                         use_continuity=False,
                                         alternative='two-sided')
        ansari_ipd = stats.ansari(referenceIPDValues, currentIPDValues)

        max_ipd_cdf_count = min(len(referenceIPDCDF), len(currentIPDCDF))
        wasserstein_ipd = wasserstein_distance_cdf(
            cdf_to_list(referenceIPDCDF)[:max_ipd_cdf_count],
            cdf_to_list(currentIPDCDF)[:max_ipd_cdf_count])
        energy_ipd = energy_distance_cdf(
            cdf_to_list(referenceIPDCDF)[:max_ipd_cdf_count],
            cdf_to_list(currentIPDCDF)[:max_ipd_cdf_count])
        print("PARAID = {0} : ".format(paraID), end='\t')
        print(
            "IPD [ks={0},\t ttest={1},\t kld={2},\t whitney={3},\t ansari={4},\t wasserstein={5},\t energy={6}]"
            .format(ks_ipd[1], ttest_ipd[1], kld_ipd, whitney_ipd[1],
                    ansari_ipd[1], wasserstein_ipd, energy_ipd),
            end='\t')

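        # COUNT comparison: the same battery of tests and distances, applied
        # to the COUNT distributions.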
        ansari_count = stats.ansari(referenceCOUNTValues, currentCOUNTValues)

        nonzero_referenceCOUNTPMF = pmf_to_list(referenceCOUNTPMF)
        nonzero_currentCOUNTPMF = pmf_to_list(currentCOUNTPMF)
        aligned_COUNTPMF = align_pmf(nonzero_currentCOUNTPMF,
                                     nonzero_referenceCOUNTPMF)
        kld_count = stats.entropy(aligned_COUNTPMF[0], aligned_COUNTPMF[1])
        if math.isinf(kld_count):
            kld_count = 1.0
        ks_count = stats.ks_2samp(referenceCOUNTValues, currentCOUNTValues)
        ttest_count = stats.ttest_ind(referenceCOUNTValues,
                                      currentCOUNTValues,
                                      axis=0,
                                      equal_var=False)
        whitney_count = stats.mannwhitneyu(referenceCOUNTValues,
                                           currentCOUNTValues,
                                           use_continuity=False,
                                           alternative='two-sided')

        aligned_referenceCOUNTCDF = cdf_to_list(referenceCOUNTCDF)
        aligned_currentCOUNTCDF = cdf_to_list(currentCOUNTCDF)
        max_count_cdf = min(len(aligned_referenceCOUNTCDF),
                            len(aligned_currentCOUNTCDF))
        wasserstein_count = wasserstein_distance_cdf(
            aligned_referenceCOUNTCDF[:max_count_cdf],
            aligned_currentCOUNTCDF[:max_count_cdf])
        energy_count = energy_distance_cdf(
            aligned_referenceCOUNTCDF[:max_count_cdf],
            aligned_currentCOUNTCDF[:max_count_cdf])
        print(
            "COUNT [ks={0},\t ttest={1},\t whitney={2},\t ansari={3},\t kld={4},\t wasserstein={5},\t energy={6}]"
            .format(ks_count[1], ttest_count[1], whitney_count[1],
                    ansari_count[1], kld_count, wasserstein_count,
                    energy_count),
            end='\t')

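        # Per-window comparison: for every window size from WIN_BEGIN to
        # WIN_END (step WIN_STEP), compute the KL divergence plus the
        # Wasserstein and energy distances against the reference.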
        kld_win = dict()
        wasserstein_win = dict()
        energy_win = dict()
        for index in range(int(parameters["WIN_BEGIN"]),
                           int(parameters["WIN_END"]) + 1,
                           int(parameters["WIN_STEP"])):
            nonzero_referenceWINCDF = pmf_to_list(referenceWINCDF[index])
            nonzero_currentWINCDF = pmf_to_list(currentWINCDF[index])
            aligned_WINCDF = align_pmf(nonzero_currentWINCDF,
                                       nonzero_referenceWINCDF)
            kld_win[index] = stats.entropy(aligned_WINCDF[0],
                                           aligned_WINCDF[1])

            # nonzero_referenceWINPMF = pmf_to_list(referenceWINPMF[index])
            # nonzero_currentWINPMF = pmf_to_list(currentWINPMF[index])
            # aligned_WINPMF = align_pmf(nonzero_currentWINPMF, nonzero_referenceWINPMF)
            # kld_win[index] = stats.entropy(aligned_WINPMF[0], aligned_WINPMF[1])

            # kld_win[index] = stats.entropy(cdf_to_list(referenceWINCDF[index]), cdf_to_list(currentWINCDF[index]))
            # wasserstein_win[index] = stats.wasserstein_distance(cdf_to_list(referenceWINCDF[index]), cdf_to_list(currentWINCDF[index]))
            # energy_win[index] = stats.energy_distance(cdf_to_list(referenceWINCDF[index]), cdf_to_list(currentWINCDF[index]))
            # wasserstein_win[index] = stats.wasserstein_distance(pmf_to_list(referenceWINPMF[index]), pmf_to_list(currentWINPMF[index]))
            # energy_win[index] = stats.energy_distance(pmf_to_list(referenceWINPMF[index]), pmf_to_list(currentWINPMF[index]))
            # print("WIN{0} [kld={1},\t wasserstein={2},\t energy={3}]".format(
            #     index, kld_win[index], wasserstein_win[index], energy_win[index]), end='\t')
            wasserstein_win[index] = wasserstein_distance_cdf(
                cdf_to_list(referenceWINCDF[index]),
                cdf_to_list(currentWINCDF[index]))
            energy_win[index] = energy_distance_cdf(
                cdf_to_list(referenceWINCDF[index]),
                cdf_to_list(currentWINCDF[index]))

        print("")

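        # Assemble the per-window column names and values dynamically, then
        # insert all IPD, COUNT and window metrics into the evaluation table.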
        col_names = ""
        col_values = ""
        for key, value in kld_win.items():
            col_names = col_names + "`KLD-WIN{0}`,`WASSERSTEIN-WIN{0}`,`ENERGY-WIN{0}`,".format(
                key)

            if math.isinf(value):
                value = 1.0  # clamp infinite KL divergence, as for kld_ipd/kld_count above
            col_values = col_values + "'{0}','{1}','{2}',".format(
                value, wasserstein_win[key], energy_win[key])
        sql = "INSERT INTO `{0}`.`evaluation_{1}` " \
              "({2}`KSP-IPD`,`KLD-IPD`,`TTESTP-IPD`,`WHITNEYP-IPD`,`ANSARIP-IPD`,`WASSERSTEIN-IPD`,`ENERGY-IPD`," \
              "`KSP-COUNT`,`KLD-COUNT`,`TTESTP-COUNT`,`WHITNEYP-COUNT`,`ANSARIP-COUNT`,`WASSERSTEIN-COUNT`,`ENERGY-COUNT`," \
              "`SOURCEID`,`PARAID`)" \
              " VALUES({3}'{4}','{5}','{6}','{7}','{8}','{9}','{10}','{11}','{12}','{13}','{14}','{15}','{16}','{17}'," \
              "'{18}', '{19}');".format(
            parameters["database_distribution"], source_id, col_names, col_values,
            ks_ipd[1], kld_ipd, ttest_ipd[1], whitney_ipd[1], ansari_ipd[1], wasserstein_ipd, energy_ipd,
            ks_count[1], kld_count, ttest_count[1], whitney_count[1], ansari_count[1], wasserstein_count, energy_count,
            sourceID, paraID)

        # print(sql)
        cursor_distribution.execute(sql)

    db_distribution.commit()
    cursor_distribution.close()
    db_distribution.close()
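
# A minimal, self-contained sketch of the comparison battery used above, run
# on two small synthetic samples with plain scipy.stats calls.  The
# project-specific helpers in the function above (json_map_to_dict,
# dict_to_dis_list, pmf_to_list, align_pmf, cdf_to_list,
# wasserstein_distance_cdf, energy_distance_cdf) are assumed to be defined
# elsewhere in the source; here plain histograms stand in for the aligned
# PMFs and pre-computed CDFs, and compare_samples is only an illustrative name.
import numpy as np
from scipy import stats

def compare_samples(reference, current):
    """Return the p-values and distances computed in the example above."""
    ks = stats.ks_2samp(reference, current)                       # equality of distributions
    ttest = stats.ttest_ind(reference, current, equal_var=False)  # Welch's t-test on the means
    whitney = stats.mannwhitneyu(reference, current,
                                 use_continuity=False,
                                 alternative='two-sided')         # central parameter
    ansari = stats.ansari(reference, current)                     # scale parameter
    # Histogram-based PMFs on a shared support for the KL divergence.
    bins = np.histogram_bin_edges(np.concatenate([reference, current]), bins=10)
    ref_pmf, _ = np.histogram(reference, bins=bins, density=True)
    cur_pmf, _ = np.histogram(current, bins=bins, density=True)
    kld = stats.entropy(cur_pmf + 1e-12, ref_pmf + 1e-12)         # epsilon avoids log(0)
    wasserstein = stats.wasserstein_distance(reference, current)
    energy = stats.energy_distance(reference, current)
    return {
        'ks_p': ks.pvalue, 'ttest_p': ttest.pvalue,
        'whitney_p': whitney.pvalue, 'ansari_p': ansari.pvalue,
        'kld': kld, 'wasserstein': wasserstein, 'energy': energy,
    }

rng = np.random.default_rng(0)
print(compare_samples(rng.normal(0, 1, 200), rng.normal(0, 1.5, 200)))
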
 def test_ansariBradleyTest_exact_xResult(self):
     data_1 = [-63, 18, 84, 160, 33, -82, 49, 74, 58, -31, 151]
     data_2 = [78, -124, -443, 225, -9, -3, 189, 164, 119, 184]
     x1, p1 = ansari_bradley_test(data_1, data_2, alternative="two-sided")
     x2, p2 = ansari(data_1, data_2)
     assert pytest.approx(x2) == x1