Ejemplo n.º 1
0
def sliding_MWU(values):
    """
    RETURN
    :param values:
    :return:
    """
    # ADD MEDIAN TO EITHER SIDE OF values
    prefix = [np.median(values[: i + weight_radius]) for i in range(weight_radius)]
    suffix = [
        np.median(values[-i - weight_radius:])
        for i in reversed(range(weight_radius))
    ]
    combined = np.array(prefix + list(values) + suffix)
    b = combined.itemsize
    window = as_strided(
        combined, shape=(len(values), weight_radius * 2), strides=(b, b)
    )

    med = (len(median_weight) + 1) / 2
    m_score = np.array(
        [
            stats.mannwhitneyu(
                w[:weight_radius],
                w[-weight_radius:],
                use_continuity=True,
                alternative="two-sided",
            )
            for v in window
            for r in [rankdata(v)]
            for w in [(r - med) * median_weight]
        ]
    )

    return m_score
Ejemplo n.º 2
0
def jitter_MWU(values, start, mid, end):
    """
    RETURN A BETTER MIDPOINT< ACCOUNTING FOR t-test RESULTS
    """

    # ADD SOME CONSTRAINTS TO THE RANGE OF VALUES TESTED
    m_start = min(mid, max(start + MIN_POINTS, mid - JITTER))
    m_end = max(mid, min(mid + JITTER, end - MIN_POINTS))
    if m_start == m_end:
        return no_good_edge, no_good_edge, mid
    mids = np.array(range(m_start, m_end))

    # MWU SCORES
    try:
        m_score = np.array([
            stats.mannwhitneyu(
                values[max(start, m - MAX_POINTS):m],
                values[m:min(end, m + MAX_POINTS)],
                use_continuity=True,
                alternative="two-sided",
            ) for m in mids
        ])

        t_score = np.array([
            stats.ttest_ind(
                values[max(start, m - MAX_POINTS):m],
                values[m:min(end, m + MAX_POINTS)],
                equal_var=False,
            ) for m in mids
        ])

    except Exception as e:
        e = Except.wrap(e)
        if "All numbers are identical" in e:
            return no_good_edge, no_good_edge, mids[0]
        raise e

    # TOTAL SUM-OF-SQUARES
    # DO NOT KNOW WHAT THIS WAS DOING
    # if m_start - start == 0:
    #     # WE CAN NOT OFFSET BY ONE, SO WE ADD A DUMMY VALUE
    #     v_prefix = np.array([np.nan] + list(not_right(cumSS(values[start:m_end]), 1)))
    # else:
    #     # OFFSET BY ONE, WE WANT cumSS OF ALL **PREVIOUS** VALUES
    #     v_prefix = not_right(
    #         not_left(cumSS(values[start:m_end]), m_start - start - 1), 1
    #     )
    # v_suffix = not_right(cumSS(values[m_start:end][::-1])[::-1], end - m_end)
    # v_score = v_prefix + v_suffix
    # pvalue = np.sqrt(m_score[:, 1] * v_score)  # GOEMEAN OF SCORES

    # PICK LOWEST
    pvalue = np.sqrt(m_score[:, 1] * t_score[:, 1])
    best = np.argmin(pvalue)

    return Data(pvalue=m_score[best, 1]), Data(pvalue=t_score[best,
                                                              1]), mids[best]
Ejemplo n.º 3
0
def square_vs_diag(codon_permutation_f, outdir):
    # Replicates the analysis presented in Fig. 5B and creates plot
    ret = Utils.Load(codon_permutation_f)
    npr = np.array(ret['n+_risk'])
    codes = ret['code']
    squares = np.array([issquare(i) for i in codes[1:]])
    diags = np.array([isdiag(i) for i in codes[1:]])
    print('n diags: {}'.format(sum(diags)))
    print('n squares: {}'.format(sum(squares)))
    _, ax = plt.subplots(1, figsize=(4.7, 5.2), dpi=144)
    grps = [npr[1:][squares], npr[1:][~squares & ~diags], npr[1:][diags]]
    ax.boxplot(grps,
               showfliers=False,
               whis=(5, 95),
               flierprops={
                   'color': 'k',
                   'marker': 'x',
                   'markersize': 2
               },
               boxprops={
                   'color': 'k',
                   'lw': 0.6
               },
               capprops={
                   'color': 'k',
                   'lw': 0.6
               },
               whiskerprops={
                   'color': 'k',
                   'lw': 0.6
               },
               medianprops={
                   'color': '',
                   'lw': 1.2
               })
    ax.set_ylim(0.15, 0.31)
    ax.set_yticks([0.15, 0.2, 0.25, 0.3])
    print('squares vs all: {}'.format(mannwhitneyu(grps[0], grps[1])))
    print('squares vs diags: {}'.format(mannwhitneyu(grps[0], grps[2])))
    print('diags vs all: {}'.format(mannwhitneyu(grps[2], grps[1])))
    plt.savefig(join(outdir, 'Squares_diags.png'), dpi=144)
Ejemplo n.º 4
0
def jitter_MWU(values, start, mid, end):
    # ADD SOME CONSTRAINTS TO THE RANGE OF VALUES TESTED
    m_start = min(mid, max(start + MIN_POINTS, mid - JITTER))
    m_end = max(mid, min(mid + JITTER, end - MIN_POINTS))
    if m_start == m_end:
        return no_good_edge, no_good_edge, mid
    mids = np.array(range(m_start, m_end))

    # MWU SCORES
    m_score = np.array(
        [
            stats.mannwhitneyu(
                values[max(start, m - MAX_POINTS) : m],
                values[m : min(end, m + MAX_POINTS)],
                use_continuity=True,
                alternative="two-sided",
            )
            for m in mids
        ]
    )

    t_score = np.array(
        [
            stats.ttest_ind(
                values[max(start, m - MAX_POINTS) : m],
                values[m : min(end, m + MAX_POINTS)],
                equal_var=False,
            )
            for m in mids
        ]
    )

    # TOTAL SUM-OF-SQUARES
    if m_start - start == 0:
        # WE CAN NOT OFFSET BY ONE, SO WE ADD A DUMMY VALUE
        v_prefix = np.array([np.nan] + list(not_right(cumSS(values[start:m_end]), 1)))
    else:
        # OFFSET BY ONE, WE WANT cumSS OF ALL **PREVIOUS** VALUES
        v_prefix = not_right(
            not_left(cumSS(values[start:m_end]), m_start - start - 1), 1
        )
    v_suffix = not_right(cumSS(values[m_start:end][::-1])[::-1], end - m_end)
    v_score = v_prefix + v_suffix

    # PICK LOWEST
    pvalue = np.sqrt(m_score[:, 1] * v_score)  # GOEMEAN OF SCORES
    best = np.argmin(pvalue)

    return Data(pvalue=m_score[best, 1]), Data(pvalue=t_score[best, 1]), mids[best]
Ejemplo n.º 5
0
def mannwhitneyu_test(approaches, accuracy_values, save_path):
    # Compute the Mann-Whitney rank test on samples x and y.
    mannwhitneyu_test_frame = pd.DataFrame()
    for i in range(len(approaches)):
        for j in range(i, len(approaches), 1):
            # iterate through approaches
            approach_i = approaches[i]
            approach_j = approaches[j]
            values_i = accuracy_values.loc[:, approach_i]
            values_j = accuracy_values.loc[:, approach_j]
            t_statistic, two_tailed_p_test = stats.mannwhitneyu(values_i, values_j)
            mannwhitneyu_test_frame.at[approach_i, approach_j] = two_tailed_p_test

    save_path.mkdir(parents=True, exist_ok=True)
    fig = plt.figure(figsize=(4, 2))
    ax = fig.subplots()
    ax = sns.heatmap(mannwhitneyu_test_frame, ax=ax, annot=True, fmt="0.3f", cmap="autumn", vmin=0, vmax=0.05)
    plt.xticks(rotation=45)
    fig.canvas.start_event_loop(sys.float_info.min)
    path = save_path / 'mannwhitneyu.png'
    fig.savefig(path, bbox_inches='tight', dpi=100)
    plt.close(fig)
Ejemplo n.º 6
0
def sliding_MWU(values):
    """
    RETURN
    :param values:
    :return:
    """
    # ADD MEDIAN TO EITHER SIDE OF values
    prefix = [
        np.median(values[:i + weight_radius]) for i in range(weight_radius)
    ]
    suffix = [
        np.median(values[-i - weight_radius:])
        for i in reversed(range(weight_radius))
    ]
    combined = np.array(prefix + list(values) + suffix)
    b = combined.itemsize
    window = as_strided(combined,
                        shape=(len(values), weight_radius * 2),
                        strides=(b, b))

    med = (len(median_weight) + 1) / 2
    try:
        m_score = np.array([
            stats.mannwhitneyu(
                w[:weight_radius],
                w[-weight_radius:],
                use_continuity=True,
                alternative="two-sided",
            ) for v in window for r in [rankdata(v)]
            for w in [(r - med) * median_weight]
        ])

        return m_score
    except Exception as cause:
        cause = Except.wrap(cause)
        if "All numbers are identical" in cause:
            return np.ones((window.shape[0], 2))
        raise cause
def plotExpBox_Main(inputFiles,headers,valcols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes,trimToMinSize,relabels,logb,plotHistogramToFile,plotMedianForGroups,botta,showViolin,showBox,firstColAnnot,plotTrend,showLegend,makePzfxFile,makeBinMatrix,writeDataSummaryStat,summaryStatRange,minuslog10pvalue,minNDataToKeep,vfacecolor,valpha,outXYZPvalues,dividePlots):

	#if plotPvalueCluster:
		#if pvalue cluster is needed:
	#	from Bio.Cluster.cluster import *
	#	from Bio.Cluster import *
		#endif


	
	#the real deal!
	plotData=[]	
	xtickLabels=[]
	
	trendData={}
	annot={}
	
	minSize=-1

	for inputFile,header,cols in zip(inputFiles,headers,valcols):
		fin=generic_istream(inputFile)
		
		startIdx=len(plotData)
		
		if firstColAnnot:
			colAnnot=cols[0]
			cols=cols[1:]
			annotThisFile=[]
			annot[startIdx]=annotThisFile
		else:
			colAnnot=-1
			annotThisFile=None
			
		for col in cols:
			plotData.append([])
			xtickLabels.append(header[col])

		colIndices=range(startIdx,startIdx+len(cols))
		
		if plotTrend:
			#print >> stderr,"plotTrend"
			trendDataThisFile=[]
			trendData[startIdx]=trendDataThisFile
		else:
			trendDataThisFile=None
			
			
		lino=0
		for lin in fin:
			lino+=1
			if lino<startRow:
				continue		
			fields=lin.rstrip("\r\n").split(sep)
			
			if plotTrend:
				#print >> stderr,"a"
				trendDataThisLine=[]
			else:
				trendDataThisLine=None
			
			allDataOKThisLine=True
			
			if colAnnot>=0:
				annotThisFile.append(fields[colAnnot])
			
			for idx,col in zip(colIndices,cols):
				try:
					value=float(fields[col])
					if logb!=0:
						if value==0.0:
							raise ValueError
						value=log(value)/logb							
					plotData[idx].append(value)
					
					if plotTrend:
						trendDataThisLine.append(value)
						#print >> stderr,"value:",value
					
				except:
					allDataOKThisLine=False	
				
			if plotTrend:
				if allDataOKThisLine:
					trendDataThisFile.append(trendDataThisLine)
				else:
					trendDataThisFile.append(None)
			
		fin.close()
	
		
		if minSize==-1:
			minSize=len(plotData[idx]) #or startIDX?
		else:
			minSize=min([minSize,len(plotData[idx])])
		

	if trimToMinSize:
		print >> stderr,"trimming to min size =",minSize
		trimData(plotData,minSize)

	if len(relabels)>0:
		#if len(relabels)!=len(xtickLabels):
		#	print >> stderr,"relabels doesn't have the same length as original label vectors",xtickLabels,"=>",relabels
		#	exit()
		print >> stderr,xtickLabels
		print >> stderr,relabels
		for i,relabel in zip(range(0,len(relabels)),relabels):
			xtickLabels[i]=relabel
		
	
	for i in range(0,len(plotMedianForGroups)):
		plotMedianForGroups[i]=getCol0ListFromCol1ListStringAdv(xtickLabels,plotMedianForGroups[i])
			
	
	#drawing medians:
	medianToDraw=[]
	for mediangrouper in plotMedianForGroups:
		curD=[]		
		for c in mediangrouper:
			curD.extend(plotData[c])
		medianToDraw.append(median(curD))


	for c in range(len(plotData)-1,-1,-1):
		if len(plotData[c])<minNDataToKeep:
			print >> stderr,xtickLabels[c],"discarded because has only",len(plotData[c]),"data points <",minNDataToKeep
			del plotData[c]
			del xtickLabels[c]

	if not skipStat:
		print >> stdout,"student t-test (1 sample; mean=0)"
		print >> stdout,"sample","mean","p-val","median"
	
		if writeDataSummaryStat:
			fDSS=open(writeDataSummaryStat,"w")
			print >> fDSS,"sample\tmean\tvar\tsd\tmin\tmax\tN\tNInRange["+str(summaryStatRange[0])+","+str(summaryStatRange[1])+"]\t%NInRange\tNbelowRange\t%Nbelow\tNAboveRange\t%NAbove"
			
		for x in range(0,len(plotData)):
			#print >> stderr, len(plotData[x])
			try:
				print >> stdout, xtickLabels[x],mean(plotData[x]),ttest_1samp(plotData[x],0)[1],median(plotData[x])
			except:
				print >> stdout, xtickLabels[x],mean(plotData[x]),"NA",median(plotData[x])
			
			if writeDataSummaryStat:
				sumData,N,NIN,NBelow,NAbove=filterDataInRangeInclusive(plotData[x],summaryStatRange[0],summaryStatRange[1])
				
				if NIN>1:
					#print >> stderr,"sumData=",sumData
					#print >> stderr,mean
					mea=mean2(sumData)
					DDOF=1
					sd=std(sumData,ddof=DDOF)
					var=sd*sd
					mi=min(sumData)
					ma=max(sumData)
				else:
					mea="NA"
					sd="NA"
					var="NA"
					mi="NA"
					ma="NA"
				
			
					
				print >> fDSS,xtickLabels[x]+"\t"+str(mea)+"\t"+str(var)+"\t"+str(sd)+"\t"+str(mi)+"\t"+str(ma)+"\t"+str(N)+"\t"+str(NIN)+"\t"+str(float(NIN)*100/N)+"\t"+str(NBelow)+"\t"+str(float(NBelow)*100/N)+"\t"+str(NAbove)+"\t"+str(float(NAbove)*100/N)
			
	
		pvalueM=[]
		
		if writeDataSummaryStat:
			fDSS.close()
		
		print >> stdout,""
		
		print >> stdout,"student t-test (2 samples)"
		print >> stdout,"p-val",
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
		print >> stdout,""
	
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					try:
						pvalue=ttest_ind(plotData[x],plotData[y])[1]
					except:
						pvalue=1.0
					
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
					
					print >> stdout, str(pvalue),
					pvalueRow.append(pvalue)
			print >> stdout,"";	
	
		
		print >> stdout,""
	
		
	
	
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster)
	
	
			
		pvalueM=[]
	
		print >> stdout,"welch t-test"
		print >> stdout,"p-val",
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
						
				else:
					try:
						pvalue=welchs_approximate_ttest_arr(plotData[x],plotData[y])[3]
					except:
						pvalue=1.0
	
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
					
					print >> stdout, str(pvalue),
					pvalueRow.append(pvalue)
			print >> stdout,"";
	
		if outXYZPvalues:
			writeXYZPvalues(outXYZPvalues+"_Welch.xyz",xtickLabels,pvalueM)
	
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster)
	
		
		print >> stdout,""
		print >> stdout,"non-parametric (Mann-Whitney U)" #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)"
		print >> stdout,"p-val",
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=mannwhitneyu(plotData[x],plotData[y])[1]*2				
					except:
						pvalue=1.0
	
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
					print >> stdout,pvalue, #mann-whiteney need to mul by 2 (one tail to two tail)
					pvalueRow.append(pvalue)
					#else:
					#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
			print >> stdout,"";	
	
		if outXYZPvalues:
			writeXYZPvalues(outXYZPvalues+"_U.xyz",xtickLabels,pvalueM)
		
	
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster)
		
		#####now the variance tests
		
		print >> stdout,""
		print >> stdout,"Ansari-Bradley Two-sample Test for difference in scale parameters " 
		print >> stdout,"p-val",
		
		
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=ansari(plotData[x],plotData[y])[1]		
					except:
						pvalue="NA"
	
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
						#pvalue=1.0
					print >> stdout,pvalue,
					pvalueRow.append(pvalue)
					#else:
					#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
			print >> stdout,"";	
		
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_Ansari_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_Ansari",xtickLabels,pvalueM,methodCluster)	
		
		
		#####
	
		#####now the variance tests
		
		print >> stdout,""
		print >> stdout,"Fligner's Two-sample Test for equal variance (non-parametrics)" 
		print >> stdout,"p-val",
		
		
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=fligner(plotData[x],plotData[y])[1]		
					except:
						pvalue="NA"
						#pvalue=1.0
						
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
					print >> stdout,pvalue,
					pvalueRow.append(pvalue)
					#else:
					#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
			print >> stdout,"";	
		
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_fligner_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_fligner",xtickLabels,pvalueM,methodCluster)	
		
		
		#####
	
		#####now the variance tests
		
		print >> stdout,""
		print >> stdout,"Levene's Two-sample Test for equal variance" 
		print >> stdout,"p-val",
		
		
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=levene(plotData[x],plotData[y])[1]		
					except:
						pvalue="NA"
						#pvalue=1.0
						
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
					print >> stdout,pvalue,
					pvalueRow.append(pvalue)
					#else:
					#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
			print >> stdout,"";	
		
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_levene_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_levene",xtickLabels,pvalueM,methodCluster)	
		
		
		#####
	
		#####now the variance tests
		
		print >> stdout,""
		print >> stdout,"Bartlett's Two-sample Test for equal variance (for normal distributions)" 
		print >> stdout,"p-val",
		
		
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=bartlett(plotData[x],plotData[y])[1]		
					except:
						pvalue="NA"
						#pvalue=1.0
	
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
					print >> stdout,pvalue,
					pvalueRow.append(pvalue)
					#else:
					#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
			print >> stdout,"";	
		
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_bartlett_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_bartlett",xtickLabels,pvalueM,methodCluster)	
		
		
		#####

	figure(figsize=figsz)
	subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)
	
	if len(titl)==0:
		titl=outputFile


	plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes,showViolin,showBox,annot,trendData,showLegend,makePzfxFile,makeBinMatrix,dividePlots)
	
	#ylim([0,200])
	for m in medianToDraw:
		axhline(y=m,linestyle=':',color='gray')

	savefig(outputFile,bbox_inches="tight")

	if len(plotHistogramToFile)>0:
		drawHistogram(plotHistogramToFile,plotData,xtickLabels)
		drawDensigram(plotHistogramToFile+".density.png",plotData,xtickLabels)
def plotExpBox_Main(inputFile,header,cols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes):

	#if plotPvalueCluster:
		#if pvalue cluster is needed:
	#	from Bio.Cluster.cluster import *
	#	from Bio.Cluster import *
		#endif



	#the real deal!


	fin=generic_istream(inputFile)
	
	plotData=[]	
	xtickLabels=[]
	for col in cols:
		plotData.append([])
		xtickLabels.append(header[col])

	colIndices=range(0,len(cols))

	lino=0
	for lin in fin:
		lino+=1
		if lino<startRow:
			continue		
		fields=lin.rstrip("\r\n").split(sep)
		
		for idx,col in zip(colIndices,cols):
			try:
				value=float(fields[col])			
				plotData[idx].append(value)
			except:
				pass		
	fin.close()

	print >> stdout,"student t-test (1 sample; mean=0)"
	print >> stdout,"sample","mean","p-val"


	for x in range(0,len(plotData)):
		print >> stdout, xtickLabels[x],mean(plotData[x]),ttest_1samp(plotData[x],0)[1]


	pvalueM=[]
	
	print >> stdout,""
	
	print >> stdout,"student t-test (2 samples)"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	
	print >> stdout,""

	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
			else:
				pvalue=ttest_ind(plotData[x],plotData[y])[1]
				print >> stdout, str(pvalue),
				pvalueRow.append(pvalue)
		print >> stdout,"";	

	
	print >> stdout,""

	makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM)


	if plotPvalueCluster:
		makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster)

	pvalueM=[]

	print >> stdout,"welch t-test"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	
	print >> stdout,""
	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
					
			else:
				pvalue=welchs_approximate_ttest_arr(plotData[x],plotData[y])[3]
				print >> stdout, str(pvalue),
				pvalueRow.append(pvalue)
		print >> stdout,"";

	makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM)

	if plotPvalueCluster:
		makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster)

	
	print >> stdout,""
	print >> stdout,"non-parametric (Mann-Whitney U)" #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	

	pvalueM=[]

	print >> stdout,""
	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
			else:
				#if max(len(plotData[x]),len(plotData[y]))<=20:
				pvalue=mannwhitneyu(plotData[x],plotData[y])[1]*2				
				print >> stdout,pvalue, #mann-whiteney need to mul by 2 (one tail to two tail)
				pvalueRow.append(pvalue)
				#else:
				#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
		print >> stdout,"";	
	
	makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM)

	if plotPvalueCluster:
		makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster)
	
	figure(figsize=figsz)
		
	if len(titl)==0:
		titl=outputFile

	plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes)
	#ylim([0,200])

	savefig(outputFile,bbox_inches="tight")
Ejemplo n.º 9
0
def compute_auc(train, test, Y, y):
    """Computes measures of accuracy for train and test data.

    Computes the ROC curve and the area under that curve, as a
    measure of classification accuracy. The threshold corresponding
    to the point on the ROC curve farthest from `y=x` line is selected
    and fraction of correct predictions corresponding to that
    threshold is returned.
in
    Arguments
        train : float array
            Array of predictions of the model on training data where rows
            correspond to labels and columns correspond to samples.
        test : float array
            Array of predictions of the model on testing data where rows
            correspond to labels and columns correspond to samples.
        Y : float array
            Training labels where rows correspond to labels
            and columns to samples. This is an array of {1,-1}.
        y : float array
            Testing labels where rows correspond to labels
            and columns to samples. This is an array of {1,-1}.

    Returns
        performance : float array
            Array containing the AUC for training data, classification
            accuracy for training data, AUC for testing data and
            classification accuracy for testing data, in that order.

    .. note::
        * For binary-class classification, AUC is proportional to the Mann-Whitney U test statistic which computes a measure of the separation between values of positive labels and negative labels.

        * For multi-class classification, this formula for computing classifier AUC is one of many. A more principled way would involve computing the Volume under an ROC surface.

    """
    # BINARY CASE
    if (Y.shape[0] == 2):
        # computing train AUC
        NP = (Y[0, :] == 1).sum()
        NM = (Y[0, :] == -1).sum()
        try:
            U = stats.mannwhitneyu(train[0, (Y[0, :] == 1)], train[0, (Y[0, :] == -1)])
            train_auc = 1. - U[0] / (NP * NM)
        except ValueError:
            # if all function outputs are equal, AUC = 0.5
            train_auc = 0.5
        # computing test AUC
        NP = (y[0, :] == 1).sum()
        NM = (y[0, :] == -1).sum()
        try:
            U = stats.mannwhitneyu(test[0, (y[0, :] == 1)], test[0, (y[0, :] == -1)])
            test_auc = 1. - U[0] / (NP * NM)
        except ValueError:
            # if all function outputs are equal, AUC = 0.5
            test_auc = 0.5

    # MULTICLASS CASE
    else:
        # computing train AUC
        NP = (Y == 1).sum()
        NM = (Y == -1).sum()
        try:
            U = stats.mannwhitneyu(train[(Y == 1)], train[(Y == -1)])
            train_auc = 1. - U[0] / (NP * NM)
        except ValueError:
            # if all function outputs are equal, AUC = 0.5
            train_auc = 0.5
        # computing test AUC
        NP = (y == 1).sum()
        NM = (y == -1).sum()
        try:
            U = stats.mannwhitneyu(test[(y == 1)], test[(y == -1)])
            test_auc = 1. - U[0] / (NP * NM)
        except ValueError:
            # if all function outputs are equal, AUC = 0.5
            test_auc = 0.5

    # accuracy = number of examples where argmax of prediction
    # equals true label

    # train accuracy
    train_accuracy = (train.argmax(0) - Y.argmax(0) == 0).sum() / float(Y.shape[1])

    # test accuracy
    test_accuracy = (test.argmax(0) - y.argmax(0) == 0).sum() / float(y.shape[1])

    # train exploss
    train_loss = np.sum(np.exp(-1 * train * Y))

    # test exploss
    test_loss = np.sum(np.exp(-1 * test * y))

    return np.array([train_loss, train_auc, train_accuracy, test_loss, test_auc, test_accuracy])
def plotExpBox_Main(inputFiles,headers,valcols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes,trimToMinSize,relabels,logb,plotHistogramToFile,plotMedianForGroups,botta):

	#if plotPvalueCluster:
		#if pvalue cluster is needed:
	#	from Bio.Cluster.cluster import *
	#	from Bio.Cluster import *
		#endif



	#the real deal!
	plotData=[]	
	xtickLabels=[]
	
	
	minSize=-1

	for inputFile,header,cols in zip(inputFiles,headers,valcols):
		fin=generic_istream(inputFile)
		
		startIdx=len(plotData)

		for col in cols:
			plotData.append([])
			xtickLabels.append(header[col])

		colIndices=range(startIdx,startIdx+len(cols))

		lino=0
		for lin in fin:
			lino+=1
			if lino<startRow:
				continue		
			fields=lin.rstrip("\r\n").split(sep)
		
			for idx,col in zip(colIndices,cols):
				try:
					
					value=float(fields[col])
					if logb!=0:
						if value==0.0:
							raise ValueError
						value=log(value)/logb							
					plotData[idx].append(value)
					
				except:
					pass		
		fin.close()
	
		
		if minSize==-1:
			minSize=len(plotData[idx]) #or startIDX?
		else:
			minSize=min([minSize,len(plotData[idx])])
		

	if trimToMinSize:
		print >> stderr,"trimming to min size =",minSize
		trimData(plotData,minSize)

	if len(relabels)>0:
		#if len(relabels)!=len(xtickLabels):
		#	print >> stderr,"relabels doesn't have the same length as original label vectors",xtickLabels,"=>",relabels
		#	exit()
		print >> stderr,xtickLabels
		print >> stderr,relabels
		for i,relabel in zip(range(0,len(relabels)),relabels):
			xtickLabels[i]=relabel
		
	
	for i in range(0,len(plotMedianForGroups)):
		plotMedianForGroups[i]=getCol0ListFromCol1ListStringAdv(xtickLabels,plotMedianForGroups[i])
			
	
	#drawing medians:
	medianToDraw=[]
	for mediangrouper in plotMedianForGroups:
		curD=[]		
		for c in mediangrouper:
			curD.extend(plotData[c])
		medianToDraw.append(median(curD))


	for c in range(len(plotData)-1,-1,-1):
		if len(plotData[c])==0:
			print >> stderr,xtickLabels[c],"discarded"
			del plotData[c]
			del xtickLabels[c]


	print >> stdout,"student t-test (1 sample; mean=0)"
	print >> stdout,"sample","mean","p-val"


	for x in range(0,len(plotData)):
		#print >> stderr, len(plotData[x])
		try:
			print >> stdout, xtickLabels[x],mean(plotData[x]),ttest_1samp(plotData[x],0)[1]
		except:
			print >> stdout, xtickLabels[x],"NA","NA"

	pvalueM=[]
	
	print >> stdout,""
	
	print >> stdout,"student t-test (2 samples)"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	
	print >> stdout,""

	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
			else:
				try:
					pvalue=ttest_ind(plotData[x],plotData[y])[1]
				except:
					pvalue=1.0

				print >> stdout, str(pvalue),
				pvalueRow.append(pvalue)
		print >> stdout,"";	

	
	print >> stdout,""

	


	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster)

	pvalueM=[]

	print >> stdout,"welch t-test"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	
	print >> stdout,""
	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
					
			else:
				pvalue=welchs_approximate_ttest_arr(plotData[x],plotData[y])[3]
				print >> stdout, str(pvalue),
				pvalueRow.append(pvalue)
		print >> stdout,"";

	

	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster)

	
	print >> stdout,""
	print >> stdout,"non-parametric (Mann-Whitney U)" #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	

	pvalueM=[]

	print >> stdout,""
	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
			else:
				#if max(len(plotData[x]),len(plotData[y]))<=20:
				try:
					pvalue=mannwhitneyu(plotData[x],plotData[y])[1]*2				
				except:
					pvalue=1.0
				print >> stdout,pvalue, #mann-whiteney need to mul by 2 (one tail to two tail)
				pvalueRow.append(pvalue)
				#else:
				#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
		print >> stdout,"";	
	
	

	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster)
	
	figure(figsize=figsz)
	subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)
	
	if len(titl)==0:
		titl=outputFile


	plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes)
	
	#ylim([0,200])
	for m in medianToDraw:
		axhline(y=m,linestyle=':',color='gray')

	savefig(outputFile,bbox_inches="tight")

	if len(plotHistogramToFile)>0:
		drawHistogram(plotHistogramToFile,plotData,xtickLabels)
Ejemplo n.º 11
0
def plotExpBox_Main(inputFiles,headers,valcols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes,trimToMinSize,relabels,logb,plotHistogramToFile,plotMedianForGroups,botta):

	#if plotPvalueCluster:
		#if pvalue cluster is needed:
	#	from Bio.Cluster.cluster import *
	#	from Bio.Cluster import *
		#endif



	#the real deal!
	plotData=[]	
	xtickLabels=[]
	
	
	minSize=-1

	for inputFile,header,cols in zip(inputFiles,headers,valcols):
		fin=generic_istream(inputFile)
		
		startIdx=len(plotData)

		for col in cols:
			plotData.append([])
			xtickLabels.append(header[col])

		colIndices=range(startIdx,startIdx+len(cols))

		lino=0
		for lin in fin:
			lino+=1
			if lino<startRow:
				continue		
			fields=lin.rstrip("\r\n").split(sep)
		
			for idx,col in zip(colIndices,cols):
				try:
					
					value=float(fields[col])
					if logb!=0:
						value=log(value)/logb	
						if value<-100000:
							raise ValueError						
					plotData[idx].append(value)
					
				except:
					pass		
		fin.close()
	
		
		if minSize==-1:
			minSize=len(plotData[idx]) #or startIDX?
		else:
			minSize=min([minSize,len(plotData[idx])])
		

	if trimToMinSize:
		print >> stderr,"trimming to min size =",minSize
		trimData(plotData,minSize)

	if len(relabels)>0:
		#if len(relabels)!=len(xtickLabels):
		#	print >> stderr,"relabels doesn't have the same length as original label vectors",xtickLabels,"=>",relabels
		#	exit()

		for i,relabel in zip(range(0,len(relabels)),relabels):
			xtickLabels[i]=relabel
		
	
	for i in range(0,len(plotMedianForGroups)):
		plotMedianForGroups[i]=getCol0ListFromCol1ListStringAdv(xtickLabels,plotMedianForGroups[i])
			
	
	#drawing medians:
	medianToDraw=[]
	for mediangrouper in plotMedianForGroups:
		curD=[]		
		for c in mediangrouper:
			curD.extend(plotData[c])
		medianToDraw.append(median(curD))


	for c in range(len(plotData)-1,-1,-1):
		if len(plotData[c])==0:
			print >> stderr,xtickLabels[c],"discarded"
			del plotData[c]
			del xtickLabels[c]


	print >> stdout,"student t-test (1 sample; mean=0)"
	print >> stdout,"sample","mean","p-val"


	for x in range(0,len(plotData)):
		#print >> stderr, len(plotData[x])
		try:
			print >> stdout, xtickLabels[x],mean(plotData[x]),ttest_1samp(plotData[x],0)[1]
		except:
			print >> stdout, xtickLabels[x],"NA","NA"

	pvalueM=[]
	
	print >> stdout,""
	
	print >> stdout,"student t-test (2 samples)"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	
	print >> stdout,""

	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
			else:
				try:
					pvalue=ttest_ind(plotData[x],plotData[y])[1]
				except:
					pvalue=1.0

				print >> stdout, str(pvalue),
				pvalueRow.append(pvalue)
		print >> stdout,"";	

	
	print >> stdout,""

	


	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster)

	pvalueM=[]

	print >> stdout,"welch t-test"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	
	print >> stdout,""
	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
					
			else:
				pvalue=welchs_approximate_ttest_arr(plotData[x],plotData[y])[3]
				print >> stdout, str(pvalue),
				pvalueRow.append(pvalue)
		print >> stdout,"";

	

	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster)

	
	print >> stdout,""
	print >> stdout,"non-parametric (Mann-Whitney U)" #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	

	pvalueM=[]

	print >> stdout,""
	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
			else:
				#if max(len(plotData[x]),len(plotData[y]))<=20:
				try:
					pvalue=mannwhitneyu(plotData[x],plotData[y])[1]*2				
				except:
					pvalue=1.0
				print >> stdout,pvalue, #mann-whiteney need to mul by 2 (one tail to two tail)
				pvalueRow.append(pvalue)
				#else:
				#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
		print >> stdout,"";	
	
	

	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster)
	
	figure(figsize=figsz)
	subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)
	
	if len(titl)==0:
		titl=outputFile


	plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes)
	
	#ylim([0,200])
	for m in medianToDraw:
		axhline(y=m,linestyle=':',color='gray')

	savefig(outputFile,bbox_inches="tight")

	if len(plotHistogramToFile)>0:
		drawHistogram(plotHistogramToFile,plotData,xtickLabels)
Ejemplo n.º 12
0
def plotExpBox_Main(inputFiles,headers,valcols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes,trimToMinSize,relabels):

	#if plotPvalueCluster:
		#if pvalue cluster is needed:
	#	from Bio.Cluster.cluster import *
	#	from Bio.Cluster import *
		#endif



	#the real deal!
	plotData=[]	
	xtickLabels=[]
	
	
	minSize=-1

	for inputFile,header,cols in zip(inputFiles,headers,valcols):
		fin=generic_istream(inputFile)
		
		startIdx=len(plotData)

		for col in cols:
			plotData.append([])
			xtickLabels.append(header[col])

		colIndices=range(startIdx,startIdx+len(cols))

		lino=0
		for lin in fin:
			lino+=1
			if lino<startRow:
				continue		
			fields=lin.rstrip("\r\n").split(sep)
		
			for idx,col in zip(colIndices,cols):
				try:
					
					value=float(fields[col])			
					plotData[idx].append(value)
					
				except:
					pass		
		fin.close()
	
		
		if minSize==-1:
			minSize=len(plotData[idx]) #or startIDX?
		else:
			minSize=min([minSize,len(plotData[idx])])
		

	if trimToMinSize:
		print >> stderr,"trimming to min size =",minSize
		trimData(plotData,minSize)

	if len(relabels)>0:
		if len(relabels)!=len(xtickLabels):
			print >> stderr,"relabels doesn't have the same length as original label vectors",xtickLabels,"=>",relabels
			exit()

		xtickLabels=relabels
		
	

	print >> stdout,"student t-test (1 sample; mean=0)"
	print >> stdout,"sample","mean","p-val"


	for x in range(0,len(plotData)):
		#print >> stderr, len(plotData[x])
		print >> stdout, xtickLabels[x],mean(plotData[x]),ttest_1samp(plotData[x],0)[1]


	pvalueM=[]
	
	print >> stdout,""
	
	print >> stdout,"student t-test (2 samples)"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	
	print >> stdout,""

	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
			else:
				pvalue=ttest_ind(plotData[x],plotData[y])[1]
				print >> stdout, str(pvalue),
				pvalueRow.append(pvalue)
		print >> stdout,"";	

	
	print >> stdout,""

	


	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster)

	pvalueM=[]

	print >> stdout,"welch t-test"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	
	print >> stdout,""
	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
					
			else:
				pvalue=welchs_approximate_ttest_arr(plotData[x],plotData[y])[3]
				print >> stdout, str(pvalue),
				pvalueRow.append(pvalue)
		print >> stdout,"";

	

	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster)

	
	print >> stdout,""
	print >> stdout,"non-parametric (Mann-Whitney U)" #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	

	pvalueM=[]

	print >> stdout,""
	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
			else:
				#if max(len(plotData[x]),len(plotData[y]))<=20:
				pvalue=mannwhitneyu(plotData[x],plotData[y])[1]*2				
				print >> stdout,pvalue, #mann-whiteney need to mul by 2 (one tail to two tail)
				pvalueRow.append(pvalue)
				#else:
				#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
		print >> stdout,"";	
	
	

	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster)
	
	figure(figsize=figsz)
		
	if len(titl)==0:
		titl=outputFile

	plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes)
	#ylim([0,200])

	savefig(outputFile,bbox_inches="tight")
Ejemplo n.º 13
0
import numpy
import pandas
import datetime
import scipy.stats.stats as st
# import matplotlib.pyplot as plt

format = "%Y_%m_%d"
current_date=datetime.datetime.today()
print 'dataset path: /Volumes/daelsaid/inquisit/INQUISIT_RERUN/all_launches_dir/scored/clean_copies/scored_data_2017_07_01_comb_mturk.csv'
print ' '
print current_date


scored_data=pandas.read_csv('/Volumes/daelsaid/inquisit/INQUISIT_RERUN/all_launches_dir/scored/clean_copies/scored_data_2017_07_01_newageranges_comb_mturk.csv')
y=['trial1', 'trial2', 'trial3', 'trial4', 'trial5', 'listb', 'trial6', 'trial7']

test_df=pandas.DataFrame(data=scored_data.set_index(['gender', 'age_range']).loc[:,'trial1':'trial7'])

for col in y:
	print '     '
	print '*******' + col + '*******'
	for idx,val in test_df.groupby(level=1):
		if len(val) >3:
			u,p= st.mannwhitneyu(val.loc['female'][col],val.loc['male'][col], use_continuity=True,alternative='two-sided')
			if p < 0.05:
				print 'age_range ' + list(set(val.loc['female'][col].index.tolist()))[0] + ':', u, p, '**'
			else:
				print 'age_range ' + list(set(val.loc['female'][col].index.tolist()))[0] + ':', u, p, 'ns'
		else:
			print 'age_range ' + list(set(val.loc['female'][col].index.tolist()))[0]+ ': n is less than 3'
Ejemplo n.º 14
0
                           )
        ax['circ'].scatter(df_py.jitter, df_py.vector_strength, edgecolors='w', lw=.5,
                           color=sns.xkcd_rgb['dark fuchsia'], \
                           label='pyramidal', s=point_size
                           )
        
        ax['vs_freq_beat'].scatter(df_pu_b.frequency, df_pu_b.vector_strength, edgecolors='w', lw=.5,
                              color=sns.xkcd_rgb['azure'], \
                              label='p-units', s=point_size)
        ax['vs_freq_beat'].scatter(df_py_b.frequency, df_py_b.vector_strength, edgecolors='w', lw=.5,
                              color=sns.xkcd_rgb['dark fuchsia'], \
                              label='pyramidal', s=point_size)

        # --- circular variance scatter plots
        ax['circ_beat'].scatter(df_pu_b.jitter, df_pu_b.vector_strength, edgecolors='w', lw=.5,
                           color=sns.xkcd_rgb['azure'], \
                           label='p-units', s=point_size
                           )
        ax['circ_beat'].scatter(df_py_b.jitter, df_py_b.vector_strength, edgecolors='w', lw=.5,
                           color=sns.xkcd_rgb['dark fuchsia'], \
                           label='pyramidal', s=point_size
                           )
        
        print(stats.mannwhitneyu(df_pu.vector_strength, df_py.vector_strength, use_continuity=True, alternative=None))
        print(stats.mannwhitneyu(df_pu_b.vector_strength, df_py_b.vector_strength, use_continuity=True, alternative=None))
        print(np.median(df_pu.vector_strength))
        print(np.median(df_py.vector_strength))
        print(np.median(df_pu_b.vector_strength))
        print(np.median(df_py_b.vector_strength))

Ejemplo n.º 15
0
def stat_test(df,df_num):
    for i in df_num.columns:
        df_1=df[df['Response']==1][i]
        df_0=df[df['Response']==0][i]
        tsats,pval=stats.ttest_ind(df_1,df_0)
        tstas,pval=stats.mannwhitneyu(df_1,df_0)
Ejemplo n.º 16
0
def compute_auc(train, test, Y, y):
    """Computes measures of accuracy for train and test data.

    Computes the ROC curve and the area under that curve, as a 
    measure of classification accuracy. The threshold corresponding
    to the point on the ROC curve farthest from `y=x` line is selected
    and fraction of correct predictions corresponding to that
    threshold is returned.

    Arguments
        train : float array
            Array of predictions of the model on training data where rows
            correspond to labels and columns correspond to samples.
        test : float array
            Array of predictions of the model on testing data where rows
            correspond to labels and columns correspond to samples.
        Y : float array
            Training labels where rows correspond to labels
            and columns to samples. This is an array of {1,-1}.
        y : float array
            Testing labels where rows correspond to labels
            and columns to samples. This is an array of {1,-1}.

    Returns
        performance : float array
            Array containing the AUC for training data, classification 
            accuracy for training data, AUC for testing data and
            classification accuracy for testing data, in that order. 

    .. note::
        * For binary-class classification, AUC is proportional to the Mann-Whitney U test statistic which computes a measure   of the separation between values of positive labels and negative labels.
        
        * For multi-class classification, this formula for computing classifier AUC is one of many. A more principled way      would involve computing the Volume under an ROC surface.

    """

    # computing train AUC
    NP = (Y == 1).sum()
    NM = (Y == -1).sum()
    try:
        U = stats.mannwhitneyu(train[(Y == 1)], train[(Y == -1)])
        train_auc = 1. - U[0] / (NP * NM)
    except ValueError:
        train_auc = 0.5

    # computing test AUC
    NP = (y == 1).sum()
    NM = (y == -1).sum()
    try:
        U = stats.mannwhitneyu(test[(y == 1)], test[(y == -1)])
        test_auc = 1. - U[0] / (NP * NM)
    except ValueError:
        test_auc = 0.5

    # accuracy = number of examples where argmax of prediction
    # equals true label

    # train accuracy
    train_accuracy = (train.argmax(0) - Y.argmax(0) == 0).sum() / float(
        Y.shape[1])

    # test accuracy
    test_accuracy = (test.argmax(0) - y.argmax(0) == 0).sum() / float(
        y.shape[1])

    return np.array([train_auc, train_accuracy, test_auc, test_accuracy])
def plotExpBox_Main(inputFiles, headers, valcols, outputFile, sep, startRow,
                    showIndPoints, mark, markMean, showMean, notch, whisker,
                    outliers, plotPvalueCluster, outputClusterPrefix,
                    methodCluster, xlegendrotation, xlabe, ylabe, figsz, titl,
                    showSampleSizes, trimToMinSize, relabels, logb,
                    plotHistogramToFile, plotMedianForGroups, botta,
                    showViolin, showBox, firstColAnnot, plotTrend, showLegend,
                    makePzfxFile, makeBinMatrix, writeDataSummaryStat,
                    summaryStatRange, minuslog10pvalue, minNDataToKeep,
                    vfacecolor, valpha, outXYZPvalues, dividePlots):

    #if plotPvalueCluster:
    #if pvalue cluster is needed:
    #	from Bio.Cluster.cluster import *
    #	from Bio.Cluster import *
    #endif

    #the real deal!
    plotData = []
    xtickLabels = []

    trendData = {}
    annot = {}

    minSize = -1

    for inputFile, header, cols in zip(inputFiles, headers, valcols):
        fin = generic_istream(inputFile)

        startIdx = len(plotData)

        if firstColAnnot:
            colAnnot = cols[0]
            cols = cols[1:]
            annotThisFile = []
            annot[startIdx] = annotThisFile
        else:
            colAnnot = -1
            annotThisFile = None

        for col in cols:
            plotData.append([])
            xtickLabels.append(header[col])

        colIndices = range(startIdx, startIdx + len(cols))

        if plotTrend:
            #print >> stderr,"plotTrend"
            trendDataThisFile = []
            trendData[startIdx] = trendDataThisFile
        else:
            trendDataThisFile = None

        lino = 0
        for lin in fin:
            lino += 1
            if lino < startRow:
                continue
            fields = lin.rstrip("\r\n").split(sep)

            if plotTrend:
                #print >> stderr,"a"
                trendDataThisLine = []
            else:
                trendDataThisLine = None

            allDataOKThisLine = True

            if colAnnot >= 0:
                annotThisFile.append(fields[colAnnot])

            for idx, col in zip(colIndices, cols):
                try:
                    value = float(fields[col])
                    if logb != 0:
                        if value == 0.0:
                            raise ValueError
                        value = log(value) / logb
                    plotData[idx].append(value)

                    if plotTrend:
                        trendDataThisLine.append(value)
                        #print >> stderr,"value:",value

                except:
                    allDataOKThisLine = False

            if plotTrend:
                if allDataOKThisLine:
                    trendDataThisFile.append(trendDataThisLine)
                else:
                    trendDataThisFile.append(None)

        fin.close()

        if minSize == -1:
            minSize = len(plotData[idx])  #or startIDX?
        else:
            minSize = min([minSize, len(plotData[idx])])

    if trimToMinSize:
        print >> stderr, "trimming to min size =", minSize
        trimData(plotData, minSize)

    if len(relabels) > 0:
        #if len(relabels)!=len(xtickLabels):
        #	print >> stderr,"relabels doesn't have the same length as original label vectors",xtickLabels,"=>",relabels
        #	exit()
        print >> stderr, xtickLabels
        print >> stderr, relabels
        for i, relabel in zip(range(0, len(relabels)), relabels):
            xtickLabels[i] = relabel

    for i in range(0, len(plotMedianForGroups)):
        plotMedianForGroups[i] = getCol0ListFromCol1ListStringAdv(
            xtickLabels, plotMedianForGroups[i])

    #drawing medians:
    medianToDraw = []
    for mediangrouper in plotMedianForGroups:
        curD = []
        for c in mediangrouper:
            curD.extend(plotData[c])
        medianToDraw.append(median(curD))

    for c in range(len(plotData) - 1, -1, -1):
        if len(plotData[c]) < minNDataToKeep:
            print >> stderr, xtickLabels[c], "discarded because has only", len(
                plotData[c]), "data points <", minNDataToKeep
            del plotData[c]
            del xtickLabels[c]

    if not skipStat:
        print >> stdout, "student t-test (1 sample; mean=0)"
        print >> stdout, "sample", "mean", "p-val", "median"

        if writeDataSummaryStat:
            fDSS = open(writeDataSummaryStat, "w")
            print >> fDSS, "sample\tmean\tvar\tsd\tmin\tmax\tN\tNInRange[" + str(
                summaryStatRange[0]) + "," + str(
                    summaryStatRange[1]
                ) + "]\t%NInRange\tNbelowRange\t%Nbelow\tNAboveRange\t%NAbove"

        for x in range(0, len(plotData)):
            #print >> stderr, len(plotData[x])
            try:
                print >> stdout, xtickLabels[x], mean(
                    plotData[x]), ttest_1samp(plotData[x],
                                              0)[1], median(plotData[x])
            except:
                print >> stdout, xtickLabels[x], mean(
                    plotData[x]), "NA", median(plotData[x])

            if writeDataSummaryStat:
                sumData, N, NIN, NBelow, NAbove = filterDataInRangeInclusive(
                    plotData[x], summaryStatRange[0], summaryStatRange[1])

                if NIN > 1:
                    #print >> stderr,"sumData=",sumData
                    #print >> stderr,mean
                    mea = mean2(sumData)
                    DDOF = 1
                    sd = std(sumData, ddof=DDOF)
                    var = sd * sd
                    mi = min(sumData)
                    ma = max(sumData)
                else:
                    mea = "NA"
                    sd = "NA"
                    var = "NA"
                    mi = "NA"
                    ma = "NA"

                print >> fDSS, xtickLabels[x] + "\t" + str(mea) + "\t" + str(
                    var) + "\t" + str(sd) + "\t" + str(mi) + "\t" + str(
                        ma) + "\t" + str(N) + "\t" + str(NIN) + "\t" + str(
                            float(NIN) * 100 /
                            N) + "\t" + str(NBelow) + "\t" + str(
                                float(NBelow) * 100 /
                                N) + "\t" + str(NAbove) + "\t" + str(
                                    float(NAbove) * 100 / N)

        pvalueM = []

        if writeDataSummaryStat:
            fDSS.close()

        print >> stdout, ""

        print >> stdout, "student t-test (2 samples)"
        print >> stdout, "p-val",
        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        print >> stdout, ""

        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    try:
                        pvalue = ttest_ind(plotData[x], plotData[y])[1]
                    except:
                        pvalue = 1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, str(pvalue),
                    pvalueRow.append(pvalue)
            print >> stdout, ""

        print >> stdout, ""

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_t_raw", xtickLabels,
                              pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_t", xtickLabels,
                                  pvalueM, methodCluster)

        pvalueM = []

        print >> stdout, "welch t-test"
        print >> stdout, "p-val",
        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])

                else:
                    try:
                        pvalue = welchs_approximate_ttest_arr(
                            plotData[x], plotData[y])[3]
                    except:
                        pvalue = 1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, str(pvalue),
                    pvalueRow.append(pvalue)
            print >> stdout, ""

        if outXYZPvalues:
            writeXYZPvalues(outXYZPvalues + "_Welch.xyz", xtickLabels, pvalueM)

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_Welch_raw", xtickLabels,
                              pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_Welch", xtickLabels,
                                  pvalueM, methodCluster)

        print >> stdout, ""
        print >> stdout, "non-parametric (Mann-Whitney U)"  #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)"
        print >> stdout, "p-val",
        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        pvalueM = []

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    #if max(len(plotData[x]),len(plotData[y]))<=20:
                    try:
                        pvalue = mannwhitneyu(plotData[x], plotData[y])[1] * 2
                    except:
                        pvalue = 1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, pvalue,  #mann-whiteney need to mul by 2 (one tail to two tail)
                    pvalueRow.append(pvalue)
                    #else:
                    #	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
            print >> stdout, ""

        if outXYZPvalues:
            writeXYZPvalues(outXYZPvalues + "_U.xyz", xtickLabels, pvalueM)

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_U_raw", xtickLabels,
                              pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_U", xtickLabels,
                                  pvalueM, methodCluster)

        #####now the variance tests

        print >> stdout, ""
        print >> stdout, "Ansari-Bradley Two-sample Test for difference in scale parameters "
        print >> stdout, "p-val",

        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        pvalueM = []

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    #if max(len(plotData[x]),len(plotData[y]))<=20:
                    try:
                        pvalue = ansari(plotData[x], plotData[y])[1]
                    except:
                        pvalue = "NA"

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                        #pvalue=1.0
                    print >> stdout, pvalue,
                    pvalueRow.append(pvalue)
                    #else:
                    #	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
            print >> stdout, ""

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_Ansari_raw", xtickLabels,
                              pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_Ansari", xtickLabels,
                                  pvalueM, methodCluster)

        #####

        #####now the variance tests

        print >> stdout, ""
        print >> stdout, "Fligner's Two-sample Test for equal variance (non-parametrics)"
        print >> stdout, "p-val",

        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        pvalueM = []

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    #if max(len(plotData[x]),len(plotData[y]))<=20:
                    try:
                        pvalue = fligner(plotData[x], plotData[y])[1]
                    except:
                        pvalue = "NA"
                        #pvalue=1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, pvalue,
                    pvalueRow.append(pvalue)
                    #else:
                    #	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
            print >> stdout, ""

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_fligner_raw",
                              xtickLabels, pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_fligner",
                                  xtickLabels, pvalueM, methodCluster)

        #####

        #####now the variance tests

        print >> stdout, ""
        print >> stdout, "Levene's Two-sample Test for equal variance"
        print >> stdout, "p-val",

        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        pvalueM = []

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    #if max(len(plotData[x]),len(plotData[y]))<=20:
                    try:
                        pvalue = levene(plotData[x], plotData[y])[1]
                    except:
                        pvalue = "NA"
                        #pvalue=1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, pvalue,
                    pvalueRow.append(pvalue)
                    #else:
                    #	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
            print >> stdout, ""

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_levene_raw", xtickLabels,
                              pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_levene", xtickLabels,
                                  pvalueM, methodCluster)

        #####

        #####now the variance tests

        print >> stdout, ""
        print >> stdout, "Bartlett's Two-sample Test for equal variance (for normal distributions)"
        print >> stdout, "p-val",

        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        pvalueM = []

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    #if max(len(plotData[x]),len(plotData[y]))<=20:
                    try:
                        pvalue = bartlett(plotData[x], plotData[y])[1]
                    except:
                        pvalue = "NA"
                        #pvalue=1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, pvalue,
                    pvalueRow.append(pvalue)
                    #else:
                    #	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
            print >> stdout, ""

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_bartlett_raw",
                              xtickLabels, pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_bartlett",
                                  xtickLabels, pvalueM, methodCluster)

        #####

    figure(figsize=figsz)
    subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)

    if len(titl) == 0:
        titl = outputFile

    plotExpBox(plotData, xtickLabels, showIndPoints, mark, markMean, showMean,
               notch, whisker, outliers, xlegendrotation, xlabe, ylabe, titl,
               showSampleSizes, showViolin, showBox, annot, trendData,
               showLegend, makePzfxFile, makeBinMatrix, dividePlots)

    #ylim([0,200])
    for m in medianToDraw:
        axhline(y=m, linestyle=':', color='gray')

    savefig(outputFile, bbox_inches="tight")

    if len(plotHistogramToFile) > 0:
        drawHistogram(plotHistogramToFile, plotData, xtickLabels)
        drawDensigram(plotHistogramToFile + ".density.png", plotData,
                      xtickLabels)