# Assumed imports for this excerpt (the helper getPeakList is defined elsewhere):
import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr, ttest_1samp, gmean

def apply_stats(data, runTTest):
	peakList = getPeakList(data)
	tempList = list()

	colNames = ['Fatty Acid Type',           #1
				'Peak Name',                 #2
				'Pearson Coefficient',       #3
				'Pearson P Value',           #4
				'Spearman Coefficient',      #5
				'Spearman P Value',          #6
				'P Geometric Mean (%)',      #7
				'Q Geometric Mean (ug/ml)',  #8
				'P Mean (%)',                #9
				'P Stdev',                   #10
				'Q Mean (ug/ml)',            #11
				'Q Stdev',                   #12
				'P T-test',                  #13
				'P T-test P value',          #14
				'Q T-test',                  #15
				'Q T-test P value',          #16
				'Common Name']               #17


	for entry in peakList:
		try:
			pearson = pearsonr(entry['p'],entry['q'])
			spearman = spearmanr(entry['p'],entry['q'])
			if runTTest == 'y':
				ttestP = ttest_1samp(entry['p'],0)
				ttestQ = ttest_1samp(entry['q'],0)
			else:
				ttestP = ('-','-')
				ttestQ = ('-','-')
			tempList.append([entry['FAtype'],        #1
						entry['peakName'],           #2
						pearson[0],                  #3
						pearson[1],                  #4
						spearman[0],                 #5
						spearman[1],                 #6
						gmean(entry['p']),           #7
						gmean(entry['q']),           #8
						np.mean(entry['p']),         #9
						np.std(entry['p'],ddof=1),   #10
						np.mean(entry['q']),         #11
						np.std(entry['q'],ddof=1),   #12
						ttestP[0],                   #13
						ttestP[1],                   #14
						ttestQ[0],                   #15
						ttestQ[1],                   #16
						entry['common']])            #17
		except Exception:
			# Skip peaks whose data are missing or too short for the statistics.
			pass

	return pd.DataFrame(tempList, columns=colNames)
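A minimal usage sketch: each entry produced by getPeakList is assumed to carry the keys used above ('p', 'q', 'FAtype', 'peakName', 'common'); the values below are hypothetical.

# Hypothetical peak entry, shaped like what the loop above consumes.
entry = {'FAtype': 'saturated', 'peakName': 'C16:0', 'common': 'palmitic acid',
         'p': [1.2, 1.4, 1.1, 1.3], 'q': [10.5, 12.0, 9.8, 11.1]}
print(pearsonr(entry['p'], entry['q']))    # (coefficient, p-value)
print(spearmanr(entry['p'], entry['q']))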
Example #2
# Assumed imports for this excerpt:
from scipy import stats
import matplotlib.pyplot as plt

def doStudentT(allData):
    for fileIndex, data in enumerate(allData):
        print('***************************** FILE: test_', fileIndex,
              '******************************')
        # One-sample t-test of each variable against mean 0; each result is
        # drawn as a single (p-value, t-statistic) point, not a line.
        for i in range(8):
            key = 'v%d' % i
            result = stats.ttest_1samp(data[key], 0)
            plt.plot(result.pvalue, result.statistic, 'o', label=key)
        plt.xlabel('P value')
        plt.ylabel('T statistic')
        plt.title("T-test results for dataset " + str(fileIndex))
        plt.legend()
        plt.show()
Example #3
    def detect_trend(self, time_series_x: np.ndarray,
                     time_series_y: np.ndarray):
        """
        Method that performs the Innovative Trend Analysis on the given
        time series or signal. The method is visual: it writes a file
        containing a plot of the result.

        :param time_series_x: time variable of the time series to analyze
        :param time_series_y: value of the time series to analyze
        """
        # Odd-length series are problematic; drop the last point.
        if time_series_y.shape[0] % 2 != 0:
            time_series_y = time_series_y[:-1]
        first_half, second_half = np.split(time_series_y,
                                           indices_or_sections=2)
        first_half = np.sort(first_half)
        second_half = np.sort(second_half)

        self._plot_ita(first_half=first_half,
                       second_half=second_half,
                       time_series_min=np.min(time_series_y),
                       time_series_max=np.max(time_series_y))

        second_half = second_half - first_half
        np.random.shuffle(second_half)

        # comparing with no trend line mean
        if second_half.shape[0] < 30:
            _, p_score = stats.ttest_1samp(second_half, 0.0)
        else:
            _, p_score = ztest(second_half, value=0.0)

        trend = p_score <= self.confidence_level
        return trend
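The core of the test can be sketched standalone (synthetic data; in the original, ztest comes from statsmodels and the 0.05 threshold stands in for self.confidence_level):

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)                    # requires a recent NumPy
y = 0.05 * np.arange(100) + rng.normal(size=100)  # synthetic upward trend
if y.shape[0] % 2 != 0:                           # drop one point if length is odd
    y = y[:-1]
first_half, second_half = np.split(y, indices_or_sections=2)
diffs = np.sort(second_half) - np.sort(first_half)
# With no trend, the sorted halves match and the differences center on zero.
_, p_score = stats.ttest_1samp(diffs, 0.0)
print("trend detected:", p_score <= 0.05)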
Example #4
def approximate_random_effects(data, labels, group):

    correlation_per_donor = {}
    for donor_id in set(data[group]):
        correlation_per_donor[donor_id], _, _, _, _ = linregress(list(data[labels[0]][data[group] == donor_id]),list(data[labels[1]][data[group] == donor_id]))
    average_slope = np.array(correlation_per_donor.values()).mean()
    t, p_val = ttest_1samp(correlation_per_donor.values(), 0)
    print "Averaged slope across donors = %g (t=%g, p=%g)"%(average_slope, t, p_val)    
    return average_slope, t, p_val
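A hypothetical call, assuming data is a pandas DataFrame with one row per sample (the example body itself uses Python 2 print syntax; the column names below are made up):

import pandas as pd
df = pd.DataFrame({'zscore':     [0.1, 0.3, -0.2, 0.2, 0.5, 0.4],
                   'expression': [1.0, 1.2,  0.9, 1.1, 1.4, 1.3],
                   'donor':      ['d1', 'd1', 'd1', 'd2', 'd2', 'd2']})
# labels[0] is the x variable, labels[1] the y variable of each regression.
slope, t, p = approximate_random_effects(df, ['zscore', 'expression'], 'donor')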
Example #5
def calculate_gene_expression_similarity(reduced_stat_map_data, mask="full"):
    store_file = "/ahba_data/store_max1_reduced.h5"
    subcortex_mask = "/ahba_data/subcortex_mask.npy"

    results_dfs = []
    with pd.HDFStore(store_file, 'r') as store:
        for donor_id in store.keys():
            print "Loading expression data (%s)" % donor_id
            expression_data = store.get(donor_id.replace(".", "_"))

            print "Getting statmap values (%s)" % donor_id
            nifti_values = reduced_stat_map_data[expression_data.columns]

            print "Removing missing values (%s)" % donor_id
            na_mask = np.isnan(nifti_values)
            if mask == "subcortex":
                na_mask = np.logical_or(na_mask,
                    np.isnan(np.load(subcortex_mask)[expression_data.columns]))
            elif mask == "cortex":
                na_mask = np.logical_or(na_mask, np.logical_not(np.isnan(
                    np.load(subcortex_mask)[expression_data.columns])))
            else:
                assert mask == "full"

            nifti_values = np.array(nifti_values)[np.logical_not(na_mask)]
            expression_data.drop(expression_data.columns[na_mask], axis=1, inplace=True)

            print "z scoring (%s)" % donor_id
            expression_data = pd.DataFrame(zscore(expression_data, axis=1), columns=expression_data.columns,
                                           index=expression_data.index)
            nifti_values = zscore(nifti_values)

            print "Calculating linear regressions (%s)" % donor_id
            regression_results = np.linalg.lstsq(np.c_[nifti_values, np.ones_like(nifti_values)], expression_data.T)
            results_df = pd.DataFrame({"slope": regression_results[0][0]}, index=expression_data.index)

            results_df.columns = pd.MultiIndex.from_tuples([(donor_id[1:], c,) for c in results_df.columns],
                                                           names=['donor_id', 'parameter'])

            results_dfs.append(results_df)

        print "Concatenating results"
        results_df = pd.concat(results_dfs, axis=1)
        del results_dfs

    t, p = ttest_1samp(results_df, 0.0, axis=1)
    group_results_df = pd.DataFrame({"t": t, "p": p}, columns=['t', 'p'], index=expression_data.index)
    _, group_results_df["p (FDR corrected)"], _, _ = multipletests(group_results_df.p, method='fdr_bh')
    group_results_df["variance explained (mean)"] = (results_df.xs('slope', axis=1, level=1) ** 2 * 100).mean(axis=1)
    group_results_df["variance explained (std)"] = (results_df.xs('slope', axis=1, level=1) ** 2 * 100).std(axis=1)
    del results_df
    probe_info = pd.read_csv("/ahba_data/probe_info_max1.csv", index_col=0).drop(['chromosome', "gene_id"], axis=1)
    group_results_df = group_results_df.join(probe_info)
    group_results_df = group_results_df[["gene_symbol", "entrez_id.1", "gene_name","t", "p", "p (FDR corrected)",
                                         "variance explained (mean)", "variance explained (std)"]]

    return group_results_df
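A hypothetical invocation; the function expects a vector that can be indexed by the AHBA sample columns stored in the fixed HDF5 paths above, so the array below is only a placeholder:

import numpy as np
reduced_map = np.random.rand(100000)   # placeholder reduced stat-map values
gene_df = calculate_gene_expression_similarity(reduced_map, mask="cortex")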
Example #6
def approximate_random_effects(data, labels, group):
    slope_per_donor = np.array([])
    rval_per_donor = np.array([])
    #print "Performing approximate random effect analysis..."
    for donor_id in set(
            data[group]):  #for donor_id in donorids, perform linear regression
        #print "Total usable datapoints of donor %s: %d" % (donor_id, len(list(data[labels[0]][data[group] == donor_id]))) #shows usable datapoints per donor
        slope, _, rval, p_val, stderr = linregress(
            list(data[labels[0]][data[group] == donor_id]),
            list(data[labels[1]][data[group] == donor_id]))
        slope_per_donor = np.append(slope_per_donor, slope)
        rval_per_donor = np.append(rval_per_donor, rval)

    #average_slope = round(slope_per_donor.mean(),6) #get mean r-value across donors
    #average_rval = round(rval_per_donor.mean(),6) #get mean r-value across donors
    average_slope = round(np.nanmean(slope_per_donor),
                          6)  #get mean slope across donors
    average_rval = round(np.nanmean(rval_per_donor),
                         6)  #get mean r-value across donors
    t_value, p_value = ttest_1samp(
        slope_per_donor,
        0)  #t-test (redundant information for downstream analyses)
    with open(output_file, 'a') as f:  #saving full data to .csv
        w = csv.writer(f)
        #print "Saving the analysis results..."
        w.writerow([
            gene, average_rval, average_slope, rval_per_donor[0],
            rval_per_donor[1], rval_per_donor[2], rval_per_donor[3],
            rval_per_donor[4], rval_per_donor[5], t_value, p_value
        ])

    with open(output_file_GSEA, 'a') as f:  #saving GSEA input data to .csv
        w = csv.writer(f, delimiter='\t')
        #print "Saving to GSEA input file..."
        w.writerow([gene, average_rval])

    #Scatterplot of gene expression against reverse inference fMRI map z-score
    print "Plotting the correlation graph..."
    ax = sns.lmplot(labels[0],
                    labels[1],
                    data,
                    hue=group,
                    legend=True,
                    fit_reg=True)  #comment-out for no plotting
    ax.set(xlabel="%s map z-score value" % (cog_function.capitalize()))
    ax = plot.title(gene)
    print "Saving the correlation graph..."
    plot.savefig(plot_pdf, format='pdf')
    plot.close()
    return
Example #7
def approximate_random_effects(data, labels, group):
    correlation_per_donor = {}
    for donor_id in set(data[group]):
        correlation_per_donor[donor_id], _, _, _, _ = linregress(list(data[labels[0]][data[group] == donor_id]),
                                                                 list(data[labels[1]][data[group] == donor_id]))
    average_slope = np.array(correlation_per_donor.values()).mean()
    t, p_val = ttest_1samp(correlation_per_donor.values(), 0)
    print "Averaged slope across donors = %g (t=%g, p=%g)" % (average_slope, t, p_val)
    sns.violinplot([correlation_per_donor.values()], inner="points", names=["donors"])
    plt.ylabel("Linear regression slopes between %s and %s" % (labels[0], labels[1]))
    plt.axhline(0, color="red")

    sns.lmplot(labels[0], labels[1], data, hue=group, col=group, col_wrap=3)
    plt.show()

    return average_slope, t, p_val
Example #8
def approximate_random_effects(data, labels, group):

    correlation_per_donor = {}
    for donor_id in set(data[group]):
        correlation_per_donor[donor_id], _, _, _, _ = linregress(list(data[labels[0]][data[group] == donor_id]),
                                                       list(data[labels[1]][data[group] == donor_id]))
    average_slope = np.array(correlation_per_donor.values()).mean()
    t, p_val = ttest_1samp(correlation_per_donor.values(), 0)
    print "Averaged slope across donors = %g (t=%g, p=%g)"%(average_slope, t, p_val)
    sns.violinplot([correlation_per_donor.values()], inner="points", names=["donors"])
    plt.ylabel("Linear regression slopes between %s and %s"%(labels[0],labels[1]))
    plt.axhline(0, color="red")
    
    sns.lmplot(labels[0], labels[1], data, hue=group, col=group, col_wrap=3)
    plt.show()
    
    return average_slope, t, p_val
Example #9
def roc_stats():
    path = "/root/robbis/fmri/carlo_ofp/0_results/"
    import glob
    ##### ROC t-test ####
    
    rocfile = glob.glob(os.path.join(path, "0_roc*total.nii.gz"))
    rocimg = ni.load(rocfile[0])
    
    t, p = ttest_1samp(rocimg.get_data(), 0.5, axis=3)
    q = np.zeros_like(p)
    q[np.logical_not(p == 0)] = 1 - p[np.logical_not(p == 0)]
    t[np.isinf(t)] = 0
    
    p[np.isnan(p)] = 0
    
    
    ni.save(ni.Nifti1Image(q, rocimg.affine), os.path.join(path, "0_roc_ttest_q.nii.gz"))
    ni.save(ni.Nifti1Image(p, rocimg.affine), os.path.join(path, "0_roc_ttest_p.nii.gz"))
    ni.save(ni.Nifti1Image(t, rocimg.affine), os.path.join(path, "0_roc_ttest_t.nii.gz"))
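The q map written above is just 1 - p wherever p is nonzero; a standalone sketch of that masking on synthetic shapes:

import numpy as np
p = np.random.rand(4, 4, 4, 1)   # placeholder p-value volume
q = np.zeros_like(p)
nonzero = np.logical_not(p == 0)
q[nonzero] = 1 - p[nonzero]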
Example #10
    def transform(self, labels):

        ds = self.dataset
        conditions = self.conditions
        single_value = self.sample_value

        if conditions == single_value == None:
            raise ValueError()
        elif len(conditions) > 2:
            raise ValueError()

        if single_value != None:
            t, p = ttest_1samp(ds, single_value, axis=0)
            return t, p

        t, p = ttest_ind(ds[labels == conditions[0]],
                         ds[labels == conditions[1]],
                         axis=0)
        #print ds.shape
        #print t.shape
        t[np.isnan(t)] = 1
        return t
Example #11
    def run(self, labels):

        ds = self.dataset
        conditions = self.conditions
        single_value = self.sample_value

        if conditions == single_value == None:
            raise ValueError()
        elif len(conditions) > 2:
            raise ValueError()

        if single_value != None:
            t, p = ttest_1samp(ds, single_value, axis=0)
            return t, p

        t, p = ttest_ind(ds[labels == conditions[0]],
                         ds[labels == conditions[1]],
                         axis=0)
        #print ds.shape
        #print t.shape
        t[np.isnan(t)] = 1
        return t
Example #12
# read the data frame
df = pd.read_csv('data_hier.csv')

# approximate_random_effects
#slope[0], intercept[1], r_value[2], p_value[3], std_err[4]
correlation_per_study = {}
#correlation_per_study = df.groupby('study_id').apply(lambda v: linregress(v.age, v.gain_avg_dec_thr)[0])
correlation_per_study = df.groupby('study_id').apply(
    lambda v: linregress(v.age, v.loss_avg_dec_thr)[0])

correlation_df = correlation_per_study.reset_index()
correlation_df.columns = ['study_id', 'slope']

average_slope = np.mean(correlation_df.slope)
t, p_val = ttest_1samp(correlation_df.slope, 0)

print "Averaged slope across studies = %g (t=%g, p=%g)" % (average_slope, t,
                                                           p_val)
sns.violinplot(correlation_df.slope, inner="points", names=["studies"])
plt.ylabel("Linear regression slopes between age and decision threshold")
plt.axhline(0, color="red")

#sns.lmplot('age', 'gain_avg_dec_thr', data=df, hue='study_id', col='study_id', col_wrap=3)
sns.lmplot('age',
           'loss_avg_dec_thr',
           data=df,
           hue='study_id',
           col='study_id',
           col_wrap=3)
plt.show()
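The groupby/apply idiom above yields one regression slope per study; a minimal standalone sketch with synthetic columns:

import pandas as pd
from scipy.stats import linregress, ttest_1samp
toy = pd.DataFrame({'study_id': ['a'] * 4 + ['b'] * 4,
                    'age': [20, 30, 40, 50] * 2,
                    'loss_avg_dec_thr': [1.0, 1.2, 1.5, 1.6, 0.9, 1.1, 1.4, 1.7]})
slopes = toy.groupby('study_id').apply(lambda v: linregress(v.age, v.loss_avg_dec_thr)[0])
t_stat, p_val = ttest_1samp(slopes, 0)   # slopes is a Series, one entry per study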
Example #13
def plotExpBox_Main(inputFiles,headers,valcols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes,trimToMinSize,relabels):

	#if plotPvalueCluster:
		#if pvalue cluster is needed:
	#	from Bio.Cluster.cluster import *
	#	from Bio.Cluster import *
		#endif



	#the real deal!
	plotData=[]	
	xtickLabels=[]
	
	
	minSize=-1

	for inputFile,header,cols in zip(inputFiles,headers,valcols):
		fin=generic_istream(inputFile)
		
		startIdx=len(plotData)

		for col in cols:
			plotData.append([])
			xtickLabels.append(header[col])

		colIndices=range(startIdx,startIdx+len(cols))

		lino=0
		for lin in fin:
			lino+=1
			if lino<startRow:
				continue		
			fields=lin.rstrip("\r\n").split(sep)
		
			for idx,col in zip(colIndices,cols):
				try:
					
					value=float(fields[col])			
					plotData[idx].append(value)
					
				except:
					pass		
		fin.close()
	
		
		if minSize==-1:
			minSize=len(plotData[idx]) #or startIDX?
		else:
			minSize=min([minSize,len(plotData[idx])])
		

	if trimToMinSize:
		print >> stderr,"trimming to min size =",minSize
		trimData(plotData,minSize)

	if len(relabels)>0:
		if len(relabels)!=len(xtickLabels):
			print >> stderr,"relabels doesn't have the same length as original label vectors",xtickLabels,"=>",relabels
			exit()

		xtickLabels=relabels
		
	

	print >> stdout,"student t-test (1 sample; mean=0)"
	print >> stdout,"sample","mean","p-val"


	for x in range(0,len(plotData)):
		#print >> stderr, len(plotData[x])
		print >> stdout, xtickLabels[x],mean(plotData[x]),ttest_1samp(plotData[x],0)[1]


	pvalueM=[]
	
	print >> stdout,""
	
	print >> stdout,"student t-test (2 samples)"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	
	print >> stdout,""

	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
			else:
				pvalue=ttest_ind(plotData[x],plotData[y])[1]
				print >> stdout, str(pvalue),
				pvalueRow.append(pvalue)
		print >> stdout,"";	

	
	print >> stdout,""

	


	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster)

	pvalueM=[]

	print >> stdout,"welch t-test"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	
	print >> stdout,""
	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
					
			else:
				pvalue=welchs_approximate_ttest_arr(plotData[x],plotData[y])[3]
				print >> stdout, str(pvalue),
				pvalueRow.append(pvalue)
		print >> stdout,"";

	

	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster)

	
	print >> stdout,""
	print >> stdout,"non-parametric (Mann-Whitney U)" #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	

	pvalueM=[]

	print >> stdout,""
	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
			else:
				#if max(len(plotData[x]),len(plotData[y]))<=20:
				pvalue=mannwhitneyu(plotData[x],plotData[y])[1]*2
				print >> stdout,pvalue, #Mann-Whitney p is one-tailed; multiply by 2 for two-tailed
				pvalueRow.append(pvalue)
				#else:
				#	print >> stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already
		print >> stdout,""
	
	

	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster)
	
	figure(figsize=figsz)
		
	if len(titl)==0:
		titl=outputFile

	plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes)
	#ylim([0,200])

	savefig(outputFile,bbox_inches="tight")
Example #14
def plotExpBox_Main(inputFile,header,cols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes):

	#if plotPvalueCluster:
		#if pvalue cluster is needed:
	#	from Bio.Cluster.cluster import *
	#	from Bio.Cluster import *
		#endif



	#the real deal!


	fin=generic_istream(inputFile)
	
	plotData=[]	
	xtickLabels=[]
	for col in cols:
		plotData.append([])
		xtickLabels.append(header[col])

	colIndices=range(0,len(cols))

	lino=0
	for lin in fin:
		lino+=1
		if lino<startRow:
			continue		
		fields=lin.rstrip("\r\n").split(sep)
		
		for idx,col in zip(colIndices,cols):
			try:
				value=float(fields[col])			
				plotData[idx].append(value)
			except:
				pass		
	fin.close()

	print >> stdout,"student t-test (1 sample; mean=0)"
	print >> stdout,"sample","mean","p-val"


	for x in range(0,len(plotData)):
		print >> stdout, xtickLabels[x],mean(plotData[x]),ttest_1samp(plotData[x],0)[1]


	pvalueM=[]
	
	print >> stdout,""
	
	print >> stdout,"student t-test (2 samples)"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	
	print >> stdout,""

	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
			else:
				pvalue=ttest_ind(plotData[x],plotData[y])[1]
				print >> stdout, str(pvalue),
				pvalueRow.append(pvalue)
		print >> stdout,"";	

	
	print >> stdout,""

	makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM)


	if plotPvalueCluster:
		makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster)

	pvalueM=[]

	print >> stdout,"welch t-test"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	
	print >> stdout,""
	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
					
			else:
				pvalue=welchs_approximate_ttest_arr(plotData[x],plotData[y])[3]
				print >> stdout, str(pvalue),
				pvalueRow.append(pvalue)
		print >> stdout,"";

	makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM)

	if plotPvalueCluster:
		makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster)

	
	print >> stdout,""
	print >> stdout,"non-parametric (Mann-Whitney U)" #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	

	pvalueM=[]

	print >> stdout,""
	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
			else:
				#if max(len(plotData[x]),len(plotData[y]))<=20:
				pvalue=mannwhitneyu(plotData[x],plotData[y])[1]*2
				print >> stdout,pvalue, #Mann-Whitney p is one-tailed; multiply by 2 for two-tailed
				pvalueRow.append(pvalue)
				#else:
				#	print >> stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already
		print >> stdout,""
	
	makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM)

	if plotPvalueCluster:
		makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster)
	
	figure(figsize=figsz)
		
	if len(titl)==0:
		titl=outputFile

	plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes)
	#ylim([0,200])

	savefig(outputFile,bbox_inches="tight")
Example #15
results = np.zeros((REPEATS, ))
print("Stochastic repeats:")
for i in range(REPEATS):
    print("{}..".format(i + 1), end=" ", flush=True)
    results[i] = solver.solve(prob).getDistance() / 1000

print("\nDone!")

mono = results.copy()
mono[0] = min([mono[0], greedyDistance])
for i in range(1, REPEATS):
    mono[i] = min([mono[i], mono[i - 1]])

from matplotlib import pyplot as plt

plt.plot(range(1, REPEATS + 1), mono)
plt.plot(range(1, REPEATS + 1), [greedyDistance for x in mono])
plt.legend(['stochastic', 'greedy best first'])
plt.axis([0, REPEATS, 0.9 * mono.min(), 1.1 * mono.max()])
#plt.legend([[greedyDistance for x in mono],mono],['greedy best first','stochastic'])
plt.title('Length of solution by Number of Repeats')
plt.xlabel('#Repeats')
plt.ylabel('Solution Length')
plt.grid(color='gray', linestyle=':', linewidth=1)
plt.show()

# TODO : Part2 - Remove the exit and perform the t-test
mean_result = np.mean(results)
std_result = np.std(results)
tmp, pvalue = stats.ttest_1samp(results, greedyDistance)
print(mean_result, std_result, pvalue)
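A short follow-up sketch for interpreting the one-sample result; the 0.05 threshold is an assumption, not part of the original:

alpha = 0.05   # assumed significance level
if pvalue < alpha:
    print("Mean stochastic distance differs significantly from the greedy baseline.")
else:
    print("No significant difference from the greedy baseline at alpha =", alpha)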
Example #16
def summarise(projects):
    
    summDB = PDatabase(local='summary.fs')
    C = CorrelationAnalyser()
    figs = []
    for f in range(4):
        figs.append(plt.figure())
    
    gs = gridspec.GridSpec(5, 5, wspace=0.3, hspace=0.5)    
    i=0
    data=[]    
    print 'processing %s projects' %len(projects)
    for p in projects:
        print 'structure:',p
        DB = PDatabase(local=os.path.join(savepath,p))
        S = PEATTableModel(DB)           
        
        try:
            exp,pre = S.getColumns(['Exp','prediction'],allowempty=False)
            errs = [j[0]-j[1] for j in zip(exp,pre)]
        except:
            print 'no results'
            continue
            
        #DB.close()
        #add link to proj
        summDB.add(p)
        summDB.addField('project',fieldtype='Project')
        summDB[p]['project'] = {'server':'enzyme.ucd.ie','username':'******',
                              'project':p,'password':'******','port':'8080'}
        print summDB.isChanged()
        #stats
        cc,rmse,meanerr = C.getStats(pre,exp)
        #ttest for mean errs 0        
        ttp = round(stats.ttest_1samp(errs, 0)[1],2)
        #normality of errs
        w,swp = C.ShapiroWilk(errs)
        x={'name':p,'mutants':len(pre),'rmse':rmse,'corrcoef':cc,'meanerr':meanerr,
           'ttest':ttp,'shapirowilk':swp}
           
        '''ax = figs[0].add_subplot(gs[0, i])
        C.plotCorrelation(pre,exp,title=p,ms=2,axeslabels=False,ax=ax)
        ax = figs[1].add_subplot(gs[0, i])
        C.showHistogram([pre,exp],title=p,labels=['pre','exp'],ax=ax)                
        ax = figs[2].add_subplot(gs[0, i])
        C.plotNorm(errs,title=p,lw=1,ax=ax)
        #qqplot
        ax = figs[3].add_subplot(gs[0, i])
        C.QQplot(errs,title=p,ax=ax)'''
        
        #get PDB info
        parser = PDBParser()
        descr = parser.getDescription(p)
        x.update(descr)
        data.append(x)       
        i+=1              
        
    summDB.importDict(data)
    print summDB.isChanged()
    summDB.commit()    
    
    #add all peatsa jobs to summary proj also
    '''print 'adding peatsa job info'
    PS = PEATSAPlugin()
    PS.main(DB=summDB)
    #summDB.meta.peatsa_jobs = None
    #from ZODB.PersistentMapping import PersistentMapping
    #summDB.meta.peatsa_jobs = PersistentMapping()    
    PS.checkJobsDict()
    PS.jobManager.stopLogging()
    for p in projects:
        #print summDB.meta
        DB = PDatabase(local=os.path.join(savepath,p))
        job = DB.meta.peatsa_jobs['mycalc']
        summDB.meta.peatsa_jobs[p] = job
        print job
        #DB.close()
    print summDB.isChanged()
    print summDB.meta.peatsa_jobs
    summDB.commit()'''

    #for i in range(len(figs)):
    #    figs[i].savefig('fig%s.png' %i)
    #plt.show()
        
    return
Example #17
def plotExpBox_Main(inputFiles,headers,valcols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes,trimToMinSize,relabels,logb,plotHistogramToFile,plotMedianForGroups,botta):

	#if plotPvalueCluster:
		#if pvalue cluster is needed:
	#	from Bio.Cluster.cluster import *
	#	from Bio.Cluster import *
		#endif



	#the real deal!
	plotData=[]	
	xtickLabels=[]
	
	
	minSize=-1

	for inputFile,header,cols in zip(inputFiles,headers,valcols):
		fin=generic_istream(inputFile)
		
		startIdx=len(plotData)

		for col in cols:
			plotData.append([])
			xtickLabels.append(header[col])

		colIndices=range(startIdx,startIdx+len(cols))

		lino=0
		for lin in fin:
			lino+=1
			if lino<startRow:
				continue		
			fields=lin.rstrip("\r\n").split(sep)
		
			for idx,col in zip(colIndices,cols):
				try:
					
					value=float(fields[col])
					if logb!=0:
						if value==0.0:
							raise ValueError
						value=log(value)/logb							
					plotData[idx].append(value)
					
				except:
					pass		
		fin.close()
	
		
		if minSize==-1:
			minSize=len(plotData[idx]) #or startIDX?
		else:
			minSize=min([minSize,len(plotData[idx])])
		

	if trimToMinSize:
		print >> stderr,"trimming to min size =",minSize
		trimData(plotData,minSize)

	if len(relabels)>0:
		#if len(relabels)!=len(xtickLabels):
		#	print >> stderr,"relabels doesn't have the same length as original label vectors",xtickLabels,"=>",relabels
		#	exit()
		print >> stderr,xtickLabels
		print >> stderr,relabels
		for i,relabel in zip(range(0,len(relabels)),relabels):
			xtickLabels[i]=relabel
		
	
	for i in range(0,len(plotMedianForGroups)):
		plotMedianForGroups[i]=getCol0ListFromCol1ListStringAdv(xtickLabels,plotMedianForGroups[i])
			
	
	#drawing medians:
	medianToDraw=[]
	for mediangrouper in plotMedianForGroups:
		curD=[]		
		for c in mediangrouper:
			curD.extend(plotData[c])
		medianToDraw.append(median(curD))


	for c in range(len(plotData)-1,-1,-1):
		if len(plotData[c])==0:
			print >> stderr,xtickLabels[c],"discarded"
			del plotData[c]
			del xtickLabels[c]


	print >> stdout,"student t-test (1 sample; mean=0)"
	print >> stdout,"sample","mean","p-val"


	for x in range(0,len(plotData)):
		#print >> stderr, len(plotData[x])
		try:
			print >> stdout, xtickLabels[x],mean(plotData[x]),ttest_1samp(plotData[x],0)[1]
		except:
			print >> stdout, xtickLabels[x],"NA","NA"

	pvalueM=[]
	
	print >> stdout,""
	
	print >> stdout,"student t-test (2 samples)"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	
	print >> stdout,""

	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
			else:
				try:
					pvalue=ttest_ind(plotData[x],plotData[y])[1]
				except:
					pvalue=1.0

				print >> stdout, str(pvalue),
				pvalueRow.append(pvalue)
		print >> stdout,"";	

	
	print >> stdout,""

	


	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster)

	pvalueM=[]

	print >> stdout,"welch t-test"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	
	print >> stdout,""
	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
					
			else:
				pvalue=welchs_approximate_ttest_arr(plotData[x],plotData[y])[3]
				print >> stdout, str(pvalue),
				pvalueRow.append(pvalue)
		print >> stdout,"";

	

	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster)

	
	print >> stdout,""
	print >> stdout,"non-parametric (Mann-Whitney U)" #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	

	pvalueM=[]

	print >> stdout,""
	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
			else:
				#if max(len(plotData[x]),len(plotData[y]))<=20:
				try:
					pvalue=mannwhitneyu(plotData[x],plotData[y])[1]*2
				except:
					pvalue=1.0
				print >> stdout,pvalue, #Mann-Whitney p is one-tailed; multiply by 2 for two-tailed
				pvalueRow.append(pvalue)
				#else:
				#	print >> stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already
		print >> stdout,""
	
	

	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster)
	
	figure(figsize=figsz)
	subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)
	
	if len(titl)==0:
		titl=outputFile


	plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes)
	
	#ylim([0,200])
	for m in medianToDraw:
		axhline(y=m,linestyle=':',color='gray')

	savefig(outputFile,bbox_inches="tight")

	if len(plotHistogramToFile)>0:
		drawHistogram(plotHistogramToFile,plotData,xtickLabels)
Example #18
def plotExpBox_Main(inputFiles, headers, valcols, outputFile, sep, startRow,
                    showIndPoints, mark, markMean, showMean, notch, whisker,
                    outliers, plotPvalueCluster, outputClusterPrefix,
                    methodCluster, xlegendrotation, xlabe, ylabe, figsz, titl,
                    showSampleSizes, trimToMinSize, relabels, logb,
                    plotHistogramToFile, plotMedianForGroups, botta,
                    showViolin, showBox, firstColAnnot, plotTrend, showLegend,
                    makePzfxFile, makeBinMatrix, writeDataSummaryStat,
                    summaryStatRange, minuslog10pvalue, minNDataToKeep,
                    vfacecolor, valpha, outXYZPvalues, dividePlots):

    #if plotPvalueCluster:
    #if pvalue cluster is needed:
    #	from Bio.Cluster.cluster import *
    #	from Bio.Cluster import *
    #endif

    #the real deal!
    plotData = []
    xtickLabels = []

    trendData = {}
    annot = {}

    minSize = -1

    for inputFile, header, cols in zip(inputFiles, headers, valcols):
        fin = generic_istream(inputFile)

        startIdx = len(plotData)

        if firstColAnnot:
            colAnnot = cols[0]
            cols = cols[1:]
            annotThisFile = []
            annot[startIdx] = annotThisFile
        else:
            colAnnot = -1
            annotThisFile = None

        for col in cols:
            plotData.append([])
            xtickLabels.append(header[col])

        colIndices = range(startIdx, startIdx + len(cols))

        if plotTrend:
            #print >> stderr,"plotTrend"
            trendDataThisFile = []
            trendData[startIdx] = trendDataThisFile
        else:
            trendDataThisFile = None

        lino = 0
        for lin in fin:
            lino += 1
            if lino < startRow:
                continue
            fields = lin.rstrip("\r\n").split(sep)

            if plotTrend:
                #print >> stderr,"a"
                trendDataThisLine = []
            else:
                trendDataThisLine = None

            allDataOKThisLine = True

            if colAnnot >= 0:
                annotThisFile.append(fields[colAnnot])

            for idx, col in zip(colIndices, cols):
                try:
                    value = float(fields[col])
                    if logb != 0:
                        if value == 0.0:
                            raise ValueError
                        value = log(value) / logb
                    plotData[idx].append(value)

                    if plotTrend:
                        trendDataThisLine.append(value)
                        #print >> stderr,"value:",value

                except:
                    allDataOKThisLine = False

            if plotTrend:
                if allDataOKThisLine:
                    trendDataThisFile.append(trendDataThisLine)
                else:
                    trendDataThisFile.append(None)

        fin.close()

        if minSize == -1:
            minSize = len(plotData[idx])  #or startIDX?
        else:
            minSize = min([minSize, len(plotData[idx])])

    if trimToMinSize:
        print >> stderr, "trimming to min size =", minSize
        trimData(plotData, minSize)

    if len(relabels) > 0:
        #if len(relabels)!=len(xtickLabels):
        #	print >> stderr,"relabels doesn't have the same length as original label vectors",xtickLabels,"=>",relabels
        #	exit()
        print >> stderr, xtickLabels
        print >> stderr, relabels
        for i, relabel in zip(range(0, len(relabels)), relabels):
            xtickLabels[i] = relabel

    for i in range(0, len(plotMedianForGroups)):
        plotMedianForGroups[i] = getCol0ListFromCol1ListStringAdv(
            xtickLabels, plotMedianForGroups[i])

    #drawing medians:
    medianToDraw = []
    for mediangrouper in plotMedianForGroups:
        curD = []
        for c in mediangrouper:
            curD.extend(plotData[c])
        medianToDraw.append(median(curD))

    for c in range(len(plotData) - 1, -1, -1):
        if len(plotData[c]) < minNDataToKeep:
            print >> stderr, xtickLabels[c], "discarded because has only", len(
                plotData[c]), "data points <", minNDataToKeep
            del plotData[c]
            del xtickLabels[c]

    if not skipStat:
        print >> stdout, "student t-test (1 sample; mean=0)"
        print >> stdout, "sample", "mean", "p-val", "median"

        if writeDataSummaryStat:
            fDSS = open(writeDataSummaryStat, "w")
            print >> fDSS, "sample\tmean\tvar\tsd\tmin\tmax\tN\tNInRange[" + str(
                summaryStatRange[0]) + "," + str(
                    summaryStatRange[1]
                ) + "]\t%NInRange\tNbelowRange\t%Nbelow\tNAboveRange\t%NAbove"

        for x in range(0, len(plotData)):
            #print >> stderr, len(plotData[x])
            try:
                print >> stdout, xtickLabels[x], mean(
                    plotData[x]), ttest_1samp(plotData[x],
                                              0)[1], median(plotData[x])
            except:
                print >> stdout, xtickLabels[x], mean(
                    plotData[x]), "NA", median(plotData[x])

            if writeDataSummaryStat:
                sumData, N, NIN, NBelow, NAbove = filterDataInRangeInclusive(
                    plotData[x], summaryStatRange[0], summaryStatRange[1])

                if NIN > 1:
                    #print >> stderr,"sumData=",sumData
                    #print >> stderr,mean
                    mea = mean2(sumData)
                    DDOF = 1
                    sd = std(sumData, ddof=DDOF)
                    var = sd * sd
                    mi = min(sumData)
                    ma = max(sumData)
                else:
                    mea = "NA"
                    sd = "NA"
                    var = "NA"
                    mi = "NA"
                    ma = "NA"

                print >> fDSS, xtickLabels[x] + "\t" + str(mea) + "\t" + str(
                    var) + "\t" + str(sd) + "\t" + str(mi) + "\t" + str(
                        ma) + "\t" + str(N) + "\t" + str(NIN) + "\t" + str(
                            float(NIN) * 100 /
                            N) + "\t" + str(NBelow) + "\t" + str(
                                float(NBelow) * 100 /
                                N) + "\t" + str(NAbove) + "\t" + str(
                                    float(NAbove) * 100 / N)

        pvalueM = []

        if writeDataSummaryStat:
            fDSS.close()

        print >> stdout, ""

        print >> stdout, "student t-test (2 samples)"
        print >> stdout, "p-val",
        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        print >> stdout, ""

        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    try:
                        pvalue = ttest_ind(plotData[x], plotData[y])[1]
                    except:
                        pvalue = 1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, str(pvalue),
                    pvalueRow.append(pvalue)
            print >> stdout, ""

        print >> stdout, ""

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_t_raw", xtickLabels,
                              pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_t", xtickLabels,
                                  pvalueM, methodCluster)

        pvalueM = []

        print >> stdout, "welch t-test"
        print >> stdout, "p-val",
        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])

                else:
                    try:
                        pvalue = welchs_approximate_ttest_arr(
                            plotData[x], plotData[y])[3]
                    except:
                        pvalue = 1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, str(pvalue),
                    pvalueRow.append(pvalue)
            print >> stdout, ""

        if outXYZPvalues:
            writeXYZPvalues(outXYZPvalues + "_Welch.xyz", xtickLabels, pvalueM)

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_Welch_raw", xtickLabels,
                              pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_Welch", xtickLabels,
                                  pvalueM, methodCluster)

        print >> stdout, ""
        print >> stdout, "non-parametric (Mann-Whitney U)"  #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)"
        print >> stdout, "p-val",
        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        pvalueM = []

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    #if max(len(plotData[x]),len(plotData[y]))<=20:
                    try:
                        pvalue = mannwhitneyu(plotData[x], plotData[y])[1] * 2
                    except:
                        pvalue = 1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, pvalue,  #Mann-Whitney p is one-tailed; multiply by 2 for two-tailed
                    pvalueRow.append(pvalue)
                    #else:
                    #	print >> stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already
            print >> stdout, ""

        if outXYZPvalues:
            writeXYZPvalues(outXYZPvalues + "_U.xyz", xtickLabels, pvalueM)

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_U_raw", xtickLabels,
                              pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_U", xtickLabels,
                                  pvalueM, methodCluster)

        #####now the variance tests

        print >> stdout, ""
        print >> stdout, "Ansari-Bradley Two-sample Test for difference in scale parameters "
        print >> stdout, "p-val",

        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        pvalueM = []

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    #if max(len(plotData[x]),len(plotData[y]))<=20:
                    try:
                        pvalue = ansari(plotData[x], plotData[y])[1]
                    except:
                        pvalue = "NA"

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                        #pvalue=1.0
                    print >> stdout, pvalue,
                    pvalueRow.append(pvalue)
                    #else:
                    #	print >> stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already
            print >> stdout, ""

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_Ansari_raw", xtickLabels,
                              pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_Ansari", xtickLabels,
                                  pvalueM, methodCluster)

        #####

        #####now the variance tests

        print >> stdout, ""
        print >> stdout, "Fligner's Two-sample Test for equal variance (non-parametrics)"
        print >> stdout, "p-val",

        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        pvalueM = []

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    #if max(len(plotData[x]),len(plotData[y]))<=20:
                    try:
                        pvalue = fligner(plotData[x], plotData[y])[1]
                    except:
                        pvalue = "NA"
                        #pvalue=1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, pvalue,
                    pvalueRow.append(pvalue)
                    #else:
                    #	print >> stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already
            print >> stdout, ""

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_fligner_raw",
                              xtickLabels, pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_fligner",
                                  xtickLabels, pvalueM, methodCluster)

        #####

        #####now the variance tests

        print >> stdout, ""
        print >> stdout, "Levene's Two-sample Test for equal variance"
        print >> stdout, "p-val",

        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        pvalueM = []

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    #if max(len(plotData[x]),len(plotData[y]))<=20:
                    try:
                        pvalue = levene(plotData[x], plotData[y])[1]
                    except:
                        pvalue = "NA"
                        #pvalue=1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, pvalue,
                    pvalueRow.append(pvalue)
                    #else:
                    #	print >> stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already
            print >> stdout, ""

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_levene_raw", xtickLabels,
                              pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_levene", xtickLabels,
                                  pvalueM, methodCluster)

        #####

        #####now the variance tests

        print >> stdout, ""
        print >> stdout, "Bartlett's Two-sample Test for equal variance (for normal distributions)"
        print >> stdout, "p-val",

        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        pvalueM = []

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    #if max(len(plotData[x]),len(plotData[y]))<=20:
                    try:
                        pvalue = bartlett(plotData[x], plotData[y])[1]
                    except:
                        pvalue = "NA"
                        #pvalue=1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, pvalue,
                    pvalueRow.append(pvalue)
                    #else:
                    #	print >> stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already
            print >> stdout, ""

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_bartlett_raw",
                              xtickLabels, pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_bartlett",
                                  xtickLabels, pvalueM, methodCluster)

        #####

    figure(figsize=figsz)
    subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)

    if len(titl) == 0:
        titl = outputFile

    plotExpBox(plotData, xtickLabels, showIndPoints, mark, markMean, showMean,
               notch, whisker, outliers, xlegendrotation, xlabe, ylabe, titl,
               showSampleSizes, showViolin, showBox, annot, trendData,
               showLegend, makePzfxFile, makeBinMatrix, dividePlots)

    #ylim([0,200])
    for m in medianToDraw:
        axhline(y=m, linestyle=':', color='gray')

    savefig(outputFile, bbox_inches="tight")

    if len(plotHistogramToFile) > 0:
        drawHistogram(plotHistogramToFile, plotData, xtickLabels)
        drawDensigram(plotHistogramToFile + ".density.png", plotData,
                      xtickLabels)
Example #19
def main(months=None, season="DJF", ax = None, clevels = None,
         labels = None, paths = None):
    if not months:
        months = [12, 1, 2]

    path_to_glaciers_land_sea_mask = "/b2_fs2/huziy/geophy_from_others/land_sea_glacier_mask_phy"

    land_sea_glaciers_mask = get_land_sea_glaciers_mask_from_geophysics_file(path=path_to_glaciers_land_sea_mask)
    p_current = "{0}-{1}".format(start_year_current, end_year_current)
    p_future = "{0}-{1}".format(start_year_future, end_year_future)

    lons2d = None
    lats2d = None

    x_index = None
    y_index = None
    mean_data = None

    b = None

    for the_path, label in zip(paths, labels):
        ds = Dataset(the_path)
        if lons2d is None:
            lons2d = ds.variables["longitude"][:]
            lats2d = ds.variables["latitude"][:]

            #b = get_arctic_basemap(lons2d, lats2d)
            b = get_arctic_basemap_nps(round = True)

            x_index = ds.variables["x_index"][:]
            y_index = ds.variables["y_index"][:]

        cache_file = "_".join([str(m) for m in months]) + "_{0}-{1}_{2}-{3}_{4}_mean_change_cache.bin".format(
            start_year_current, end_year_current, start_year_future, end_year_future, label)


        #os.remove(cache_file)
        if not os.path.isfile(cache_file):
            time_str = ds.variables["time"][:]
            times = [datetime.strptime("".join(t_s), TIME_FORMAT) for t_s in time_str]

            data = ds.variables["water_discharge_accumulated"][:]
            df = pandas.DataFrame(data=data, index=times)
            df["year"] = df.index.map(lambda d: d.year)
            df["month"] = df.index.map(lambda d: d.month)

            print(df.shape, df.columns)

            data_current = df.ix[
                           (df.year >= start_year_current) & (df.year <= end_year_current) & df.month.isin(months), :]
            print(data_current.columns)
            data_current = data_current.drop(["year", "month"], axis=1)
            seasonal_means_current = data_current.groupby(
                by=lambda d: d.year).mean()  #calculate mean for the season for each year

            data_future = df.ix[(df.year >= start_year_future) & (df.year <= end_year_future) & df.month.isin(months),
                          :]
            data_future = data_future.drop(["year", "month"], axis=1)
            seasonal_means_future = data_future.groupby(by=lambda d: d.year).mean()

            change = seasonal_means_future.values - seasonal_means_current.values

            mean_current = seasonal_means_current.values.mean(axis=0)
            mean_future = seasonal_means_future.values.mean(axis=0)

            ##axis0 - time, axis1 -  cell index

            mean_change = change.mean(axis=0)

            #print change[:,mean_change > 200000], seasonal_means_future.values[:,mean_change > 200000], seasonal_means_current.values[:,mean_change > 200000]

            t, pvalue = stats.ttest_1samp(change, 0, axis=0)

            data_map = {
                "current-mean": mean_current,
                "future-mean": mean_future,
                "change": mean_change,
                "p-value": pvalue
            }

            pickle.dump(data_map, open(cache_file, mode="wb"))  # pickle files need binary mode
        else:
            data_map = pickle.load(open(cache_file, "rb"))
            mean_change = data_map["change"]
            pvalue = data_map["p-value"]
            mean_current = data_map["current-mean"]

        if ax is None:
            plt.figure()

        to_plot = np.ma.masked_all_like(lons2d)

        #mask nonsignificant changes

        #change_arr_significant = np.ma.masked_where(pvalue > 1, mean_change)

        #mean_change[mean_change > levels[-1]] = levels[-1] + 10


        to_plot[x_index, y_index] = mean_change

        print(to_plot.min(), to_plot.max())
        print(pvalue.min(), pvalue.max())

        x, y = b(lons2d, lats2d)

        cmap = cm.get_cmap(name="bwr", lut=len(clevels) - 1)
        #cmap = my_colormaps.get_cmap_from_ncl_spec_file(path="colormap_files/BlueRed.rgb", ncolors=len(levels) - 1)
        #cmap.set_over(cmap(levels[-1]))

        bn = BoundaryNorm(clevels, cmap.N)

        #mask glaciers and oceans
        to_plot = np.ma.masked_where(land_sea_glaciers_mask, to_plot)




        #b.pcolormesh(x, y, to_plot)
        #img = b.pcolormesh(x, y, to_plot, vmin = -100, vmax = 100)

        img = b.pcolormesh(x, y, to_plot, cmap=cmap, vmax=clevels[-1], vmin=clevels[0], norm=bn)
        if ax is None:
            cb = b.colorbar(img, extend="both", ticks=clevels)
            #cb = plt.colorbar(ticks = levels)
            cb.ax.set_title(r"${\rm m^3/s}$")

        b.drawcoastlines(linewidth=0.1)
        b.drawmapboundary(fill_color="0.75")
        #b.drawmeridians(meridians=np.arange(-180, 180, 60))
        #b.drawparallels(circles=np.arange(0, 90, 30))
        b.readshapefile("data/shp/wri_basins2/wribasin", "basin", color="k", linewidth=1)


        if ax is None:
            plt.tight_layout()
            imfile = "offline_rout_{0}_mean_abschange_map_{1}_({2})-({3}).jpeg".format(season, label, p_future, p_current)
            plt.savefig(imfile, dpi=400)

        return img  # NOTE: returns inside the loop, so only the first path is processed
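
The cache block above follows a compute-once pattern: expensive seasonal statistics are pickled to disk the first time and reloaded afterwards. A minimal sketch of the same pattern (note that pickle files must be opened in binary mode):

import os
import pickle

def cached(cache_file, compute):
    # Return compute()'s result, pickling it to cache_file on first use.
    if os.path.isfile(cache_file):
        with open(cache_file, "rb") as f:
            return pickle.load(f)
    result = compute()
    with open(cache_file, "wb") as f:
        pickle.dump(result, f)
    return result

# hypothetical usage:
# data_map = cached(cache_file, lambda: compute_seasonal_change_stats(ds))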
Example #20
def plotExpBox_Main(inputFiles, headers, valcols, outputFile, sep, startRow,
		showIndPoints, mark, markMean, showMean, notch, whisker, outliers,
		plotPvalueCluster, outputClusterPrefix, methodCluster,
		xlegendrotation, xlabe, ylabe, figsz, titl, showSampleSizes,
		trimToMinSize, relabels, logb, plotHistogramToFile,
		plotMedianForGroups, botta):

	#if plotPvalueCluster:
		#if pvalue cluster is needed:
	#	from Bio.Cluster.cluster import *
	#	from Bio.Cluster import *
		#endif



	#the real deal!
	plotData=[]	
	xtickLabels=[]
	
	
	minSize=-1

	for inputFile,header,cols in zip(inputFiles,headers,valcols):
		fin=generic_istream(inputFile)
		
		startIdx=len(plotData)

		for col in cols:
			plotData.append([])
			xtickLabels.append(header[col])

		colIndices=range(startIdx,startIdx+len(cols))

		lino=0
		for lin in fin:
			lino+=1
			if lino<startRow:
				continue		
			fields=lin.rstrip("\r\n").split(sep)
		
			for idx,col in zip(colIndices,cols):
				try:
					
					value=float(fields[col])
					if logb!=0:
						value=log(value)/logb	
						if value<-100000:
							raise ValueError						
					plotData[idx].append(value)
					
				except:
					pass		
		fin.close()
	
		
		if minSize==-1:
			minSize=len(plotData[idx]) #or startIDX?
		else:
			minSize=min([minSize,len(plotData[idx])])
		

	if trimToMinSize:
		print >> stderr,"trimming to min size =",minSize
		trimData(plotData,minSize)

	if len(relabels)>0:
		#if len(relabels)!=len(xtickLabels):
		#	print >> stderr,"relabels doesn't have the same length as original label vectors",xtickLabels,"=>",relabels
		#	exit()

		for i,relabel in zip(range(0,len(relabels)),relabels):
			xtickLabels[i]=relabel
		
	
	for i in range(0,len(plotMedianForGroups)):
		plotMedianForGroups[i]=getCol0ListFromCol1ListStringAdv(xtickLabels,plotMedianForGroups[i])
			
	
	#drawing medians:
	medianToDraw=[]
	for mediangrouper in plotMedianForGroups:
		curD=[]		
		for c in mediangrouper:
			curD.extend(plotData[c])
		medianToDraw.append(median(curD))


	for c in range(len(plotData)-1,-1,-1):
		if len(plotData[c])==0:
			print >> stderr,xtickLabels[c],"discarded"
			del plotData[c]
			del xtickLabels[c]


	print >> stdout,"student t-test (1 sample; mean=0)"
	print >> stdout,"sample","mean","p-val"


	for x in range(0,len(plotData)):
		#print >> stderr, len(plotData[x])
		try:
			print >> stdout, xtickLabels[x],mean(plotData[x]),ttest_1samp(plotData[x],0)[1]
		except:
			print >> stdout, xtickLabels[x],"NA","NA"

	pvalueM=[]
	
	print >> stdout,""
	
	print >> stdout,"student t-test (2 samples)"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	
	print >> stdout,""

	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
			else:
				try:
					pvalue=ttest_ind(plotData[x],plotData[y])[1]
				except:
					pvalue=1.0

				print >> stdout, str(pvalue),
				pvalueRow.append(pvalue)
		print >> stdout,"";	

	
	print >> stdout,""

	


	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster)

	pvalueM=[]

	print >> stdout,"welch t-test"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	
	print >> stdout,""
	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
					
			else:
				pvalue=welchs_approximate_ttest_arr(plotData[x],plotData[y])[3]
				print >> stdout, str(pvalue),
				pvalueRow.append(pvalue)
		print >> stdout,"";

	

	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster)

	
	print >> stdout,""
	print >> stdout,"non-parametric (Mann-Whitney U)" #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	

	pvalueM=[]

	print >> stdout,""
	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
			else:
				#if max(len(plotData[x]),len(plotData[y]))<=20:
				try:
					pvalue=mannwhitneyu(plotData[x],plotData[y])[1]*2				
				except:
					pvalue=1.0
				print >> stdout,pvalue, #mann-whiteney need to mul by 2 (one tail to two tail)
				pvalueRow.append(pvalue)
				#else:
				#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
		print >> stdout,"";	
	
	

	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster)
	
	figure(figsize=figsz)
	subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)
	
	if len(titl)==0:
		titl=outputFile


	plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes)
	
	#ylim([0,200])
	for m in medianToDraw:
		axhline(y=m,linestyle=':',color='gray')

	savefig(outputFile,bbox_inches="tight")

	if len(plotHistogramToFile)>0:
		drawHistogram(plotHistogramToFile,plotData,xtickLabels)
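
Each of the pairwise test blocks above fills only the upper triangle and mirrors the lower one from already-computed entries. The same construction, factored out as a sketch (keeping the conventions above: 1.0 on the diagonal and 1.0 when a test fails):

import numpy as np
from scipy.stats import ttest_ind

def pairwise_pvalue_matrix(plotData, test=ttest_ind):
    # Symmetric matrix of two-sample test p-values between all groups.
    n = len(plotData)
    M = np.ones((n, n))
    for x in range(n):
        for y in range(x + 1, n):
            try:
                p = test(plotData[x], plotData[y])[1]
            except Exception:
                p = 1.0  # same fallback as the loops above
            M[x, y] = M[y, x] = p
    return M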
Example #21
def bootstrapping():
    model = build_nn_model()
    #model.load_weights("../result/model/20200118-085651-496.h5")   sample
    model.load_weights(
        "E:/experiments/MNIST_FL_1/model/20200317-171952-491-0.9456.h5")
    print("==> bootstrapping start")

    n_bootstraps = 10000
    rng_seed = 3033  # control reproducibility

    bootstrapped_auroc = []
    bootstrapped_auprc = []
    bootstrapped_sen = []
    bootstrapped_spe = []
    bootstrapped_bac = []
    bootstrapped_f1 = []
    bootstrapped_pre = []
    bootstrapped_NLR = []
    bootstrapped_PLR = []
    final = {}

    result = model.predict(test_images)
    auroc = metrics.roc_auc_score(test_labels, result, multi_class='ovr')
    print("auroc ovr : ", auroc)
    auroc_ovo = metrics.roc_auc_score(test_labels, result, multi_class='ovo')
    print("auroc ovo : ", auroc_ovo)

    result = np.argmax(result, axis=1)
    # metrics.auc expects curve coordinates (x, y), not labels and predictions,
    # so AUPRC is computed from the precision-recall curve below instead.
    # (From here on the code effectively assumes a binary task.)
    '''
    fpr = dict()
    tpr = dict()

    for i in range(10):
        fpr[i], tpr[i], _ = roc_curve(test_labels[:, i], result[:, i])

    print(fpr, tpr)
    
    fpr, tpr, thresholds = metrics.roc_curve(test_labels, result)
    #roc_auc = metrics.auc(fpr, tpr)
    '''
    (precisions, recalls,
     thresholds) = metrics.precision_recall_curve(test_labels, result)
    auprc = metrics.auc(recalls, precisions)
    print("auprc : ", auprc)

    minpse = np.max([min(x, y) for (x, y) in zip(precisions, recalls)])

    # result was already reduced with argmax above, so it is not argmax'ed
    # again here (a second argmax on the 1-D array would fail). The ROC curve
    # supplies the tpr/fpr/threshold columns used in the table below.
    fpr, tpr, thresholds = metrics.roc_curve(test_labels, result)

    cf = metrics.confusion_matrix(test_labels, result)
    print(cf)
    cf = cf.astype(np.float32)

    acc = (cf[0][0] + cf[1][1]) / np.sum(cf)
    prec0 = cf[0][0] / (cf[0][0] + cf[1][0])
    prec1 = cf[1][1] / (cf[1][1] + cf[0][1])
    rec0 = cf[0][0] / (cf[0][0] + cf[0][1])
    rec1 = cf[1][1] / (cf[1][1] + cf[1][0])

    t = pd.concat([
        pd.DataFrame(thresholds),
        pd.DataFrame(tpr),
        pd.DataFrame(1 - fpr),
        pd.DataFrame(((1 - fpr + tpr) / 2))
    ],
                  axis=1)
    t.columns = ['threshold', 'sensitivity', 'specificity', 'bac']
    t_ = t.iloc[np.min(np.where(t['bac'] == max(t['bac']))), :]
    y_pred_ = (result >= t_['threshold']).astype(bool)

    cm_ = metrics.confusion_matrix(test_labels, result)
    tp = cm_[1, 1]
    fn = cm_[1, 0]
    fp = cm_[0, 1]
    tn = cm_[0, 0]

    bac = t_['bac']  # balanced accuracy
    sensitivity = t_['sensitivity']  # sensitivity
    specificity = t_['specificity']  # specificity
    precision = tp / (tp + fp)  # precision
    f1 = 2 * (
        (sensitivity * precision) / (sensitivity + precision))  # f1 score
    plr = sensitivity / (1 - specificity)  # PLR
    nlr = (1 - sensitivity) / specificity  # NLR

    rng = np.random.RandomState(rng_seed)

    y_true = np.array(test_labels)
    for j in range(n_bootstraps):
        indices = rng.randint(0, len(result), len(result))  # random_integers is deprecated

        if len(np.unique(y_true[indices])) < 2:
            continue

        auroc_ = metrics.roc_auc_score(y_true[indices], result[indices])
        precision_, recall_, thresholds_ = metrics.precision_recall_curve(
            y_true[indices], result[indices])
        auprc_ = metrics.auc(recall_, precision_)
        CM = metrics.confusion_matrix(y_true[indices], result[indices])
        TP = CM[1, 1]
        FN = CM[1, 0]
        FP = CM[0, 1]
        TN = CM[0, 0]

        TPV = TP / (TP + FN)  # sensitivity
        TNV = TN / (TN + FP)  # specificity
        PPV = TP / (TP + FP)  # precision
        BAAC = (TPV + TNV) / 2  # balanced accuracy
        F1 = 2 * ((PPV * TPV) / (PPV + TPV))  # f1 score
        PLR = TPV / (1 - TNV)  # LR+
        NLR = (1 - TPV) / TNV  # LR-

        bootstrapped_auroc.append(auroc_)  # AUROC
        bootstrapped_auprc.append(auprc_)  # AUPRC
        bootstrapped_sen.append(TPV)  # Sensitivity
        bootstrapped_spe.append(TNV)  # Specificity
        bootstrapped_bac.append(BAAC)  # Balanced Accuracy
        bootstrapped_f1.append(F1)  # F1 score
        bootstrapped_pre.append(PPV)  # Precision
        bootstrapped_NLR.append(NLR)  # Negative Likelihood Ratio
        bootstrapped_PLR.append(PLR)  # positive Likelihood Ratio

    sorted_auroc = np.array(bootstrapped_auroc)
    sorted_auroc.sort()
    sorted_auprc = np.array(bootstrapped_auprc)
    sorted_auprc.sort()
    sorted_sen = np.array(bootstrapped_sen)
    sorted_sen.sort()
    sorted_spe = np.array(bootstrapped_spe)
    sorted_spe.sort()
    sorted_bac = np.array(bootstrapped_bac)
    sorted_bac.sort()
    sorted_f1 = np.array(bootstrapped_f1)
    sorted_f1.sort()
    sorted_pre = np.array(bootstrapped_pre)
    sorted_pre.sort()
    sorted_NLR = np.array(bootstrapped_NLR)
    sorted_NLR.sort()
    sorted_PLR = np.array(bootstrapped_PLR)
    sorted_PLR.sort()

    auroc_lower = round(sorted_auroc[int(0.025 * len(sorted_auroc))], 4)
    auroc_upper = round(sorted_auroc[int(0.975 * len(sorted_auroc))], 4)
    auprc_lower = round(sorted_auprc[int(0.025 * len(sorted_auprc))], 4)
    auprc_upper = round(sorted_auprc[int(0.975 * len(sorted_auprc))], 4)
    sen_lower = round(sorted_sen[int(0.025 * len(sorted_sen))], 4)
    sen_upper = round(sorted_sen[int(0.975 * len(sorted_sen))], 4)
    spe_lower = round(sorted_spe[int(0.025 * len(sorted_spe))], 4)
    spe_upper = round(sorted_spe[int(0.975 * len(sorted_spe))], 4)
    bac_lower = round(sorted_bac[int(0.025 * len(sorted_bac))], 4)
    bac_upper = round(sorted_bac[int(0.975 * len(sorted_bac))], 4)
    f1_lower = round(sorted_f1[int(0.025 * len(sorted_f1))], 4)
    f1_upper = round(sorted_f1[int(0.975 * len(sorted_f1))], 4)
    pre_lower = round(sorted_pre[int(0.025 * len(sorted_pre))], 4)
    pre_upper = round(sorted_pre[int(0.975 * len(sorted_pre))], 4)
    NLR_lower = round(sorted_NLR[int(0.025 * len(sorted_NLR))], 4)
    NLR_upper = round(sorted_NLR[int(0.975 * len(sorted_NLR))], 4)
    PLR_lower = round(sorted_PLR[int(0.025 * len(sorted_PLR))], 4)
    PLR_upper = round(sorted_PLR[int(0.975 * len(sorted_PLR))], 4)

    auroc_true_ci = str(round(
        auroc, 4)) + " (" + str(auroc_lower) + ", " + str(auroc_upper) + ")"
    auprc_true_ci = str(round(
        auprc, 4)) + " (" + str(auprc_lower) + ", " + str(auprc_upper) + ")"
    sen_true_ci = str(round(
        sensitivity, 4)) + " (" + str(sen_lower) + ", " + str(sen_upper) + ")"
    spe_true_ci = str(round(
        specificity, 4)) + " (" + str(spe_lower) + ", " + str(spe_upper) + ")"
    bac_true_ci = str(round(
        bac, 4)) + " (" + str(bac_lower) + ", " + str(bac_upper) + ")"
    f1_true_ci = str(round(
        f1, 4)) + " (" + str(f1_lower) + ", " + str(f1_upper) + ")"
    pre_true_ci = str(round(
        precision, 4)) + " (" + str(pre_lower) + ", " + str(pre_upper) + ")"
    NLR_true_ci = str(round(
        nlr, 4)) + " (" + str(NLR_lower) + ", " + str(NLR_upper) + ")"
    PLR_true_ci = str(round(
        plr, 4)) + " (" + str(PLR_lower) + ", " + str(PLR_upper) + ")"
    #
    col_n = [
        'thresholds', 'sensitivity', 'specificity', 'precision', 'bacc', 'f1',
        'PLR', 'NLR', 'AUROC', 'AUPRC'
    ]

    final = {
        "thresholds": round(t_['threshold'], 4),
        "sensitivity": sen_true_ci,
        "specificity": spe_true_ci,
        "precision": pre_true_ci,
        "bacc": bac_true_ci,
        "f1": f1_true_ci,
        "PLR": PLR_true_ci,
        "NLR": NLR_true_ci,
        "AUROC": auroc_true_ci,
        "AUPRC": auprc_true_ci
    }
    final = pd.DataFrame(final, index=[0])
    #final1 = pd.DataFrame(final)
    final = final.reindex(columns=col_n)

    total_item = {
        "thresholds": round(t_['threshold'], 4),
        "sensitivity": sorted_sen,
        "specificity": sorted_spe,
        "precision": sorted_pre,
        "bacc": sorted_bac,
        "f1": sorted_f1,
        "PLR": sorted_PLR,
        "NLR": sorted_NLR,
        "AUROC": sorted_auroc,
        "AUPRC": sorted_auprc
    }
    total_pd = pd.DataFrame.from_dict(total_item, orient='columns')

    print(total_pd)

    final2 = pd.concat([final, total_pd])  # DataFrame.append was removed in recent pandas
    final2.to_csv("fl_1_bootstrapping.csv", mode="w")

    print("==> bootstrapping end")

    t_test_result = stats.ttest_1samp(sorted_auroc, 0.999)

    print("t-test : ", t_test_result)
Example #22
def markov_chain(data_set, no_iteration=10, no_of_simulation=10000, alpha=5):

    import_dataset_v1 = data_set.copy()

    import_dataset_v1 = (import_dataset_v1.reindex(
        import_dataset_v1.index.repeat(
            import_dataset_v1.conversions))).reset_index()
    # print(import_dataset_v1, '----2')

    import_dataset_v1['conversions'] = 1
    # print(import_dataset_v1['conversions'], '---3')

    import_dataset_v1 = import_dataset_v1[['path', 'conversions']]
    # print(import_dataset_v1 , '----4')

    import_dataset = (import_dataset_v1.groupby(['path']).sum()).reset_index()
    # print(import_dataset, '----5')

    import_dataset['probability'] = import_dataset[
        'conversions'] / import_dataset['conversions'].sum()
    # print(import_dataset["probability"], '----6')

    final = pd.DataFrame()

    for k in range(0, no_iteration):
        start = time.time()
        import_data = pd.DataFrame({
            'path':
            np.random.choice(import_dataset['path'],
                             size=import_dataset['conversions'].sum(),
                             p=import_dataset['probability'],
                             replace=True)
        })
        import_data['conversions'] = 1
        # print(import_data, '----7')
        tr_matrix = transition_matrix_func(import_data)
        channel_only = list(
            filter(lambda k0: k0 not in ['start', 'convert'],
                   tr_matrix.columns))

        ga_ex = pd.DataFrame()
        print(ga_ex)
        tr_mat = tr_matrix.copy()
        p = []

        i = 0
        while i < no_of_simulation:
            p.append(unique(simulation(tr_mat, 1000)))
            i = i + 1

        path = list(itertools.chain.from_iterable(p))
        counter = collections.Counter(path)

        df = pd.DataFrame({
            'path': list(counter.keys()),
            'count': list(counter.values())
        })
        df = df[['path', 'count']]
        ga_ex = pd.concat([ga_ex, df], ignore_index=True)

        df1 = (pd.DataFrame(ga_ex.groupby(['path'])[['count'
                                                     ]].sum())).reset_index()

        df1['removal_effects'] = df1['count'] / len(path)
        #df1['removal_effects']=df1['count']/sum(df1['count'][df1['path']=='convert'])
        df1 = df1[df1['path'].isin(channel_only)]
        df1['ass_conversion'] = df1['removal_effects'] / sum(
            df1['removal_effects'])

        df1['ass_conversion'] = df1['ass_conversion'] * sum(
            import_dataset['conversions'])

        final = pd.concat([final, df1], ignore_index=True)
        end = time.time()
        t1 = (end - start)
        print(t1)
    '''
    H0: u=0
    H1: u>0
    '''
    unique_channel = unique(final['path'])
    #final=(pd.DataFrame(final.groupby(['path'])[['ass_conversion']].mean())).reset_index()
    final_df = pd.DataFrame()

    for i in range(0, len(unique_channel)):

        x = (
            final['ass_conversion'][final['path'] == unique_channel[i]]).values
        final_df.loc[i, 0] = unique_channel[i]
        final_df.loc[i, 1] = x.mean()

        v = stats.ttest_1samp(x, 0)
        final_df.loc[i, 2] = v[1] / 2

        if v[1] / 2 <= alpha / 100:
            final_df.loc[i, 3] = 'significant at ' + str(100 - alpha) + '% confidence'
        else:
            final_df.loc[i, 3] = 'not significant at ' + str(100 - alpha) + '% confidence'

        final_df.loc[i, 4] = len(x)
        final_df.loc[i, 5] = statistics.stdev(x)
        final_df.loc[i, 6] = v[0]

    final_df.columns = [
        'channel', 'ass_conversion', 'p_value', 'confidence_status',
        'frequency', 'standard_deviation', 't_statistics'
    ]
    final_df['ass_conversion'] = sum(
        import_dataset['conversions']) * final_df['ass_conversion'] / sum(
            final_df['ass_conversion'])

    return final_df, final
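
The docstring states a one-sided test (H0: u=0 vs H1: u>0), and the code halves scipy's two-sided p-value accordingly. That shortcut is only valid when the t statistic is positive; a sign-aware sketch:

from scipy import stats

def one_sided_ttest_1samp(x):
    # H0: mean = 0 vs H1: mean > 0. scipy's ttest_1samp is two-sided,
    # so halve its p-value, but only when t points in H1's direction.
    t, p_two_sided = stats.ttest_1samp(x, 0)
    p_one_sided = p_two_sided / 2 if t > 0 else 1 - p_two_sided / 2
    return t, p_one_sided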
Example #23
def calculate_gene_expression_similarity(reduced_stat_map_data, mask="full"):
    store_file = "/ahba_data/store_max1_reduced.h5"
    subcortex_mask = "/ahba_data/subcortex_mask.npy"

    results_dfs = []
    with pd.HDFStore(store_file, 'r') as store:
        for donor_id in store.keys():
            print "Loading expression data (%s)" % donor_id
            expression_data = store.get(donor_id.replace(".", "_"))

            print "Getting statmap values (%s)" % donor_id
            nifti_values = reduced_stat_map_data[expression_data.columns]

            print "Removing missing values (%s)" % donor_id
            na_mask = np.isnan(nifti_values)
            if mask == "subcortex":
                na_mask = np.logical_or(
                    na_mask,
                    np.isnan(np.load(subcortex_mask)[expression_data.columns]))
            elif mask == "cortex":
                na_mask = np.logical_or(
                    na_mask,
                    np.logical_not(
                        np.isnan(
                            np.load(subcortex_mask)[expression_data.columns])))
            else:
                assert mask == "full"

            nifti_values = np.array(nifti_values)[np.logical_not(na_mask)]
            expression_data.drop(expression_data.columns[na_mask],
                                 axis=1,
                                 inplace=True)

            print "z scoring (%s)" % donor_id
            expression_data = pd.DataFrame(zscore(expression_data, axis=1),
                                           columns=expression_data.columns,
                                           index=expression_data.index)
            nifti_values = zscore(nifti_values)

            print "Calculating linear regressions (%s)" % donor_id
            regression_results = np.linalg.lstsq(
                np.c_[nifti_values, np.ones_like(nifti_values)],
                expression_data.T)
            results_df = pd.DataFrame({"slope": regression_results[0][0]},
                                      index=expression_data.index)

            results_df.columns = pd.MultiIndex.from_tuples(
                [(
                    donor_id[1:],
                    c,
                ) for c in results_df.columns],
                names=['donor_id', 'parameter'])

            results_dfs.append(results_df)

        print "Concatenating results"
        results_df = pd.concat(results_dfs, axis=1)
        del results_dfs

    t, p = ttest_1samp(results_df, 0.0, axis=1)
    group_results_df = pd.DataFrame({
        "t": t,
        "p": p
    },
                                    columns=['t', 'p'],
                                    index=expression_data.index)
    _, group_results_df["p (FDR corrected)"], _, _ = multipletests(
        group_results_df.p, method='fdr_bh')
    group_results_df["variance explained (mean)"] = (
        results_df.xs('slope', axis=1, level=1)**2 * 100).mean(axis=1)
    group_results_df["variance explained (std)"] = (
        results_df.xs('slope', axis=1, level=1)**2 * 100).std(axis=1)
    del results_df
    probe_info = pd.read_csv("/ahba_data/probe_info_max1.csv",
                             index_col=0).drop(['chromosome', "gene_id"],
                                               axis=1)
    group_results_df = group_results_df.join(probe_info)
    group_results_df = group_results_df[[
        "gene_symbol", "entrez_id.1", "gene_name", "t", "p",
        "p (FDR corrected)", "variance explained (mean)",
        "variance explained (std)"
    ]]

    return group_results_df
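
The group-level step above is a mass one-sample t-test (one test per probe, across donors) followed by Benjamini-Hochberg FDR correction. The core of it, as a standalone sketch (group_ttest_fdr and its argument are illustrative names, not part of the original):

import pandas as pd
from scipy.stats import ttest_1samp
from statsmodels.stats.multitest import multipletests

def group_ttest_fdr(slopes):
    # slopes: DataFrame with one row per probe/gene, one column per donor.
    t, p = ttest_1samp(slopes, 0.0, axis=1)
    _, p_fdr, _, _ = multipletests(p, method='fdr_bh')
    return pd.DataFrame({'t': t, 'p': p, 'p (FDR corrected)': p_fdr},
                        index=slopes.index)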
def plotExpBox_Main(inputFiles, headers, valcols, outputFile, sep, startRow,
		showIndPoints, mark, markMean, showMean, notch, whisker, outliers,
		plotPvalueCluster, outputClusterPrefix, methodCluster,
		xlegendrotation, xlabe, ylabe, figsz, titl, showSampleSizes,
		trimToMinSize, relabels, logb, plotHistogramToFile,
		plotMedianForGroups, botta, showViolin, showBox, firstColAnnot,
		plotTrend, showLegend, makePzfxFile, makeBinMatrix,
		writeDataSummaryStat, summaryStatRange, minuslog10pvalue,
		minNDataToKeep, vfacecolor, valpha, outXYZPvalues, dividePlots):

	#if plotPvalueCluster:
		#if pvalue cluster is needed:
	#	from Bio.Cluster.cluster import *
	#	from Bio.Cluster import *
		#endif


	
	#the real deal!
	plotData=[]	
	xtickLabels=[]
	
	trendData={}
	annot={}
	
	minSize=-1

	for inputFile,header,cols in zip(inputFiles,headers,valcols):
		fin=generic_istream(inputFile)
		
		startIdx=len(plotData)
		
		if firstColAnnot:
			colAnnot=cols[0]
			cols=cols[1:]
			annotThisFile=[]
			annot[startIdx]=annotThisFile
		else:
			colAnnot=-1
			annotThisFile=None
			
		for col in cols:
			plotData.append([])
			xtickLabels.append(header[col])

		colIndices=range(startIdx,startIdx+len(cols))
		
		if plotTrend:
			#print >> stderr,"plotTrend"
			trendDataThisFile=[]
			trendData[startIdx]=trendDataThisFile
		else:
			trendDataThisFile=None
			
			
		lino=0
		for lin in fin:
			lino+=1
			if lino<startRow:
				continue		
			fields=lin.rstrip("\r\n").split(sep)
			
			if plotTrend:
				#print >> stderr,"a"
				trendDataThisLine=[]
			else:
				trendDataThisLine=None
			
			allDataOKThisLine=True
			
			if colAnnot>=0:
				annotThisFile.append(fields[colAnnot])
			
			for idx,col in zip(colIndices,cols):
				try:
					value=float(fields[col])
					if logb!=0:
						if value==0.0:
							raise ValueError
						value=log(value)/logb							
					plotData[idx].append(value)
					
					if plotTrend:
						trendDataThisLine.append(value)
						#print >> stderr,"value:",value
					
				except:
					allDataOKThisLine=False	
				
			if plotTrend:
				if allDataOKThisLine:
					trendDataThisFile.append(trendDataThisLine)
				else:
					trendDataThisFile.append(None)
			
		fin.close()
	
		
		if minSize==-1:
			minSize=len(plotData[idx]) #or startIDX?
		else:
			minSize=min([minSize,len(plotData[idx])])
		

	if trimToMinSize:
		print >> stderr,"trimming to min size =",minSize
		trimData(plotData,minSize)

	if len(relabels)>0:
		#if len(relabels)!=len(xtickLabels):
		#	print >> stderr,"relabels doesn't have the same length as original label vectors",xtickLabels,"=>",relabels
		#	exit()
		print >> stderr,xtickLabels
		print >> stderr,relabels
		for i,relabel in zip(range(0,len(relabels)),relabels):
			xtickLabels[i]=relabel
		
	
	for i in range(0,len(plotMedianForGroups)):
		plotMedianForGroups[i]=getCol0ListFromCol1ListStringAdv(xtickLabels,plotMedianForGroups[i])
			
	
	#drawing medians:
	medianToDraw=[]
	for mediangrouper in plotMedianForGroups:
		curD=[]		
		for c in mediangrouper:
			curD.extend(plotData[c])
		medianToDraw.append(median(curD))


	for c in range(len(plotData)-1,-1,-1):
		if len(plotData[c])<minNDataToKeep:
			print >> stderr,xtickLabels[c],"discarded because has only",len(plotData[c]),"data points <",minNDataToKeep
			del plotData[c]
			del xtickLabels[c]

	if not skipStat:  # skipStat is expected to be defined at module level
		print >> stdout,"student t-test (1 sample; mean=0)"
		print >> stdout,"sample","mean","p-val","median"
	
		if writeDataSummaryStat:
			fDSS=open(writeDataSummaryStat,"w")
			print >> fDSS,"sample\tmean\tvar\tsd\tmin\tmax\tN\tNInRange["+str(summaryStatRange[0])+","+str(summaryStatRange[1])+"]\t%NInRange\tNbelowRange\t%Nbelow\tNAboveRange\t%NAbove"
			
		for x in range(0,len(plotData)):
			#print >> stderr, len(plotData[x])
			try:
				print >> stdout, xtickLabels[x],mean(plotData[x]),ttest_1samp(plotData[x],0)[1],median(plotData[x])
			except:
				print >> stdout, xtickLabels[x],mean(plotData[x]),"NA",median(plotData[x])
			
			if writeDataSummaryStat:
				sumData,N,NIN,NBelow,NAbove=filterDataInRangeInclusive(plotData[x],summaryStatRange[0],summaryStatRange[1])
				
				if NIN>1:
					#print >> stderr,"sumData=",sumData
					#print >> stderr,mean
					mea=mean2(sumData)
					DDOF=1
					sd=std(sumData,ddof=DDOF)
					var=sd*sd
					mi=min(sumData)
					ma=max(sumData)
				else:
					mea="NA"
					sd="NA"
					var="NA"
					mi="NA"
					ma="NA"
				
			
					
				print >> fDSS,xtickLabels[x]+"\t"+str(mea)+"\t"+str(var)+"\t"+str(sd)+"\t"+str(mi)+"\t"+str(ma)+"\t"+str(N)+"\t"+str(NIN)+"\t"+str(float(NIN)*100/N)+"\t"+str(NBelow)+"\t"+str(float(NBelow)*100/N)+"\t"+str(NAbove)+"\t"+str(float(NAbove)*100/N)
			
	
		pvalueM=[]
		
		if writeDataSummaryStat:
			fDSS.close()
		
		print >> stdout,""
		
		print >> stdout,"student t-test (2 samples)"
		print >> stdout,"p-val",
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
		print >> stdout,""
	
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					try:
						pvalue=ttest_ind(plotData[x],plotData[y])[1]
					except:
						pvalue=1.0
					
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
					
					print >> stdout, str(pvalue),
					pvalueRow.append(pvalue)
			print >> stdout,"";	
	
		
		print >> stdout,""
	
		
	
	
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster)
	
	
			
		pvalueM=[]
	
		print >> stdout,"welch t-test"
		print >> stdout,"p-val",
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
						
				else:
					try:
						pvalue=welchs_approximate_ttest_arr(plotData[x],plotData[y])[3]
					except:
						pvalue=1.0
	
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
					
					print >> stdout, str(pvalue),
					pvalueRow.append(pvalue)
			print >> stdout,"";
	
		if outXYZPvalues:
			writeXYZPvalues(outXYZPvalues+"_Welch.xyz",xtickLabels,pvalueM)
	
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster)
	
		
		print >> stdout,""
		print >> stdout,"non-parametric (Mann-Whitney U)" #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)"
		print >> stdout,"p-val",
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=mannwhitneyu(plotData[x],plotData[y])[1]*2				
					except:
						pvalue=1.0
	
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
					print >> stdout,pvalue, #mann-whiteney need to mul by 2 (one tail to two tail)
					pvalueRow.append(pvalue)
					#else:
					#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
			print >> stdout,"";	
	
		if outXYZPvalues:
			writeXYZPvalues(outXYZPvalues+"_U.xyz",xtickLabels,pvalueM)
		
	
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster)
		
		#####now the variance tests
		
		print >> stdout,""
		print >> stdout,"Ansari-Bradley Two-sample Test for difference in scale parameters " 
		print >> stdout,"p-val",
		
		
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=ansari(plotData[x],plotData[y])[1]		
					except:
						pvalue="NA"
	
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
						#pvalue=1.0
					print >> stdout,pvalue,
					pvalueRow.append(pvalue)
					#else:
					#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
			print >> stdout,"";	
		
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_Ansari_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_Ansari",xtickLabels,pvalueM,methodCluster)	
		
		
		#####
	
		#####now the variance tests
		
		print >> stdout,""
		print >> stdout,"Fligner's Two-sample Test for equal variance (non-parametrics)" 
		print >> stdout,"p-val",
		
		
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=fligner(plotData[x],plotData[y])[1]		
					except:
						pvalue="NA"
						#pvalue=1.0
						
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
					print >> stdout,pvalue,
					pvalueRow.append(pvalue)
					#else:
					#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
			print >> stdout,"";	
		
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_fligner_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_fligner",xtickLabels,pvalueM,methodCluster)	
		
		
		#####
	
		#####now the variance tests
		
		print >> stdout,""
		print >> stdout,"Levene's Two-sample Test for equal variance" 
		print >> stdout,"p-val",
		
		
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=levene(plotData[x],plotData[y])[1]		
					except:
						pvalue="NA"
						#pvalue=1.0
						
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
					print >> stdout,pvalue,
					pvalueRow.append(pvalue)
					#else:
					#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
			print >> stdout,"";	
		
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_levene_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_levene",xtickLabels,pvalueM,methodCluster)	
		
		
		#####
	
		#####now the variance tests
		
		print >> stdout,""
		print >> stdout,"Bartlett's Two-sample Test for equal variance (for normal distributions)" 
		print >> stdout,"p-val",
		
		
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=bartlett(plotData[x],plotData[y])[1]		
					except:
						pvalue="NA"
						#pvalue=1.0
	
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
					print >> stdout,pvalue,
					pvalueRow.append(pvalue)
					#else:
					#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
			print >> stdout,"";	
		
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_bartlett_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_bartlett",xtickLabels,pvalueM,methodCluster)	
		
		
		#####

	figure(figsize=figsz)
	subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)
	
	if len(titl)==0:
		titl=outputFile


	plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes,showViolin,showBox,annot,trendData,showLegend,makePzfxFile,makeBinMatrix,dividePlots)
	
	#ylim([0,200])
	for m in medianToDraw:
		axhline(y=m,linestyle=':',color='gray')

	savefig(outputFile,bbox_inches="tight")

	if len(plotHistogramToFile)>0:
		drawHistogram(plotHistogramToFile,plotData,xtickLabels)
		drawDensigram(plotHistogramToFile+".density.png",plotData,xtickLabels)
Example #25
def main(months=None,
         season="DJF",
         ax=None,
         clevels=None,
         labels=None,
         paths=None):
    if not months:
        months = [12, 1, 2]

    path_to_glaciers_land_sea_mask = "/b2_fs2/huziy/geophy_from_others/land_sea_glacier_mask_phy"

    land_sea_glaciers_mask = get_land_sea_glaciers_mask_from_geophysics_file(
        path=path_to_glaciers_land_sea_mask)
    p_current = "{0}-{1}".format(start_year_current, end_year_current)
    p_future = "{0}-{1}".format(start_year_future, end_year_future)

    lons2d = None
    lats2d = None

    x_index = None
    y_index = None
    mean_data = None

    b = None

    for the_path, label in zip(paths, labels):
        ds = Dataset(the_path)
        if lons2d is None:
            lons2d = ds.variables["longitude"][:]
            lats2d = ds.variables["latitude"][:]

            #b = get_arctic_basemap(lons2d, lats2d)
            b = get_arctic_basemap_nps(round=True)

            x_index = ds.variables["x_index"][:]
            y_index = ds.variables["y_index"][:]

        cache_file = "_".join([
            str(m) for m in months
        ]) + "_{0}-{1}_{2}-{3}_{4}_mean_change_cache.bin".format(
            start_year_current, end_year_current, start_year_future,
            end_year_future, label)

        #os.remove(cache_file)
        if not os.path.isfile(cache_file):
            time_str = ds.variables["time"][:]
            times = [
                datetime.strptime("".join(t_s), TIME_FORMAT)
                for t_s in time_str
            ]

            data = ds.variables["water_discharge_accumulated"][:]
            df = pandas.DataFrame(data=data, index=times)
            df["year"] = df.index.map(lambda d: d.year)
            df["month"] = df.index.map(lambda d: d.month)

            print(df.shape, df.columns)

            data_current = df.loc[(df.year >= start_year_current) &
                                  (df.year <= end_year_current)
                                  & df.month.isin(months), :]
            print(data_current.columns)
            data_current = data_current.drop(["year", "month"], axis=1)
            seasonal_means_current = data_current.groupby(
                by=lambda d: d.year).mean(
                )  #calculate mean for the season for each year

            data_future = df.loc[(df.year >= start_year_future) &
                                 (df.year <= end_year_future)
                                 & df.month.isin(months), :]
            data_future = data_future.drop(["year", "month"], axis=1)
            seasonal_means_future = data_future.groupby(
                by=lambda d: d.year).mean()

            change = seasonal_means_future.values - seasonal_means_current.values

            mean_current = seasonal_means_current.values.mean(axis=0)
            mean_future = seasonal_means_future.values.mean(axis=0)

            ##axis0 - time, axis1 -  cell index

            mean_change = change.mean(axis=0)

            #print change[:,mean_change > 200000], seasonal_means_future.values[:,mean_change > 200000], seasonal_means_current.values[:,mean_change > 200000]

            t, pvalue = stats.ttest_1samp(change, 0, axis=0)

            data_map = {
                "current-mean": mean_current,
                "future-mean": mean_future,
                "change": mean_change,
                "p-value": pvalue
            }

            pickle.dump(data_map, open(cache_file, mode="w"))
        else:
            data_map = pickle.load(open(cache_file, "rb"))
            mean_change = data_map["change"]
            pvalue = data_map["p-value"]
            mean_current = data_map["current-mean"]

        if ax is None:
            plt.figure()

        to_plot = np.ma.masked_all_like(lons2d)

        #mask nonsignificant changes

        #change_arr_significant = np.ma.masked_where(pvalue > 1, mean_change)

        #mean_change[mean_change > levels[-1]] = levels[-1] + 10

        to_plot[x_index, y_index] = mean_change

        print(to_plot.min(), to_plot.max())
        print(pvalue.min(), pvalue.max())

        x, y = b(lons2d, lats2d)

        cmap = cm.get_cmap(name="bwr", lut=len(clevels) - 1)
        #cmap = my_colormaps.get_cmap_from_ncl_spec_file(path="colormap_files/BlueRed.rgb", ncolors=len(levels) - 1)
        #cmap.set_over(cmap(levels[-1]))

        bn = BoundaryNorm(clevels, cmap.N)

        #mask glaciers and oceans
        to_plot = np.ma.masked_where(land_sea_glaciers_mask, to_plot)

        #b.pcolormesh(x, y, to_plot)
        #img = b.pcolormesh(x, y, to_plot, vmin = -100, vmax = 100)

        img = b.pcolormesh(x,
                           y,
                           to_plot,
                           cmap=cmap,
                           vmax=clevels[-1],
                           vmin=clevels[0],
                           norm=bn)
        if ax is None:
            cb = b.colorbar(img, extend="both", ticks=clevels)
            #cb = plt.colorbar(ticks = levels)
            cb.ax.set_title(r"${\rm m^3/s}$")

        b.drawcoastlines(linewidth=0.1)
        b.drawmapboundary(fill_color="0.75")
        #b.drawmeridians(meridians=np.arange(-180, 180, 60))
        #b.drawparallels(circles=np.arange(0, 90, 30))
        b.readshapefile("data/shp/wri_basins2/wribasin",
                        "basin",
                        color="k",
                        linewidth=1)

        if ax is None:
            plt.tight_layout()
            imfile = "offline_rout_{0}_mean_abschange_map_{1}_({2})-({3}).jpeg".format(
                season, label, p_future, p_current)
            plt.savefig(imfile, dpi=400)

        return img  # NOTE: returns inside the loop, so only the first path is processed
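
The commented-out masked_where line above hints at hiding cells whose change is not statistically significant; only glaciers and oceans end up masked in the version that runs. A sketch of the significance mask (the 0.05 threshold is an assumption, not taken from the script):

import numpy as np

def mask_nonsignificant(mean_change, pvalue, alpha=0.05):
    # Hide grid cells where the one-sample t-test on the change fails alpha.
    return np.ma.masked_where(pvalue > alpha, mean_change)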
Example #26
        # Fragment: assumes enclosing loops over models (k) and subjects (s),
        # with ev (a fitted event-segmentation model), nPerm, human_bounds, w,
        # K, nTR and z_scores defined by the surrounding (omitted) code.
        bounds = np.where(np.diff(np.argmax(ev.segments_[0], axis=1)))[0]
        
        match = np.zeros(nPerm+1)
        perm_bounds = bounds.copy()

        for p in range(nPerm+1):
            for hb in human_bounds:
                if np.any(np.abs(perm_bounds - hb) <= w):
                    match[p] += 1
            match[p] /= len(human_bounds)
            np.random.seed(p)
            perm_bounds = np.random.choice(nTR,K-1,replace=False)
        
        z_scores[k,s] = (match[0] - np.mean(match[1:]))/np.std(match[1:])
 
    t_scores[k] = stats.ttest_1samp(z_scores[k,:],0,axis=0)[0]

savedir = '/jukebox/norman/jamalw/MES/prototype/link/scripts/data/searchlight_output/HMM_searchlight_human_bounds_srm/plots/bilateral_mPFC/'   
np.save(savedir + 'zscores',z_scores)
np.save(savedir + 'tstats', t_scores)
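
# The matching statistic above, restated as a standalone helper for reference
# (a sketch; argument names mirror the surrounding script): match[0] scores
# the real boundaries against the human ones within a window w, match[1:]
# scores nPerm random boundary sets, and the z-score compares the two.
import numpy as np

def boundary_match_zscore(bounds, human_bounds, nTR, K, w, nPerm=1000):
    match = np.zeros(nPerm + 1)
    perm_bounds = np.asarray(bounds).copy()
    for p in range(nPerm + 1):
        for hb in human_bounds:
            if np.any(np.abs(perm_bounds - hb) <= w):
                match[p] += 1
        match[p] /= len(human_bounds)
        np.random.seed(p)
        perm_bounds = np.random.choice(nTR, K - 1, replace=False)
    return (match[0] - np.mean(match[1:])) / np.std(match[1:])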

fig, ax1 = plt.subplots()

xp = np.linspace(3, features[-1], 100)  # features[-1:] would pass a length-1 array as the endpoint

p1 = np.poly1d(np.polyfit(features,np.mean(z_scores,axis=1),2))
ax1.plot(features,np.mean(z_scores,axis=1),'.',xp, p1(xp), '-',color='k',linewidth=3,markersize=15)
ax1.set_ylabel('average z', color='k', fontsize=18)
ax1.tick_params(labelsize=15)

ax2 = ax1.twinx()