import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr, ttest_1samp, gmean


def apply_stats(data, runTTest):
    peakList = getPeakList(data)
    tempList = []
    colNames = ['Fatty Acid Type',           # 1
                'Peak Name',                 # 2
                'Pearson Coefficient',       # 3
                'Pearson P Value',           # 4
                'Spearman Coefficient',      # 5
                'Spearman P Value',          # 6
                'P Geometric Mean (%)',      # 7
                'Q Geometric Mean (ug/ml)',  # 8
                'P Mean (%)',                # 9
                'P Stdev',                   # 10
                'Q Mean (ug/ml)',            # 11
                'Q Stdev',                   # 12
                'P T-test',                  # 13
                'P T-test P value',          # 14
                'Q T-test',                  # 15
                'Q T-test P value',          # 16
                'Common Name']               # 17
    for entry in peakList:
        try:
            pearson = pearsonr(entry['p'], entry['q'])
            spearman = spearmanr(entry['p'], entry['q'])
            if runTTest == 'y':
                ttestP = ttest_1samp(entry['p'], 0)
                ttestQ = ttest_1samp(entry['q'], 0)
            else:
                ttestP = ('-', '-')
                ttestQ = ('-', '-')
            # Append each row as a single 17-element list so that
            # pd.DataFrame(tempList, columns=colNames) builds one row per peak.
            tempList.append([entry['FAtype'],             # 1
                             entry['peakName'],           # 2
                             pearson[0],                  # 3
                             pearson[1],                  # 4
                             spearman[0],                 # 5
                             spearman[1],                 # 6
                             gmean(entry['p']),           # 7
                             gmean(entry['q']),           # 8
                             np.mean(entry['p']),         # 9
                             np.std(entry['p'], ddof=1),  # 10
                             np.mean(entry['q']),         # 11
                             np.std(entry['q'], ddof=1),  # 12
                             ttestP[0],                   # 13
                             ttestP[1],                   # 14
                             ttestQ[0],                   # 15
                             ttestQ[1],                   # 16
                             entry['common']])            # 17
        except (KeyError, ValueError):
            # Skip peaks with missing or malformed data.
            pass
    return pd.DataFrame(tempList, columns=colNames)
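# Minimal usage sketch (not part of the original source): the statistics that
# apply_stats computes per peak, run on synthetic 'p'/'q' arrays. The fake data
# and variable names here are assumptions for illustration only.
import numpy as np
from scipy.stats import pearsonr, spearmanr, ttest_1samp, gmean

rng = np.random.default_rng(0)
p = rng.uniform(0.1, 5.0, size=12)           # e.g. percentage values
q = 2.0 * p + rng.normal(0, 0.5, size=12)    # e.g. concentrations correlated with p
print(pearsonr(p, q))       # (coefficient, p-value)
print(spearmanr(p, q))      # (coefficient, p-value)
print(ttest_1samp(p, 0))    # H0: mean of p is 0
print(gmean(p), np.mean(p), np.std(p, ddof=1))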
def doStudentT(allData):
    for fileIndex, data in enumerate(allData):
        print('***************************** FILE: test_', fileIndex,
              '******************************')
        for name in ['v0', 'v1', 'v2', 'v3', 'v4', 'v5', 'v6', 'v7']:
            result = stats.ttest_1samp(data[name], 0)
            # Each result is a (t-statistic, p-value) pair, so plt.plot draws
            # a two-point line per variable.
            plt.plot(result, label=name)
        plt.xlabel('P value')
        plt.ylabel('T statistic')
        plt.title("Histogram for dataset " + str(fileIndex))
        plt.legend()
        plt.show()
def detect_trend(self, time_series_x: np.ndarray, time_series_y: np.ndarray):
    """
    Performs Innovative Trend Analysis on the given time series or signal.
    The method is visual, so it also writes a file with a plot of the result.

    :param time_series_x: time variable of the time series to analyze
    :param time_series_y: values of the time series to analyze
    """
    # Odd-length series are problematic: drop the last point so the series
    # splits into two equal halves. (Parity must be tested with the modulo
    # operator; the original used integer division, which is always truthy
    # for series longer than one point.)
    if time_series_y.shape[0] % 2 != 0:
        time_series_y = time_series_y[:-1]
    first_half, second_half = np.split(time_series_y, indices_or_sections=2)
    first_half = np.sort(first_half)
    second_half = np.sort(second_half)
    self._plot_ita(first_half=first_half, second_half=second_half,
                   time_series_min=np.min(time_series_y),
                   time_series_max=np.max(time_series_y))
    second_half = second_half - first_half
    np.random.shuffle(second_half)
    # Compare the half-to-half differences with the no-trend line mean of 0.
    if second_half.shape[0] < 30:
        _, p_score = stats.ttest_1samp(second_half, 0.0)
    else:
        _, p_score = ztest(second_half, value=0.0)
    trend = p_score <= self.confidence_level
    return trend
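# Minimal sketch of the Innovative Trend Analysis test above (not from the
# original source): split a synthetic trending series in half, sort each half,
# and t-test the sorted differences against 0. The data here is made up.
import numpy as np
from scipy import stats

rng = np.random.default_rng(42)
y = np.linspace(0, 1, 40) + rng.normal(0, 0.1, 40)  # upward trend + noise
if y.shape[0] % 2 != 0:
    y = y[:-1]
first_half, second_half = np.split(y, 2)
diffs = np.sort(second_half) - np.sort(first_half)
t_stat, p_value = stats.ttest_1samp(diffs, 0.0)
print(t_stat, p_value)  # a small p-value suggests the halves differ, i.e. a trend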
def approximate_random_effects(data, labels, group):
    correlation_per_donor = {}
    for donor_id in set(data[group]):
        correlation_per_donor[donor_id], _, _, _, _ = linregress(
            list(data[labels[0]][data[group] == donor_id]),
            list(data[labels[1]][data[group] == donor_id]))
    # dict.values() must be materialized as a list under Python 3
    slopes = list(correlation_per_donor.values())
    average_slope = np.array(slopes).mean()
    t, p_val = ttest_1samp(slopes, 0)
    print("Averaged slope across donors = %g (t=%g, p=%g)" % (average_slope, t, p_val))
    return average_slope, t, p_val
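# Minimal sketch of the approximate random-effects pattern above (not from the
# original source): fit one regression slope per group, then run a one-sample
# t-test of the slopes against 0 across groups. Six donors are assumed here.
import numpy as np
from scipy.stats import linregress, ttest_1samp

rng = np.random.default_rng(1)
slopes = []
for _ in range(6):                        # six hypothetical donors
    x = rng.normal(size=50)
    y = 0.3 * x + rng.normal(size=50)     # true positive slope
    slopes.append(linregress(x, y)[0])
t, p = ttest_1samp(slopes, 0)
print(np.mean(slopes), t, p)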
def calculate_gene_expression_similarity(reduced_stat_map_data, mask="full"):
    store_file = "/ahba_data/store_max1_reduced.h5"
    subcortex_mask = "/ahba_data/subcortex_mask.npy"

    results_dfs = []
    with pd.HDFStore(store_file, 'r') as store:
        for donor_id in store.keys():
            print("Loading expression data (%s)" % donor_id)
            expression_data = store.get(donor_id.replace(".", "_"))

            print("Getting statmap values (%s)" % donor_id)
            nifti_values = reduced_stat_map_data[expression_data.columns]

            print("Removing missing values (%s)" % donor_id)
            na_mask = np.isnan(nifti_values)
            if mask == "subcortex":
                na_mask = np.logical_or(
                    na_mask,
                    np.isnan(np.load(subcortex_mask)[expression_data.columns]))
            elif mask == "cortex":
                na_mask = np.logical_or(
                    na_mask,
                    np.logical_not(np.isnan(np.load(subcortex_mask)[expression_data.columns])))
            else:
                assert mask == "full"

            nifti_values = np.array(nifti_values)[np.logical_not(na_mask)]
            expression_data.drop(expression_data.columns[na_mask], axis=1, inplace=True)

            print("z scoring (%s)" % donor_id)
            expression_data = pd.DataFrame(zscore(expression_data, axis=1),
                                           columns=expression_data.columns,
                                           index=expression_data.index)
            nifti_values = zscore(nifti_values)

            print("Calculating linear regressions (%s)" % donor_id)
            regression_results = np.linalg.lstsq(
                np.c_[nifti_values, np.ones_like(nifti_values)], expression_data.T)
            results_df = pd.DataFrame({"slope": regression_results[0][0]},
                                      index=expression_data.index)
            results_df.columns = pd.MultiIndex.from_tuples(
                [(donor_id[1:], c,) for c in results_df.columns],
                names=['donor_id', 'parameter'])
            results_dfs.append(results_df)

    print("Concatenating results")
    results_df = pd.concat(results_dfs, axis=1)
    del results_dfs

    # One-sample t-test of the per-donor slopes against 0, gene by gene.
    t, p = ttest_1samp(results_df, 0.0, axis=1)
    group_results_df = pd.DataFrame({"t": t, "p": p}, columns=['t', 'p'],
                                    index=expression_data.index)
    _, group_results_df["p (FDR corrected)"], _, _ = multipletests(
        group_results_df.p, method='fdr_bh')
    group_results_df["variance explained (mean)"] = \
        (results_df.xs('slope', axis=1, level=1) ** 2 * 100).mean(axis=1)
    group_results_df["variance explained (std)"] = \
        (results_df.xs('slope', axis=1, level=1) ** 2 * 100).std(axis=1)
    del results_df
    probe_info = pd.read_csv("/ahba_data/probe_info_max1.csv",
                             index_col=0).drop(['chromosome', "gene_id"], axis=1)
    group_results_df = group_results_df.join(probe_info)
    group_results_df = group_results_df[["gene_symbol", "entrez_id.1", "gene_name",
                                         "t", "p", "p (FDR corrected)",
                                         "variance explained (mean)",
                                         "variance explained (std)"]]
    return group_results_df
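# Minimal sketch (not from the original source) of the group step above:
# one-sample t-tests across donors (columns) for many genes (rows), then
# Benjamini-Hochberg FDR correction of the resulting p-values. The array
# shapes are assumptions for illustration.
import numpy as np
from scipy.stats import ttest_1samp
from statsmodels.stats.multitest import multipletests

rng = np.random.default_rng(0)
slopes = rng.normal(0.1, 1.0, size=(1000, 6))   # 1000 genes x 6 donors
t, p = ttest_1samp(slopes, 0.0, axis=1)         # one test per gene
rejected, p_fdr, _, _ = multipletests(p, method='fdr_bh')
print(rejected.sum(), "genes significant after FDR")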
def approximate_random_effects(data, labels, group):
    # NOTE: gene, cog_function, output_file, output_file_GSEA and plot_pdf
    # are module-level globals in the original script.
    slope_per_donor = np.array([])
    rval_per_donor = np.array([])
    # For each donor id, perform a linear regression.
    for donor_id in set(data[group]):
        slope, _, rval, p_val, stderr = linregress(
            list(data[labels[0]][data[group] == donor_id]),
            list(data[labels[1]][data[group] == donor_id]))
        slope_per_donor = np.append(slope_per_donor, slope)
        rval_per_donor = np.append(rval_per_donor, rval)
    average_slope = round(np.nanmean(slope_per_donor), 6)  # mean slope across donors
    average_rval = round(np.nanmean(rval_per_donor), 6)    # mean r-value across donors
    # t-test (redundant information for downstream analyses)
    t_value, p_value = ttest_1samp(slope_per_donor, 0)

    # saving full data to .csv (the row layout assumes exactly six donors)
    with open(output_file, 'a') as f:
        w = csv.writer(f)
        w.writerow([gene, average_rval, average_slope,
                    rval_per_donor[0], rval_per_donor[1], rval_per_donor[2],
                    rval_per_donor[3], rval_per_donor[4], rval_per_donor[5],
                    t_value, p_value])

    # saving GSEA input data to .csv
    with open(output_file_GSEA, 'a') as f:
        w = csv.writer(f, delimiter='\t')
        w.writerow([gene, average_rval])

    # Scatterplot of gene expression against reverse-inference fMRI map z-score
    print("Plotting the correlation graph...")
    ax = sns.lmplot(labels[0], labels[1], data, hue=group, legend=True,
                    fit_reg=True)  # comment out for no plotting
    ax.set(xlabel="%s map z-score value" % (cog_function.capitalize()))
    ax = plot.title(gene)
    print("Saving the correlation graph...")
    plot.savefig(plot_pdf, format='pdf')
    plot.close()
    return
def approximate_random_effects(data, labels, group):
    correlation_per_donor = {}
    for donor_id in set(data[group]):
        correlation_per_donor[donor_id], _, _, _, _ = linregress(
            list(data[labels[0]][data[group] == donor_id]),
            list(data[labels[1]][data[group] == donor_id]))
    # dict.values() must be materialized as a list under Python 3
    slopes = list(correlation_per_donor.values())
    average_slope = np.array(slopes).mean()
    t, p_val = ttest_1samp(slopes, 0)
    print("Averaged slope across donors = %g (t=%g, p=%g)" % (average_slope, t, p_val))
    sns.violinplot([slopes], inner="points", names=["donors"])
    plt.ylabel("Linear regression slopes between %s and %s" % (labels[0], labels[1]))
    plt.axhline(0, color="red")
    sns.lmplot(labels[0], labels[1], data, hue=group, col=group, col_wrap=3)
    plt.show()
    return average_slope, t, p_val
def roc_stats():
    path = "/root/robbis/fmri/carlo_ofp/0_results/"
    import glob

    ##### ROC t-test ####
    rocfile = glob.glob(os.path.join(path, "0_roc*total.nii.gz"))
    rocimg = ni.load(rocfile[0])

    # Voxelwise one-sample t-test against chance level (AUC = 0.5) along the
    # fourth (subject) axis.
    t, p = ttest_1samp(rocimg.get_data(), 0.5, axis=3)

    q = np.zeros_like(p)
    q[np.logical_not(p == 0)] = 1 - p[np.logical_not(p == 0)]
    t[np.isinf(t)] = 0
    p[np.isnan(p)] = 0

    ni.save(ni.Nifti1Image(q, rocimg.affine), os.path.join(path, "0_roc_ttest_q.nii.gz"))
    ni.save(ni.Nifti1Image(p, rocimg.affine), os.path.join(path, "0_roc_ttest_p.nii.gz"))
    ni.save(ni.Nifti1Image(t, rocimg.affine), os.path.join(path, "0_roc_ttest_t.nii.gz"))
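# Minimal sketch (not from the original source): a voxelwise one-sample t-test
# of accuracy maps against chance (0.5) along the subject axis, as in
# roc_stats above but on a synthetic 4D array.
import numpy as np
from scipy.stats import ttest_1samp

rng = np.random.default_rng(0)
maps = rng.normal(0.55, 0.05, size=(4, 4, 4, 12))  # x, y, z, subjects
t, p = ttest_1samp(maps, 0.5, axis=3)
print(t.shape, p.min())  # one t and one p per voxel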
def transform(self, labels):
    ds = self.dataset
    conditions = self.conditions
    single_value = self.sample_value

    if conditions is None and single_value is None:
        raise ValueError()
    elif len(conditions) > 2:
        raise ValueError()

    if single_value is not None:
        # One-sample t-test of the whole dataset against a fixed value.
        t, p = ttest_1samp(ds, single_value, axis=0)
        return t, p

    # Two-sample t-test between the two conditions.
    t, p = ttest_ind(ds[labels == conditions[0]],
                     ds[labels == conditions[1]],
                     axis=0)
    t[np.isnan(t)] = 1
    return t
def run(self, labels):
    # Variant of transform() above with the same logic.
    ds = self.dataset
    conditions = self.conditions
    single_value = self.sample_value

    if conditions is None and single_value is None:
        raise ValueError()
    elif len(conditions) > 2:
        raise ValueError()

    if single_value is not None:
        # One-sample t-test of the whole dataset against a fixed value.
        t, p = ttest_1samp(ds, single_value, axis=0)
        return t, p

    # Two-sample t-test between the two conditions.
    t, p = ttest_ind(ds[labels == conditions[0]],
                     ds[labels == conditions[1]],
                     axis=0)
    t[np.isnan(t)] = 1
    return t
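# Minimal sketch (not from the original source) of the pattern shared by
# transform()/run() above: a feature-wise one-sample t-test when a reference
# value is given, otherwise a two-sample t-test between two conditions.
import numpy as np
from scipy.stats import ttest_1samp, ttest_ind

rng = np.random.default_rng(0)
ds = rng.normal(size=(20, 5))                  # samples x features
labels = np.array(['a', 'b'] * 10)
t1, p1 = ttest_1samp(ds, 0.0, axis=0)          # against a fixed value
t2, p2 = ttest_ind(ds[labels == 'a'], ds[labels == 'b'], axis=0)
print(t1.shape, t2.shape)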
# read the data frame
df = pd.read_csv('data_hier.csv')

# approximate random effects:
# linregress returns slope[0], intercept[1], r_value[2], p_value[3], std_err[4]
# (swap loss_avg_dec_thr for gain_avg_dec_thr to analyze the gain condition)
correlation_per_study = df.groupby('study_id').apply(
    lambda v: linregress(v.age, v.loss_avg_dec_thr)[0])
correlation_df = correlation_per_study.reset_index()
correlation_df.columns = ['study_id', 'slope']

average_slope = np.mean(correlation_df.slope)
t, p_val = ttest_1samp(correlation_df.slope, 0)
print("Averaged slope across studies = %g (t=%g, p=%g)" % (average_slope, t, p_val))

sns.violinplot(correlation_df.slope, inner="points", names=["studies"])
plt.ylabel("Linear regression slopes between age and decision threshold")
plt.axhline(0, color="red")
sns.lmplot('age', 'loss_avg_dec_thr', data=df, hue='study_id',
           col='study_id', col_wrap=3)
plt.show()
def plotExpBox_Main(inputFiles, headers, valcols, outputFile, sep, startRow,
                    showIndPoints, mark, markMean, showMean, notch, whisker,
                    outliers, plotPvalueCluster, outputClusterPrefix,
                    methodCluster, xlegendrotation, xlabe, ylabe, figsz, titl,
                    showSampleSizes, trimToMinSize, relabels):

    # Read the values to plot: one list per selected column, across input files.
    plotData = []
    xtickLabels = []
    minSize = -1
    for inputFile, header, cols in zip(inputFiles, headers, valcols):
        fin = generic_istream(inputFile)
        startIdx = len(plotData)
        for col in cols:
            plotData.append([])
            xtickLabels.append(header[col])
        colIndices = range(startIdx, startIdx + len(cols))
        lino = 0
        for lin in fin:
            lino += 1
            if lino < startRow:
                continue
            fields = lin.rstrip("\r\n").split(sep)
            for idx, col in zip(colIndices, cols):
                try:
                    value = float(fields[col])
                    plotData[idx].append(value)
                except (ValueError, IndexError):
                    pass
        fin.close()
        if minSize == -1:
            minSize = len(plotData[idx])  # or startIdx?
        else:
            minSize = min([minSize, len(plotData[idx])])

    if trimToMinSize:
        print("trimming to min size =", minSize, file=stderr)
        trimData(plotData, minSize)

    if len(relabels) > 0:
        if len(relabels) != len(xtickLabels):
            print("relabels doesn't have the same length as original label vectors",
                  xtickLabels, "=>", relabels, file=stderr)
            exit()
        xtickLabels = relabels

    print("student t-test (1 sample; mean=0)", file=stdout)
    print("sample", "mean", "p-val", file=stdout)
    for x in range(0, len(plotData)):
        print(xtickLabels[x], mean(plotData[x]),
              ttest_1samp(plotData[x], 0)[1], file=stdout)

    def pairwiseMatrix(testName, pairPvalue, clusterTag):
        # Print a symmetric matrix of pairwise p-values (the lower triangle
        # mirrors the upper one) and optionally plot the p-value clusters.
        print("", file=stdout)
        print(testName, file=stdout)
        print("p-val", end=" ", file=stdout)
        for x in range(0, len(plotData)):
            print(xtickLabels[x], end=" ", file=stdout)
        print("", file=stdout)
        pvalueM = []
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print(xtickLabels[x], end=" ", file=stdout)
            for y in range(0, len(plotData)):
                if y <= x:
                    print("", end=" ", file=stdout)
                    if x == y:
                        pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    pvalue = pairPvalue(plotData[x], plotData[y])
                    print(pvalue, end=" ", file=stdout)
                    pvalueRow.append(pvalue)
            print("", file=stdout)
        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + clusterTag + "_raw", xtickLabels, pvalueM)
            makePValueClusterPlot(outputClusterPrefix + clusterTag, xtickLabels, pvalueM, methodCluster)

    pairwiseMatrix("student t-test (2 samples)",
                   lambda a, b: ttest_ind(a, b)[1], "_t")
    pairwiseMatrix("welch t-test",
                   lambda a, b: welchs_approximate_ttest_arr(a, b)[3], "_Welch")
    # Mann-Whitney U is one-tailed here; multiply by 2 for a two-tailed p-value.
    pairwiseMatrix("non-parametric (Mann-Whitney U)",
                   lambda a, b: mannwhitneyu(a, b)[1] * 2, "_U")

    figure(figsize=figsz)
    if len(titl) == 0:
        titl = outputFile

    plotExpBox(plotData, xtickLabels, showIndPoints, mark, markMean, showMean,
               notch, whisker, outliers, xlegendrotation, xlabe, ylabe, titl,
               showSampleSizes)
    savefig(outputFile, bbox_inches="tight")
def plotExpBox_Main(inputFile, header, cols, outputFile, sep, startRow,
                    showIndPoints, mark, markMean, showMean, notch, whisker,
                    outliers, plotPvalueCluster, outputClusterPrefix,
                    methodCluster, xlegendrotation, xlabe, ylabe, figsz, titl,
                    showSampleSizes):

    # Read the values to plot, one list per selected column of the input file.
    fin = generic_istream(inputFile)
    plotData = []
    xtickLabels = []
    for col in cols:
        plotData.append([])
        xtickLabels.append(header[col])
    colIndices = range(0, len(cols))
    lino = 0
    for lin in fin:
        lino += 1
        if lino < startRow:
            continue
        fields = lin.rstrip("\r\n").split(sep)
        for idx, col in zip(colIndices, cols):
            try:
                value = float(fields[col])
                plotData[idx].append(value)
            except (ValueError, IndexError):
                pass
    fin.close()

    print("student t-test (1 sample; mean=0)", file=stdout)
    print("sample", "mean", "p-val", file=stdout)
    for x in range(0, len(plotData)):
        print(xtickLabels[x], mean(plotData[x]),
              ttest_1samp(plotData[x], 0)[1], file=stdout)

    def pairwiseMatrix(testName, pairPvalue, clusterTag):
        # Symmetric matrix of pairwise p-values; in this variant the raw
        # p-value plot is always written, the cluster plot only on request.
        print("", file=stdout)
        print(testName, file=stdout)
        print("p-val", end=" ", file=stdout)
        for x in range(0, len(plotData)):
            print(xtickLabels[x], end=" ", file=stdout)
        print("", file=stdout)
        pvalueM = []
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print(xtickLabels[x], end=" ", file=stdout)
            for y in range(0, len(plotData)):
                if y <= x:
                    print("", end=" ", file=stdout)
                    if x == y:
                        pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    pvalue = pairPvalue(plotData[x], plotData[y])
                    print(pvalue, end=" ", file=stdout)
                    pvalueRow.append(pvalue)
            print("", file=stdout)
        makePValueRawPlot(outputClusterPrefix + clusterTag + "_raw", xtickLabels, pvalueM)
        if plotPvalueCluster:
            makePValueClusterPlot(outputClusterPrefix + clusterTag, xtickLabels, pvalueM, methodCluster)

    pairwiseMatrix("student t-test (2 samples)",
                   lambda a, b: ttest_ind(a, b)[1], "_t")
    pairwiseMatrix("welch t-test",
                   lambda a, b: welchs_approximate_ttest_arr(a, b)[3], "_Welch")
    # Mann-Whitney U is one-tailed here; multiply by 2 for a two-tailed p-value.
    pairwiseMatrix("non-parametric (Mann-Whitney U)",
                   lambda a, b: mannwhitneyu(a, b)[1] * 2, "_U")

    figure(figsize=figsz)
    if len(titl) == 0:
        titl = outputFile

    plotExpBox(plotData, xtickLabels, showIndPoints, mark, markMean, showMean,
               notch, whisker, outliers, xlegendrotation, xlabe, ylabe, titl,
               showSampleSizes)
    savefig(outputFile, bbox_inches="tight")
results = np.zeros((REPEATS, ))
print("Stochastic repeats:")
for i in range(REPEATS):
    print("{}..".format(i + 1), end=" ", flush=True)
    results[i] = solver.solve(prob).getDistance() / 1000
print("\nDone!")

# Running minimum over the repeats, seeded with the greedy solution.
mono = results.copy()
mono[0] = min([mono[0], greedyDistance])
for i in range(1, REPEATS):
    mono[i] = min([mono[i], mono[i - 1]])

from matplotlib import pyplot as plt

plt.plot(range(1, REPEATS + 1), mono)
plt.plot(range(1, REPEATS + 1), [greedyDistance for x in mono])
plt.legend(['stochastic', 'greedy best first'])
plt.axis([0, REPEATS, 0.9 * mono.min(), 1.1 * mono.max()])
plt.title('Length of solution by Number of Repeats')
plt.xlabel('#Repeats')
plt.ylabel('Solution Length')
plt.grid(color='gray', linestyle=':', linewidth=1)
plt.show()

# Part 2: t-test whether the mean stochastic result differs from the greedy one.
mean_result = np.mean(results)
std_result = np.std(results)
tmp, pvalue = stats.ttest_1samp(results, greedyDistance)
print(mean_result, std_result, pvalue)
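# Minimal sketch (not from the original source): ttest_1samp can compare a
# sample mean against any reference value, here a hypothetical baseline cost.
import numpy as np
from scipy import stats

rng = np.random.default_rng(7)
costs = rng.normal(105.0, 4.0, size=30)   # stochastic solver results (made up)
baseline = 100.0                          # e.g. a greedy solution length
t, p = stats.ttest_1samp(costs, baseline)
print(t, p)  # a small p-value suggests the mean differs from the baseline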
def summarise(projects):
    summDB = PDatabase(local='summary.fs')
    C = CorrelationAnalyser()
    figs = []
    for f in range(4):
        figs.append(plt.figure())
    gs = gridspec.GridSpec(5, 5, wspace=0.3, hspace=0.5)
    i = 0
    data = []
    print('processing %s projects' % len(projects))
    for p in projects:
        print('structure:', p)
        DB = PDatabase(local=os.path.join(savepath, p))
        S = PEATTableModel(DB)
        try:
            exp, pre = S.getColumns(['Exp', 'prediction'], allowempty=False)
            errs = [j[0] - j[1] for j in zip(exp, pre)]
        except:
            print('no results')
            continue

        # add link to project
        summDB.add(p)
        summDB.addField('project', fieldtype='Project')
        summDB[p]['project'] = {'server': 'enzyme.ucd.ie', 'username': '******',
                                'project': p, 'password': '******', 'port': '8080'}
        print(summDB.isChanged())

        # stats
        cc, rmse, meanerr = C.getStats(pre, exp)
        # one-sample t-test for mean of errors = 0
        ttp = round(stats.ttest_1samp(errs, 0)[1], 2)
        # normality of errors
        w, swp = C.ShapiroWilk(errs)
        x = {'name': p, 'mutants': len(pre), 'rmse': rmse, 'corrcoef': cc,
             'meanerr': meanerr, 'ttest': ttp, 'shapirowilk': swp}

        '''ax = figs[0].add_subplot(gs[0, i])
        C.plotCorrelation(pre, exp, title=p, ms=2, axeslabels=False, ax=ax)
        ax = figs[1].add_subplot(gs[0, i])
        C.showHistogram([pre, exp], title=p, labels=['pre', 'exp'], ax=ax)
        ax = figs[2].add_subplot(gs[0, i])
        C.plotNorm(errs, title=p, lw=1, ax=ax)
        # qqplot
        ax = figs[3].add_subplot(gs[0, i])
        C.QQplot(errs, title=p, ax=ax)'''

        # get PDB info
        parser = PDBParser()
        descr = parser.getDescription(p)
        x.update(descr)
        data.append(x)
        i += 1

    summDB.importDict(data)
    print(summDB.isChanged())
    summDB.commit()

    '''add all peatsa jobs to summary proj also
    print('adding peatsa job info')
    PS = PEATSAPlugin()
    PS.main(DB=summDB)
    PS.checkJobsDict()
    PS.jobManager.stopLogging()
    for p in projects:
        DB = PDatabase(local=os.path.join(savepath, p))
        job = DB.meta.peatsa_jobs['mycalc']
        summDB.meta.peatsa_jobs[p] = job
        print(job)
    print(summDB.isChanged())
    print(summDB.meta.peatsa_jobs)
    summDB.commit()'''

    # for i in range(len(figs)):
    #     figs[i].savefig('fig%s.png' % i)
    # plt.show()
    return
def plotExpBox_Main(inputFiles, headers, valcols, outputFile, sep, startRow,
                    showIndPoints, mark, markMean, showMean, notch, whisker,
                    outliers, plotPvalueCluster, outputClusterPrefix,
                    methodCluster, xlegendrotation, xlabe, ylabe, figsz, titl,
                    showSampleSizes, trimToMinSize, relabels, logb,
                    plotHistogramToFile, plotMedianForGroups, botta):

    plotData = []
    xtickLabels = []
    minSize = -1
    for inputFile, header, cols in zip(inputFiles, headers, valcols):
        fin = generic_istream(inputFile)
        startIdx = len(plotData)
        for col in cols:
            plotData.append([])
            xtickLabels.append(header[col])
        colIndices = range(startIdx, startIdx + len(cols))
        lino = 0
        for lin in fin:
            lino += 1
            if lino < startRow:
                continue
            fields = lin.rstrip("\r\n").split(sep)
            for idx, col in zip(colIndices, cols):
                try:
                    value = float(fields[col])
                    if logb != 0:
                        if value == 0.0:
                            raise ValueError
                        value = log(value) / logb
                    plotData[idx].append(value)
                except (ValueError, IndexError):
                    pass
        fin.close()
        if minSize == -1:
            minSize = len(plotData[idx])  # or startIdx?
        else:
            minSize = min([minSize, len(plotData[idx])])

    if trimToMinSize:
        print("trimming to min size =", minSize, file=stderr)
        trimData(plotData, minSize)

    if len(relabels) > 0:
        # Relabels may be shorter than the label vector: replace positionally.
        print(xtickLabels, file=stderr)
        print(relabels, file=stderr)
        for i, relabel in zip(range(0, len(relabels)), relabels):
            xtickLabels[i] = relabel

    for i in range(0, len(plotMedianForGroups)):
        plotMedianForGroups[i] = getCol0ListFromCol1ListStringAdv(xtickLabels, plotMedianForGroups[i])

    # medians to draw as horizontal guides
    medianToDraw = []
    for mediangrouper in plotMedianForGroups:
        curD = []
        for c in mediangrouper:
            curD.extend(plotData[c])
        medianToDraw.append(median(curD))

    # drop empty columns
    for c in range(len(plotData) - 1, -1, -1):
        if len(plotData[c]) == 0:
            print(xtickLabels[c], "discarded", file=stderr)
            del plotData[c]
            del xtickLabels[c]

    print("student t-test (1 sample; mean=0)", file=stdout)
    print("sample", "mean", "p-val", file=stdout)
    for x in range(0, len(plotData)):
        try:
            print(xtickLabels[x], mean(plotData[x]),
                  ttest_1samp(plotData[x], 0)[1], file=stdout)
        except:
            print(xtickLabels[x], "NA", "NA", file=stdout)

    def pairwiseMatrix(testName, pairPvalue, clusterTag, errorValue=None):
        # Symmetric matrix of pairwise p-values; errorValue (if given) is
        # substituted when a test raises.
        print("", file=stdout)
        print(testName, file=stdout)
        print("p-val", end=" ", file=stdout)
        for x in range(0, len(plotData)):
            print(xtickLabels[x], end=" ", file=stdout)
        print("", file=stdout)
        pvalueM = []
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print(xtickLabels[x], end=" ", file=stdout)
            for y in range(0, len(plotData)):
                if y <= x:
                    print("", end=" ", file=stdout)
                    if x == y:
                        pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    if errorValue is None:
                        pvalue = pairPvalue(plotData[x], plotData[y])
                    else:
                        try:
                            pvalue = pairPvalue(plotData[x], plotData[y])
                        except:
                            pvalue = errorValue
                    print(pvalue, end=" ", file=stdout)
                    pvalueRow.append(pvalue)
            print("", file=stdout)
        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + clusterTag + "_raw", xtickLabels, pvalueM)
            makePValueClusterPlot(outputClusterPrefix + clusterTag, xtickLabels, pvalueM, methodCluster)

    pairwiseMatrix("student t-test (2 samples)",
                   lambda a, b: ttest_ind(a, b)[1], "_t", errorValue=1.0)
    pairwiseMatrix("welch t-test",
                   lambda a, b: welchs_approximate_ttest_arr(a, b)[3], "_Welch")
    # Mann-Whitney U is one-tailed here; multiply by 2 for a two-tailed p-value.
    pairwiseMatrix("non-parametric (Mann-Whitney U)",
                   lambda a, b: mannwhitneyu(a, b)[1] * 2, "_U", errorValue=1.0)

    figure(figsize=figsz)
    subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)
    if len(titl) == 0:
        titl = outputFile

    plotExpBox(plotData, xtickLabels, showIndPoints, mark, markMean, showMean,
               notch, whisker, outliers, xlegendrotation, xlabe, ylabe, titl,
               showSampleSizes)
    for m in medianToDraw:
        axhline(y=m, linestyle=':', color='gray')
    savefig(outputFile, bbox_inches="tight")

    if len(plotHistogramToFile) > 0:
        drawHistogram(plotHistogramToFile, plotData, xtickLabels)
def plotExpBox_Main(inputFiles, headers, valcols, outputFile, sep, startRow,
                    showIndPoints, mark, markMean, showMean, notch, whisker,
                    outliers, plotPvalueCluster, outputClusterPrefix,
                    methodCluster, xlegendrotation, xlabe, ylabe, figsz, titl,
                    showSampleSizes, trimToMinSize, relabels, logb,
                    plotHistogramToFile, plotMedianForGroups, botta, showViolin,
                    showBox, firstColAnnot, plotTrend, showLegend, makePzfxFile,
                    makeBinMatrix, writeDataSummaryStat, summaryStatRange,
                    minuslog10pvalue, minNDataToKeep, vfacecolor, valpha,
                    outXYZPvalues, dividePlots):

    # NOTE: skipStat is referenced below but is not a parameter; in the
    # original module it is a global flag.

    plotData = []
    xtickLabels = []
    trendData = {}
    annot = {}
    minSize = -1

    for inputFile, header, cols in zip(inputFiles, headers, valcols):
        fin = generic_istream(inputFile)
        startIdx = len(plotData)
        if firstColAnnot:
            colAnnot = cols[0]
            cols = cols[1:]
            annotThisFile = []
            annot[startIdx] = annotThisFile
        else:
            colAnnot = -1
            annotThisFile = None
        for col in cols:
            plotData.append([])
            xtickLabels.append(header[col])
        colIndices = range(startIdx, startIdx + len(cols))
        if plotTrend:
            trendDataThisFile = []
            trendData[startIdx] = trendDataThisFile
        else:
            trendDataThisFile = None
        lino = 0
        for lin in fin:
            lino += 1
            if lino < startRow:
                continue
            fields = lin.rstrip("\r\n").split(sep)
            if plotTrend:
                trendDataThisLine = []
            else:
                trendDataThisLine = None
            allDataOKThisLine = True
            if colAnnot >= 0:
                annotThisFile.append(fields[colAnnot])
            for idx, col in zip(colIndices, cols):
                try:
                    value = float(fields[col])
                    if logb != 0:
                        if value == 0.0:
                            raise ValueError
                        value = log(value) / logb
                    plotData[idx].append(value)
                    if plotTrend:
                        trendDataThisLine.append(value)
                except (ValueError, IndexError):
                    allDataOKThisLine = False
            if plotTrend:
                if allDataOKThisLine:
                    trendDataThisFile.append(trendDataThisLine)
                else:
                    trendDataThisFile.append(None)
        fin.close()
        if minSize == -1:
            minSize = len(plotData[idx])  # or startIdx?
        else:
            minSize = min([minSize, len(plotData[idx])])

    if trimToMinSize:
        print("trimming to min size =", minSize, file=stderr)
        trimData(plotData, minSize)

    if len(relabels) > 0:
        # Relabels may be shorter than the label vector: replace positionally.
        print(xtickLabels, file=stderr)
        print(relabels, file=stderr)
        for i, relabel in zip(range(0, len(relabels)), relabels):
            xtickLabels[i] = relabel

    for i in range(0, len(plotMedianForGroups)):
        plotMedianForGroups[i] = getCol0ListFromCol1ListStringAdv(xtickLabels, plotMedianForGroups[i])

    # medians to draw as horizontal guides
    medianToDraw = []
    for mediangrouper in plotMedianForGroups:
        curD = []
        for c in mediangrouper:
            curD.extend(plotData[c])
        medianToDraw.append(median(curD))

    # drop columns with too few data points
    for c in range(len(plotData) - 1, -1, -1):
        if len(plotData[c]) < minNDataToKeep:
            print(xtickLabels[c], "discarded because has only",
                  len(plotData[c]), "data points <", minNDataToKeep, file=stderr)
            del plotData[c]
            del xtickLabels[c]

    if not skipStat:
        print("student t-test (1 sample; mean=0)", file=stdout)
        print("sample", "mean", "p-val", "median", file=stdout)
        if writeDataSummaryStat:
            fDSS = open(writeDataSummaryStat, "w")
            print("sample\tmean\tvar\tsd\tmin\tmax\tN\tNInRange[" +
                  str(summaryStatRange[0]) + "," + str(summaryStatRange[1]) +
                  "]\t%NInRange\tNbelowRange\t%Nbelow\tNAboveRange\t%NAbove",
                  file=fDSS)
        for x in range(0, len(plotData)):
            try:
                print(xtickLabels[x], mean(plotData[x]),
                      ttest_1samp(plotData[x], 0)[1], median(plotData[x]),
                      file=stdout)
            except:
                print(xtickLabels[x], mean(plotData[x]), "NA",
                      median(plotData[x]), file=stdout)
            if writeDataSummaryStat:
                sumData, N, NIN, NBelow, NAbove = filterDataInRangeInclusive(
                    plotData[x], summaryStatRange[0], summaryStatRange[1])
                if NIN > 1:
                    mea = mean2(sumData)
                    DDOF = 1
                    sd = std(sumData, ddof=DDOF)
                    var = sd * sd
                    mi = min(sumData)
                    ma = max(sumData)
                else:
                    mea = "NA"
                    sd = "NA"
                    var = "NA"
                    mi = "NA"
                    ma = "NA"
                print(xtickLabels[x] + "\t" + str(mea) + "\t" + str(var) +
                      "\t" + str(sd) + "\t" + str(mi) + "\t" + str(ma) +
                      "\t" + str(N) + "\t" + str(NIN) + "\t" +
                      str(float(NIN) * 100 / N) + "\t" + str(NBelow) + "\t" +
                      str(float(NBelow) * 100 / N) + "\t" + str(NAbove) +
                      "\t" + str(float(NAbove) * 100 / N), file=fDSS)
        if writeDataSummaryStat:
            fDSS.close()

        def pairwiseMatrix(testName, pairPvalue, errorValue, clusterTag, xyzFile=None):
            # Symmetric matrix of pairwise p-values (the lower triangle
            # mirrors the upper one); errorValue is substituted when a test
            # raises, and p-values are optionally -log10 transformed.
            print("", file=stdout)
            print(testName, file=stdout)
            print("p-val", end=" ", file=stdout)
            for x in range(0, len(plotData)):
                print(xtickLabels[x], end=" ", file=stdout)
            print("", file=stdout)
            pvalueM = []
            for x in range(0, len(plotData)):
                pvalueRow = []
                pvalueM.append(pvalueRow)
                print(xtickLabels[x], end=" ", file=stdout)
                for y in range(0, len(plotData)):
                    if y <= x:
                        print("", end=" ", file=stdout)
                        if x == y:
                            pvalueRow.append(0.0 if minuslog10pvalue else 1.0)
                        else:
                            pvalueRow.append(pvalueM[y][x])
                    else:
                        try:
                            pvalue = pairPvalue(plotData[x], plotData[y])
                        except:
                            pvalue = errorValue
                        if minuslog10pvalue and str(pvalue) != "NA":
                            try:
                                pvalue = -1 * log(pvalue, 10)
                            except:
                                pvalue = -1000.0
                        print(pvalue, end=" ", file=stdout)
                        pvalueRow.append(pvalue)
                print("", file=stdout)
            if xyzFile:
                writeXYZPvalues(xyzFile, xtickLabels, pvalueM)
            if plotPvalueCluster:
                makePValueRawPlot(outputClusterPrefix + clusterTag + "_raw", xtickLabels, pvalueM)
                makePValueClusterPlot(outputClusterPrefix + clusterTag, xtickLabels, pvalueM, methodCluster)

        pairwiseMatrix("student t-test (2 samples)",
                       lambda a, b: ttest_ind(a, b)[1], 1.0, "_t")
        pairwiseMatrix("welch t-test",
                       lambda a, b: welchs_approximate_ttest_arr(a, b)[3], 1.0, "_Welch",
                       xyzFile=(outXYZPvalues + "_Welch.xyz") if outXYZPvalues else None)
        # Mann-Whitney U is one-tailed here; multiply by 2 for a two-tailed p-value.
        pairwiseMatrix("non-parametric (Mann-Whitney U)",
                       lambda a, b: mannwhitneyu(a, b)[1] * 2, 1.0, "_U",
                       xyzFile=(outXYZPvalues + "_U.xyz") if outXYZPvalues else None)
        # now the variance tests
        pairwiseMatrix("Ansari-Bradley Two-sample Test for difference in scale parameters",
                       lambda a, b: ansari(a, b)[1], "NA", "_Ansari")
        pairwiseMatrix("Fligner's Two-sample Test for equal variance (non-parametrics)",
                       lambda a, b: fligner(a, b)[1], "NA", "_fligner")
        pairwiseMatrix("Levene's Two-sample Test for equal variance",
                       lambda a, b: levene(a, b)[1], "NA", "_levene")
        pairwiseMatrix("Bartlett's Two-sample Test for equal variance (for normal distributions)",
                       lambda a, b: bartlett(a, b)[1], "NA", "_bartlett")

    figure(figsize=figsz)
    subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)
    if len(titl) == 0:
        titl = outputFile

    plotExpBox(plotData, xtickLabels, showIndPoints, mark, markMean, showMean,
               notch, whisker, outliers, xlegendrotation, xlabe, ylabe, titl,
               showSampleSizes, showViolin, showBox, annot, trendData,
               showLegend, makePzfxFile, makeBinMatrix, dividePlots)

    for m in medianToDraw:
        axhline(y=m, linestyle=':', color='gray')

    savefig(outputFile, bbox_inches="tight")

    if len(plotHistogramToFile) > 0:
        drawHistogram(plotHistogramToFile, plotData, xtickLabels)
        drawDensigram(plotHistogramToFile + ".density.png", plotData, xtickLabels)
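# Minimal sketch (not from the original source) of the pairwise p-value matrix
# that the plotExpBox_Main variants above print: a symmetric matrix with 1.0
# on the diagonal, filled from two-sample t-tests over all column pairs.
import numpy as np
from scipy.stats import ttest_ind

rng = np.random.default_rng(0)
groups = [rng.normal(mu, 1.0, size=25) for mu in (0.0, 0.2, 1.0)]
n = len(groups)
pmat = np.ones((n, n))
for x in range(n):
    for y in range(x + 1, n):
        pmat[x, y] = pmat[y, x] = ttest_ind(groups[x], groups[y])[1]
print(np.round(pmat, 4))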
def main(months=None, season="DJF", ax=None, clevels=None, labels=None, paths=None):
    if not months:
        months = [12, 1, 2]

    path_to_glaciers_land_sea_mask = "/b2_fs2/huziy/geophy_from_others/land_sea_glacier_mask_phy"
    land_sea_glaciers_mask = get_land_sea_glaciers_mask_from_geophysics_file(
        path=path_to_glaciers_land_sea_mask)

    p_current = "{0}-{1}".format(start_year_current, end_year_current)
    p_future = "{0}-{1}".format(start_year_future, end_year_future)

    lons2d = None
    lats2d = None
    x_index = None
    y_index = None
    b = None

    for the_path, label in zip(paths, labels):
        ds = Dataset(the_path)
        if lons2d is None:
            lons2d = ds.variables["longitude"][:]
            lats2d = ds.variables["latitude"][:]
            b = get_arctic_basemap_nps(round=True)
            x_index = ds.variables["x_index"][:]
            y_index = ds.variables["y_index"][:]

        cache_file = "_".join([str(m) for m in months]) + \
            "_{0}-{1}_{2}-{3}_{4}_mean_change_cache.bin".format(
                start_year_current, end_year_current,
                start_year_future, end_year_future, label)

        if not os.path.isfile(cache_file):
            time_str = ds.variables["time"][:]
            times = [datetime.strptime("".join(t_s), TIME_FORMAT) for t_s in time_str]
            data = ds.variables["water_discharge_accumulated"][:]
            df = pandas.DataFrame(data=data, index=times)
            df["year"] = df.index.map(lambda d: d.year)
            df["month"] = df.index.map(lambda d: d.month)
            print(df.shape, df.columns)

            # .loc replaces the long-removed .ix indexer
            data_current = df.loc[(df.year >= start_year_current) &
                                  (df.year <= end_year_current) &
                                  df.month.isin(months), :]
            print(data_current.columns)
            data_current = data_current.drop(["year", "month"], axis=1)
            # mean for the season, for each year
            seasonal_means_current = data_current.groupby(by=lambda d: d.year).mean()

            data_future = df.loc[(df.year >= start_year_future) &
                                 (df.year <= end_year_future) &
                                 df.month.isin(months), :]
            data_future = data_future.drop(["year", "month"], axis=1)
            seasonal_means_future = data_future.groupby(by=lambda d: d.year).mean()

            change = seasonal_means_future.values - seasonal_means_current.values
            mean_current = seasonal_means_current.values.mean(axis=0)
            mean_future = seasonal_means_future.values.mean(axis=0)
            # axis 0 - time, axis 1 - cell index
            mean_change = change.mean(axis=0)
            # One-sample t-test: is the mean change at each cell different from 0?
            t, pvalue = stats.ttest_1samp(change, 0, axis=0)

            data_map = {"current-mean": mean_current,
                        "future-mean": mean_future,
                        "change": mean_change,
                        "p-value": pvalue}
            # pickle files must be opened in binary mode
            pickle.dump(data_map, open(cache_file, mode="wb"))
        else:
            data_map = pickle.load(open(cache_file, "rb"))

        mean_change = data_map["change"]
        pvalue = data_map["p-value"]
        mean_current = data_map["current-mean"]

        if ax is None:
            plt.figure()

        to_plot = np.ma.masked_all_like(lons2d)
        to_plot[x_index, y_index] = mean_change
        print(to_plot.min(), to_plot.max())
        print(pvalue.min(), pvalue.max())

        x, y = b(lons2d, lats2d)
        cmap = cm.get_cmap(name="bwr", lut=len(clevels) - 1)
        bn = BoundaryNorm(clevels, cmap.N)

        # mask glaciers and oceans
        to_plot = np.ma.masked_where(land_sea_glaciers_mask, to_plot)
        img = b.pcolormesh(x, y, to_plot, cmap=cmap,
                           vmax=clevels[-1], vmin=clevels[0], norm=bn)
        if ax is None:
            cb = b.colorbar(img, extend="both", ticks=clevels)
            cb.ax.set_title(r"${\rm m^3/s}$")

        b.drawcoastlines(linewidth=0.1)
        b.drawmapboundary(fill_color="0.75")
        b.readshapefile("data/shp/wri_basins2/wribasin", "basin",
                        color="k", linewidth=1)

        if ax is None:
            plt.tight_layout()
            imfile = "offline_rout_{0}_mean_abschange_map_{1}_({2})-({3}).jpeg".format(
                season, label, p_future, p_current)
            plt.savefig(imfile, dpi=400)

    return img
def plotExpBox_Main(inputFiles, headers, valcols, outputFile, sep, startRow,
                    showIndPoints, mark, markMean, showMean, notch, whisker,
                    outliers, plotPvalueCluster, outputClusterPrefix,
                    methodCluster, xlegendrotation, xlabe, ylabe, figsz, titl,
                    showSampleSizes, trimToMinSize, relabels, logb,
                    plotHistogramToFile, plotMedianForGroups, botta):

    plotData = []
    xtickLabels = []
    minSize = -1
    for inputFile, header, cols in zip(inputFiles, headers, valcols):
        fin = generic_istream(inputFile)
        startIdx = len(plotData)
        for col in cols:
            plotData.append([])
            xtickLabels.append(header[col])
        colIndices = range(startIdx, startIdx + len(cols))
        lino = 0
        for lin in fin:
            lino += 1
            if lino < startRow:
                continue
            fields = lin.rstrip("\r\n").split(sep)
            for idx, col in zip(colIndices, cols):
                try:
                    value = float(fields[col])
                    if logb != 0:
                        value = log(value) / logb
                        if value < -100000:
                            raise ValueError
                    plotData[idx].append(value)
                except (ValueError, IndexError):
                    pass
        fin.close()
        if minSize == -1:
            minSize = len(plotData[idx])  # or startIdx?
        else:
            minSize = min([minSize, len(plotData[idx])])

    if trimToMinSize:
        print("trimming to min size =", minSize, file=stderr)
        trimData(plotData, minSize)

    if len(relabels) > 0:
        # Relabels may be shorter than the label vector: replace positionally.
        for i, relabel in zip(range(0, len(relabels)), relabels):
            xtickLabels[i] = relabel

    for i in range(0, len(plotMedianForGroups)):
        plotMedianForGroups[i] = getCol0ListFromCol1ListStringAdv(xtickLabels, plotMedianForGroups[i])

    # medians to draw as horizontal guides
    medianToDraw = []
    for mediangrouper in plotMedianForGroups:
        curD = []
        for c in mediangrouper:
            curD.extend(plotData[c])
        medianToDraw.append(median(curD))

    # drop empty columns
    for c in range(len(plotData) - 1, -1, -1):
        if len(plotData[c]) == 0:
            print(xtickLabels[c], "discarded", file=stderr)
            del plotData[c]
            del xtickLabels[c]

    print("student t-test (1 sample; mean=0)", file=stdout)
    print("sample", "mean", "p-val", file=stdout)
    for x in range(0, len(plotData)):
        try:
            print(xtickLabels[x], mean(plotData[x]),
                  ttest_1samp(plotData[x], 0)[1], file=stdout)
        except:
            print(xtickLabels[x], "NA", "NA", file=stdout)

    def pairwiseMatrix(testName, pairPvalue, clusterTag, errorValue=None):
        # Symmetric matrix of pairwise p-values; errorValue (if given) is
        # substituted when a test raises.
        print("", file=stdout)
        print(testName, file=stdout)
        print("p-val", end=" ", file=stdout)
        for x in range(0, len(plotData)):
            print(xtickLabels[x], end=" ", file=stdout)
        print("", file=stdout)
        pvalueM = []
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print(xtickLabels[x], end=" ", file=stdout)
            for y in range(0, len(plotData)):
                if y <= x:
                    print("", end=" ", file=stdout)
                    if x == y:
                        pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    if errorValue is None:
                        pvalue = pairPvalue(plotData[x], plotData[y])
                    else:
                        try:
                            pvalue = pairPvalue(plotData[x], plotData[y])
                        except:
                            pvalue = errorValue
                    print(pvalue, end=" ", file=stdout)
                    pvalueRow.append(pvalue)
            print("", file=stdout)
        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + clusterTag + "_raw", xtickLabels, pvalueM)
            makePValueClusterPlot(outputClusterPrefix + clusterTag, xtickLabels, pvalueM, methodCluster)

    pairwiseMatrix("student t-test (2 samples)",
                   lambda a, b: ttest_ind(a, b)[1], "_t", errorValue=1.0)
    pairwiseMatrix("welch t-test",
                   lambda a, b: welchs_approximate_ttest_arr(a, b)[3], "_Welch")
    # Mann-Whitney U is one-tailed here; multiply by 2 for a two-tailed p-value.
    pairwiseMatrix("non-parametric (Mann-Whitney U)",
                   lambda a, b: mannwhitneyu(a, b)[1] * 2, "_U", errorValue=1.0)

    figure(figsize=figsz)
    subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)
    if len(titl) == 0:
        titl = outputFile

    plotExpBox(plotData, xtickLabels, showIndPoints, mark, markMean, showMean,
               notch, whisker, outliers, xlegendrotation, xlabe, ylabe, titl,
               showSampleSizes)
    for m in medianToDraw:
        axhline(y=m, linestyle=':', color='gray')
    savefig(outputFile, bbox_inches="tight")

    if len(plotHistogramToFile) > 0:
        drawHistogram(plotHistogramToFile, plotData, xtickLabels)
def bootstrapping():
    model = build_nn_model()
    # sample weights file from one training run
    model.load_weights("E:/experiments/MNIST_FL_1/model/20200317-171952-491-0.9456.h5")
    print("==> bootstrapping start")

    n_bootstraps = 10000
    rng_seed = 3033  # control reproducibility
    bootstrapped_auroc = []
    bootstrapped_auprc = []
    bootstrapped_sen = []
    bootstrapped_spe = []
    bootstrapped_bac = []
    bootstrapped_f1 = []
    bootstrapped_pre = []
    bootstrapped_NLR = []
    bootstrapped_PLR = []

    result = model.predict(test_images)
    auroc = metrics.roc_auc_score(test_labels, result, multi_class='ovr')
    print("auroc ovr : ", auroc)
    auroc_ovo = metrics.roc_auc_score(test_labels, result, multi_class='ovo')
    print("auroc ovo : ", auroc_ovo)

    result = np.argmax(result, axis=1)
    # BUG in source: metrics.auc expects curve points (x, y), not labels and
    # predictions; auprc should come from a precision-recall curve instead.
    auprc = metrics.auc(test_labels, result)
    print("auprc : ", auprc)

    # NOTE: fpr/tpr/thresholds are needed below; in the original they come
    # from this commented-out block, which only works for binary labels.
    '''
    fpr = dict()
    tpr = dict()
    for i in range(10):
        fpr[i], tpr[i], _ = roc_curve(test_labels[:, i], result[:, i])
    print(fpr, tpr)
    fpr, tpr, thresholds = metrics.roc_curve(test_labels, result)
    # roc_auc = metrics.auc(fpr, tpr)
    '''

    (precisions, recalls, thresholds) = metrics.precision_recall_curve(test_labels, result)
    minpse = np.max([min(x, y) for (x, y) in zip(precisions, recalls)])

    cf = metrics.confusion_matrix(test_labels, result)
    print(cf)
    cf = cf.astype(np.float32)
    acc = (cf[0][0] + cf[1][1]) / np.sum(cf)
    prec0 = cf[0][0] / (cf[0][0] + cf[1][0])
    prec1 = cf[1][1] / (cf[1][1] + cf[0][1])
    rec0 = cf[0][0] / (cf[0][0] + cf[0][1])
    rec1 = cf[1][1] / (cf[1][1] + cf[1][0])

    # Pick the threshold that maximises balanced accuracy.
    t = pd.concat([pd.DataFrame(thresholds),
                   pd.DataFrame(tpr),
                   pd.DataFrame(1 - fpr),
                   pd.DataFrame(((1 - fpr + tpr) / 2))], axis=1)
    t.columns = ['threshold', 'sensitivity', 'specificity', 'bac']
    t_ = t.iloc[np.min(np.where(t['bac'] == max(t['bac']))), :]
    y_pred_ = (result >= t_['threshold']).astype(bool)

    cm_ = metrics.confusion_matrix(test_labels, result)
    tp = cm_[1, 1]
    fn = cm_[1, 0]
    fp = cm_[0, 1]
    tn = cm_[0, 0]
    bac = t_['bac']                  # balanced accuracy
    sensitivity = t_['sensitivity']  # sensitivity
    specificity = t_['specificity']  # specificity
    precision = tp / (tp + fp)       # precision
    f1 = 2 * ((sensitivity * precision) / (sensitivity + precision))  # f1 score
    plr = sensitivity / (1 - specificity)   # positive likelihood ratio
    nlr = (1 - sensitivity) / specificity   # negative likelihood ratio

    rng = np.random.RandomState(rng_seed)
    y_true = np.array(test_labels)
    for j in range(n_bootstraps):
        # resample with replacement (randint replaces the deprecated random_integers)
        indices = rng.randint(0, len(result), len(result))
        if len(np.unique(y_true[indices])) < 2:
            continue
        auroc_ = metrics.roc_auc_score(y_true[indices], result[indices])
        precision_, recall_, thresholds_ = metrics.precision_recall_curve(
            y_true[indices], result[indices])
        auprc_ = metrics.auc(recall_, precision_)
        CM = metrics.confusion_matrix(np.array(y_true)[indices], result.argmax(axis=1))
        TP = CM[1, 1]
        FN = CM[1, 0]
        FP = CM[0, 1]
        TN = CM[0, 0]
        TPV = TP / (TP + FN)     # sensitivity
        TNV = TN / (TN + FP)     # specificity
        PPV = TP / (TP + FP)     # precision
        BAAC = (TPV + TNV) / 2   # balanced accuracy
        F1 = 2 * ((PPV * TPV) / (PPV + TPV))  # f1 score
        PLR = TPV / (1 - TNV)    # LR+
        NLR = (1 - TPV) / TNV    # LR-
        bootstrapped_auroc.append(auroc_)
        bootstrapped_auprc.append(auprc_)
        bootstrapped_sen.append(TPV)
        bootstrapped_spe.append(TNV)
        bootstrapped_bac.append(BAAC)
        bootstrapped_f1.append(F1)
        bootstrapped_pre.append(PPV)
        bootstrapped_NLR.append(NLR)
        bootstrapped_PLR.append(PLR)

    def percentile_ci(values):
        # Sort the bootstrap values and return (sorted, 2.5% bound, 97.5% bound).
        s = np.sort(np.array(values))
        return s, round(s[int(0.025 * len(s))], 4), round(s[int(0.975 * len(s))], 4)

    def with_ci(point, lower, upper):
        return str(round(point, 4)) + " (" + str(lower) + ", " + str(upper) + ")"

    sorted_auroc, auroc_lower, auroc_upper = percentile_ci(bootstrapped_auroc)
    sorted_auprc, auprc_lower, auprc_upper = percentile_ci(bootstrapped_auprc)
    sorted_sen, sen_lower, sen_upper = percentile_ci(bootstrapped_sen)
    sorted_spe, spe_lower, spe_upper = percentile_ci(bootstrapped_spe)
    sorted_bac, bac_lower, bac_upper = percentile_ci(bootstrapped_bac)
    sorted_f1, f1_lower, f1_upper = percentile_ci(bootstrapped_f1)
    sorted_pre, pre_lower, pre_upper = percentile_ci(bootstrapped_pre)
    sorted_NLR, NLR_lower, NLR_upper = percentile_ci(bootstrapped_NLR)
    sorted_PLR, PLR_lower, PLR_upper = percentile_ci(bootstrapped_PLR)

    col_n = ['thresholds', 'sensitivity', 'specificity', 'precision', 'bacc',
             'f1', 'PLR', 'NLR', 'AUROC', 'AUPRC']
    final = {"thresholds": round(t_['threshold'], 4),
             "sensitivity": with_ci(sensitivity, sen_lower, sen_upper),
             "specificity": with_ci(specificity, spe_lower, spe_upper),
             "precision": with_ci(precision, pre_lower, pre_upper),
             "bacc": with_ci(bac, bac_lower, bac_upper),
             "f1": with_ci(f1, f1_lower, f1_upper),
             "PLR": with_ci(plr, PLR_lower, PLR_upper),
             "NLR": with_ci(nlr, NLR_lower, NLR_upper),
             "AUROC": with_ci(auroc, auroc_lower, auroc_upper),
             "AUPRC": with_ci(auprc, auprc_lower, auprc_upper)}
    final = pd.DataFrame(final, index=[0])
    final = final.reindex(columns=col_n)

    total_item = {"thresholds": round(t_['threshold'], 4),
                  "sensitivity": sorted_sen,
                  "specificity": sorted_spe,
                  "precision": sorted_pre,
                  "bacc": sorted_bac,
                  "f1": sorted_f1,
                  "PLR": sorted_PLR,
                  "NLR": sorted_NLR,
                  "AUROC": sorted_auroc,
                  "AUPRC": sorted_auprc}
    total_pd = pd.DataFrame.from_dict(total_item, orient='columns')
    print(total_pd)

    final2 = pd.DataFrame.append(final, total_pd)
    final2.to_csv("fl_1_bootstrapping.csv", mode="w")
    print("==> bootstrapping end")

    # One-sample t-test: is the mean bootstrapped AUROC different from 0.999?
    t_test_result = stats.ttest_1samp(sorted_auroc, 0.999)
    print("t-test : ", t_test_result)
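# Minimal sketch (not from the original source) of the percentile bootstrap
# used above: resample with replacement, recompute the metric, and take the
# 2.5th/97.5th percentiles as a 95% confidence interval. The labels and
# scores here are synthetic.
import numpy as np
from sklearn import metrics

rng = np.random.default_rng(3033)
y_true = rng.integers(0, 2, size=200)
y_score = y_true * 0.5 + rng.random(200) * 0.7
scores = []
for _ in range(1000):
    idx = rng.integers(0, len(y_true), len(y_true))
    if len(np.unique(y_true[idx])) < 2:
        continue  # AUROC is undefined when only one class is resampled
    scores.append(metrics.roc_auc_score(y_true[idx], y_score[idx]))
scores = np.sort(scores)
print(scores[int(0.025 * len(scores))], scores[int(0.975 * len(scores))])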
# Markov-chain channel attribution with a one-sided one-sample t-test per
# channel. transition_matrix_func(), simulation() and unique() are helpers
# defined elsewhere in this module.
import collections
import itertools
import statistics
import time

import numpy as np
import pandas as pd
from scipy import stats


def markov_chain(data_set, no_iteration=10, no_of_simulation=10000, alpha=5):
    import_dataset_v1 = data_set.copy()
    # One row per conversion, so paths can be resampled with weights.
    import_dataset_v1 = (import_dataset_v1.reindex(
        import_dataset_v1.index.repeat(
            import_dataset_v1.conversions))).reset_index()
    import_dataset_v1['conversions'] = 1
    import_dataset_v1 = import_dataset_v1[['path', 'conversions']]
    import_dataset = (import_dataset_v1.groupby(['path']).sum()).reset_index()
    import_dataset['probability'] = import_dataset[
        'conversions'] / import_dataset['conversions'].sum()

    final = pd.DataFrame()
    for k in range(0, no_iteration):
        start = time.time()
        # Resample paths according to their observed conversion probabilities.
        import_data = pd.DataFrame({
            'path': np.random.choice(import_dataset['path'],
                                     size=import_dataset['conversions'].sum(),
                                     p=import_dataset['probability'],
                                     replace=True)
        })
        import_data['conversions'] = 1
        tr_matrix = transition_matrix_func(import_data)
        channel_only = list(
            filter(lambda k0: k0 not in ['start', 'convert'],
                   tr_matrix.columns))

        ga_ex = pd.DataFrame()
        tr_mat = tr_matrix.copy()
        p = []
        i = 0
        while i < no_of_simulation:
            p.append(unique(simulation(tr_mat, 1000)))
            i = i + 1
        path = list(itertools.chain.from_iterable(p))
        counter = collections.Counter(path)
        df = pd.DataFrame({
            'path': list(counter.keys()),
            'count': list(counter.values())
        })
        df = df[['path', 'count']]
        ga_ex = ga_ex.append(df, ignore_index=True)

        df1 = (pd.DataFrame(
            ga_ex.groupby(['path'])[['count']].sum())).reset_index()
        df1['removal_effects'] = df1['count'] / len(path)
        df1 = df1[df1['path'].isin(channel_only)]
        df1['ass_conversion'] = df1['removal_effects'] / sum(
            df1['removal_effects'])
        df1['ass_conversion'] = df1['ass_conversion'] * sum(
            import_dataset['conversions'])
        final = final.append(df1, ignore_index=True)
        end = time.time()
        print(end - start)

    # H0: mu = 0, H1: mu > 0 (one-sided, hence the halved two-sided p-value)
    unique_channel = unique(final['path'])
    final_df = pd.DataFrame()
    for i in range(0, len(unique_channel)):
        x = (final['ass_conversion'][final['path'] == unique_channel[i]]).values
        final_df.loc[i, 0] = unique_channel[i]
        final_df.loc[i, 1] = x.mean()
        v = stats.ttest_1samp(x, 0)
        final_df.loc[i, 2] = v[1] / 2
        if v[1] / 2 <= alpha / 100:
            final_df.loc[i, 3] = str(100 - alpha) + '% statistically confident'
        else:
            final_df.loc[i, 3] = str(100 - alpha) + '% statistically not confident'
        final_df.loc[i, 4] = len(x)
        final_df.loc[i, 5] = statistics.stdev(x)
        final_df.loc[i, 6] = v[0]

    final_df.columns = [
        'channel', 'ass_conversion', 'p_value', 'confidence_status',
        'frequency', 'standard_deviation', 't_statistics'
    ]
    final_df['ass_conversion'] = sum(
        import_dataset['conversions']) * final_df['ass_conversion'] / sum(
            final_df['ass_conversion'])
    return final_df, final
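# A minimal sketch of the one-sided test applied per channel above: SciPy's
# ttest_1samp is two-sided, so for H1: mu > 0 the usual recipe is to halve the
# p-value and require a positive t statistic (the unconditional halving above
# implicitly assumes t > 0). Toy data; names are illustrative.
import numpy as np
from scipy import stats

x = np.array([4.1, 3.8, 5.0, 4.6, 4.2, 3.9])   # e.g. attributed conversions
t_stat, p_two_sided = stats.ttest_1samp(x, 0.0)
p_one_sided = p_two_sided / 2 if t_stat > 0 else 1 - p_two_sided / 2
print("t=%.2f, one-sided p=%.4f" % (t_stat, p_one_sided))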
# Python 2 snippet (Allen Human Brain Atlas expression decoding): per-donor
# regressions of gene expression on a statistical map, then a one-sample
# t-test on the slopes across donors with Benjamini-Hochberg FDR correction.
import numpy as np
import pandas as pd
from scipy.stats import zscore, ttest_1samp
from statsmodels.stats.multitest import multipletests


def calculate_gene_expression_similarity(reduced_stat_map_data, mask="full"):
    store_file = "/ahba_data/store_max1_reduced.h5"
    subcortex_mask = "/ahba_data/subcortex_mask.npy"

    results_dfs = []
    with pd.HDFStore(store_file, 'r') as store:
        for donor_id in store.keys():
            print "Loading expression data (%s)" % donor_id
            expression_data = store.get(donor_id.replace(".", "_"))

            print "Getting statmap values (%s)" % donor_id
            nifti_values = reduced_stat_map_data[expression_data.columns]

            print "Removing missing values (%s)" % donor_id
            na_mask = np.isnan(nifti_values)
            if mask == "subcortex":
                na_mask = np.logical_or(
                    na_mask,
                    np.isnan(np.load(subcortex_mask)[expression_data.columns]))
            elif mask == "cortex":
                na_mask = np.logical_or(
                    na_mask,
                    np.logical_not(
                        np.isnan(
                            np.load(subcortex_mask)[expression_data.columns])))
            else:
                assert mask == "full"

            nifti_values = np.array(nifti_values)[np.logical_not(na_mask)]
            expression_data.drop(expression_data.columns[na_mask],
                                 axis=1,
                                 inplace=True)

            print "z scoring (%s)" % donor_id
            expression_data = pd.DataFrame(zscore(expression_data, axis=1),
                                           columns=expression_data.columns,
                                           index=expression_data.index)
            nifti_values = zscore(nifti_values)

            print "Calculating linear regressions (%s)" % donor_id
            regression_results = np.linalg.lstsq(
                np.c_[nifti_values, np.ones_like(nifti_values)],
                expression_data.T)
            results_df = pd.DataFrame({"slope": regression_results[0][0]},
                                      index=expression_data.index)
            results_df.columns = pd.MultiIndex.from_tuples(
                [(donor_id[1:], c) for c in results_df.columns],
                names=['donor_id', 'parameter'])
            results_dfs.append(results_df)

    print "Concatenating results"
    results_df = pd.concat(results_dfs, axis=1)
    del results_dfs

    # Test whether the mean slope across donors differs from zero, gene by gene.
    t, p = ttest_1samp(results_df, 0.0, axis=1)
    group_results_df = pd.DataFrame({"t": t, "p": p},
                                    columns=['t', 'p'],
                                    index=expression_data.index)
    _, group_results_df["p (FDR corrected)"], _, _ = multipletests(
        group_results_df.p, method='fdr_bh')
    group_results_df["variance explained (mean)"] = (
        results_df.xs('slope', axis=1, level=1)**2 * 100).mean(axis=1)
    group_results_df["variance explained (std)"] = (
        results_df.xs('slope', axis=1, level=1)**2 * 100).std(axis=1)
    del results_df

    probe_info = pd.read_csv("/ahba_data/probe_info_max1.csv",
                             index_col=0).drop(['chromosome', "gene_id"],
                                               axis=1)
    group_results_df = group_results_df.join(probe_info)
    group_results_df = group_results_df[[
        "gene_symbol", "entrez_id.1", "gene_name", "t", "p",
        "p (FDR corrected)", "variance explained (mean)",
        "variance explained (std)"
    ]]
    return group_results_df
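# A toy Python 3 sketch of the group-level step above, assuming made-up slopes
# for three "genes" across six donors (all names illustrative): one t-test per
# row via axis=1, then Benjamini-Hochberg FDR across rows.
import numpy as np
from scipy.stats import ttest_1samp
from statsmodels.stats.multitest import multipletests

rng = np.random.RandomState(1)
# rows: genes (true means 0.5, 0.0, 0.3); columns: donors
slopes = rng.normal(loc=[[0.5], [0.0], [0.3]], scale=0.2, size=(3, 6))
t, p = ttest_1samp(slopes, 0.0, axis=1)          # one test per gene (row)
reject, p_fdr, _, _ = multipletests(p, method='fdr_bh')
for ti, pi, ri in zip(t, p_fdr, reject):
    print("t=%.2f p_fdr=%.4f reject=%s" % (ti, pi, ri))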
# Python 2 snippet: reads tabular data into per-column lists, prints a battery
# of statistical tests, and draws box/violin plots. Helpers such as
# generic_istream, trimData, mean2, filterDataInRangeInclusive,
# getCol0ListFromCol1ListStringAdv, welchs_approximate_ttest_arr,
# writeXYZPvalues, makePValueRawPlot, makePValueClusterPlot, plotExpBox,
# drawHistogram, drawDensigram and the skipStat flag are defined elsewhere in
# this package.
from math import log
from sys import stderr, stdout

from numpy import mean, median, std
from pylab import axhline, figure, savefig, subplots_adjust
from scipy.stats import (ansari, bartlett, fligner, levene, mannwhitneyu,
                         ttest_1samp, ttest_ind)


def plotExpBox_Main(inputFiles, headers, valcols, outputFile, sep, startRow,
                    showIndPoints, mark, markMean, showMean, notch, whisker,
                    outliers, plotPvalueCluster, outputClusterPrefix,
                    methodCluster, xlegendrotation, xlabe, ylabe, figsz, titl,
                    showSampleSizes, trimToMinSize, relabels, logb,
                    plotHistogramToFile, plotMedianForGroups, botta,
                    showViolin, showBox, firstColAnnot, plotTrend, showLegend,
                    makePzfxFile, makeBinMatrix, writeDataSummaryStat,
                    summaryStatRange, minuslog10pvalue, minNDataToKeep,
                    vfacecolor, valpha, outXYZPvalues, dividePlots):

    plotData = []
    xtickLabels = []
    trendData = {}
    annot = {}
    minSize = -1

    for inputFile, header, cols in zip(inputFiles, headers, valcols):
        fin = generic_istream(inputFile)
        startIdx = len(plotData)
        if firstColAnnot:
            colAnnot = cols[0]
            cols = cols[1:]
            annotThisFile = []
            annot[startIdx] = annotThisFile
        else:
            colAnnot = -1
            annotThisFile = None
        for col in cols:
            plotData.append([])
            xtickLabels.append(header[col])
        colIndices = range(startIdx, startIdx + len(cols))
        if plotTrend:
            trendDataThisFile = []
            trendData[startIdx] = trendDataThisFile
        else:
            trendDataThisFile = None

        lino = 0
        for lin in fin:
            lino += 1
            if lino < startRow:
                continue
            fields = lin.rstrip("\r\n").split(sep)
            if plotTrend:
                trendDataThisLine = []
            else:
                trendDataThisLine = None
            allDataOKThisLine = True
            if colAnnot >= 0:
                annotThisFile.append(fields[colAnnot])
            for idx, col in zip(colIndices, cols):
                try:
                    value = float(fields[col])
                    if logb != 0:
                        if value == 0.0:
                            raise ValueError
                        value = log(value) / logb
                    plotData[idx].append(value)
                    if plotTrend:
                        trendDataThisLine.append(value)
                except:
                    allDataOKThisLine = False
            if plotTrend:
                if allDataOKThisLine:
                    trendDataThisFile.append(trendDataThisLine)
                else:
                    trendDataThisFile.append(None)
        fin.close()

        # Track the smallest column size seen so far, across all columns of
        # this file (not just the last one read).
        for idx in colIndices:
            if minSize == -1:
                minSize = len(plotData[idx])
            else:
                minSize = min([minSize, len(plotData[idx])])

    if trimToMinSize:
        print >> stderr, "trimming to min size =", minSize
        trimData(plotData, minSize)

    if len(relabels) > 0:
        print >> stderr, xtickLabels
        print >> stderr, relabels
        for i, relabel in zip(range(0, len(relabels)), relabels):
            xtickLabels[i] = relabel

    for i in range(0, len(plotMedianForGroups)):
        plotMedianForGroups[i] = getCol0ListFromCol1ListStringAdv(
            xtickLabels, plotMedianForGroups[i])

    # medians to draw as horizontal lines
    medianToDraw = []
    for mediangrouper in plotMedianForGroups:
        curD = []
        for c in mediangrouper:
            curD.extend(plotData[c])
        medianToDraw.append(median(curD))

    # drop columns with too few data points
    for c in range(len(plotData) - 1, -1, -1):
        if len(plotData[c]) < minNDataToKeep:
            print >> stderr, xtickLabels[c], "discarded because has only", len(plotData[c]), "data points <", minNDataToKeep
            del plotData[c]
            del xtickLabels[c]

    def pairwiseTestMatrix(testName, pairPFunc, failValue, clusterSuffix,
                           xyzSuffix=None):
        # Print a symmetric matrix of pairwise p-values (mirroring the upper
        # triangle into the lower one) and optionally write/cluster it.
        print >> stdout, ""
        print >> stdout, testName
        print >> stdout, "p-val",
        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],
        print >> stdout, ""
        pvalueM = []
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        pvalueRow.append(0.0 if minuslog10pvalue else 1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    try:
                        pvalue = pairPFunc(plotData[x], plotData[y])
                    except:
                        pvalue = failValue
                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0
                    print >> stdout, str(pvalue),
                    pvalueRow.append(pvalue)
            print >> stdout, ""
        if xyzSuffix and outXYZPvalues:
            writeXYZPvalues(outXYZPvalues + xyzSuffix, xtickLabels, pvalueM)
        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + clusterSuffix + "_raw",
                              xtickLabels, pvalueM)
            makePValueClusterPlot(outputClusterPrefix + clusterSuffix,
                                  xtickLabels, pvalueM, methodCluster)
        return pvalueM

    if not skipStat:
        print >> stdout, "student t-test (1 sample; mean=0)"
        print >> stdout, "sample", "mean", "p-val", "median"

        if writeDataSummaryStat:
            fDSS = open(writeDataSummaryStat, "w")
            print >> fDSS, "sample\tmean\tvar\tsd\tmin\tmax\tN\tNInRange[" + str(summaryStatRange[0]) + "," + str(summaryStatRange[1]) + "]\t%NInRange\tNbelowRange\t%Nbelow\tNAboveRange\t%NAbove"

        for x in range(0, len(plotData)):
            try:
                print >> stdout, xtickLabels[x], mean(plotData[x]), ttest_1samp(plotData[x], 0)[1], median(plotData[x])
            except:
                print >> stdout, xtickLabels[x], mean(plotData[x]), "NA", median(plotData[x])
            if writeDataSummaryStat:
                sumData, N, NIN, NBelow, NAbove = filterDataInRangeInclusive(
                    plotData[x], summaryStatRange[0], summaryStatRange[1])
                if NIN > 1:
                    mea = mean2(sumData)
                    DDOF = 1
                    sd = std(sumData, ddof=DDOF)
                    var = sd * sd
                    mi = min(sumData)
                    ma = max(sumData)
                else:
                    mea = "NA"
                    sd = "NA"
                    var = "NA"
                    mi = "NA"
                    ma = "NA"
                print >> fDSS, xtickLabels[x] + "\t" + str(mea) + "\t" + str(var) + "\t" + str(sd) + "\t" + str(mi) + "\t" + str(ma) + "\t" + str(N) + "\t" + str(NIN) + "\t" + str(float(NIN) * 100 / N) + "\t" + str(NBelow) + "\t" + str(float(NBelow) * 100 / N) + "\t" + str(NAbove) + "\t" + str(float(NAbove) * 100 / N)

        if writeDataSummaryStat:
            fDSS.close()

        # Pairwise two-sample tests; all share the same matrix layout.
        pairwiseTestMatrix("student t-test (2 samples)",
                           lambda a, b: ttest_ind(a, b)[1], 1.0, "_t")
        pairwiseTestMatrix("welch t-test",
                           lambda a, b: welchs_approximate_ttest_arr(a, b)[3],
                           1.0, "_Welch", xyzSuffix="_Welch.xyz")
        # mannwhitneyu is one-tailed here, hence the factor of 2.
        pairwiseTestMatrix("non-parametric (Mann-Whitney U)",
                           lambda a, b: mannwhitneyu(a, b)[1] * 2, 1.0,
                           "_U", xyzSuffix="_U.xyz")
        # variance/scale tests
        pairwiseTestMatrix("Ansari-Bradley Two-sample Test for difference in scale parameters",
                           lambda a, b: ansari(a, b)[1], "NA", "_Ansari")
        pairwiseTestMatrix("Fligner's Two-sample Test for equal variance (non-parametrics)",
                           lambda a, b: fligner(a, b)[1], "NA", "_fligner")
        pairwiseTestMatrix("Levene's Two-sample Test for equal variance",
                           lambda a, b: levene(a, b)[1], "NA", "_levene")
        pairwiseTestMatrix("Bartlett's Two-sample Test for equal variance (for normal distributions)",
                           lambda a, b: bartlett(a, b)[1], "NA", "_bartlett")

    figure(figsize=figsz)
    subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)
    if len(titl) == 0:
        titl = outputFile

    plotExpBox(plotData, xtickLabels, showIndPoints, mark, markMean, showMean,
               notch, whisker, outliers, xlegendrotation, xlabe, ylabe, titl,
               showSampleSizes, showViolin, showBox, annot, trendData,
               showLegend, makePzfxFile, makeBinMatrix, dividePlots)

    for m in medianToDraw:
        axhline(y=m, linestyle=':', color='gray')

    savefig(outputFile, bbox_inches="tight")

    if len(plotHistogramToFile) > 0:
        drawHistogram(plotHistogramToFile, plotData, xtickLabels)
        drawDensigram(plotHistogramToFile + ".density.png", plotData,
                      xtickLabels)
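# A compact Python 3 sketch of the symmetric pairwise p-value matrix the
# function above prints, using only scipy.stats.ttest_ind on toy samples
# (group names are illustrative):
import numpy as np
from scipy.stats import ttest_ind

groups = {"ctrl": np.random.RandomState(0).normal(0.0, 1, 30),
          "lo":   np.random.RandomState(1).normal(0.3, 1, 30),
          "hi":   np.random.RandomState(2).normal(1.0, 1, 30)}
names = list(groups)
n = len(names)
pmat = np.ones((n, n))
for i in range(n):
    for j in range(i + 1, n):
        p = ttest_ind(groups[names[i]], groups[names[j]])[1]
        pmat[i, j] = pmat[j, i] = p   # fill both triangles; diagonal stays 1
print(names)
print(pmat.round(4))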
# Seasonal change in accumulated water discharge (future minus current) with a
# per-cell one-sample t-test on the interannual change field. Relies on
# module-level constants (start_year_current, end_year_current,
# start_year_future, end_year_future, TIME_FORMAT) and helpers defined
# elsewhere in this script.
import os
import pickle
from datetime import datetime

import numpy as np
import pandas
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import BoundaryNorm
from netCDF4 import Dataset
from scipy import stats


def main(months=None, season="DJF", ax=None, clevels=None, labels=None,
         paths=None):
    if not months:
        months = [12, 1, 2]

    path_to_glaciers_land_sea_mask = "/b2_fs2/huziy/geophy_from_others/land_sea_glacier_mask_phy"
    land_sea_glaciers_mask = get_land_sea_glaciers_mask_from_geophysics_file(
        path=path_to_glaciers_land_sea_mask)

    p_current = "{0}-{1}".format(start_year_current, end_year_current)
    p_future = "{0}-{1}".format(start_year_future, end_year_future)

    lons2d = None
    lats2d = None
    x_index = None
    y_index = None
    mean_data = None
    b = None

    for the_path, label in zip(paths, labels):
        ds = Dataset(the_path)
        if lons2d is None:
            lons2d = ds.variables["longitude"][:]
            lats2d = ds.variables["latitude"][:]
            b = get_arctic_basemap_nps(round=True)
            x_index = ds.variables["x_index"][:]
            y_index = ds.variables["y_index"][:]

        cache_file = "_".join([
            str(m) for m in months
        ]) + "_{0}-{1}_{2}-{3}_{4}_mean_change_cache.bin".format(
            start_year_current, end_year_current, start_year_future,
            end_year_future, label)

        if not os.path.isfile(cache_file):
            time_str = ds.variables["time"][:]
            times = [datetime.strptime("".join(t_s), TIME_FORMAT)
                     for t_s in time_str]
            data = ds.variables["water_discharge_accumulated"][:]
            df = pandas.DataFrame(data=data, index=times)
            df["year"] = df.index.map(lambda d: d.year)
            df["month"] = df.index.map(lambda d: d.month)
            print(df.shape, df.columns)

            data_current = df.loc[(df.year >= start_year_current) &
                                  (df.year <= end_year_current) &
                                  df.month.isin(months), :]
            print(data_current.columns)
            data_current = data_current.drop(["year", "month"], axis=1)
            # mean for the season, for each year
            seasonal_means_current = data_current.groupby(
                by=lambda d: d.year).mean()

            data_future = df.loc[(df.year >= start_year_future) &
                                 (df.year <= end_year_future) &
                                 df.month.isin(months), :]
            data_future = data_future.drop(["year", "month"], axis=1)
            seasonal_means_future = data_future.groupby(
                by=lambda d: d.year).mean()

            # axis 0 - time, axis 1 - cell index
            change = seasonal_means_future.values - seasonal_means_current.values
            mean_current = seasonal_means_current.values.mean(axis=0)
            mean_future = seasonal_means_future.values.mean(axis=0)
            mean_change = change.mean(axis=0)

            # Per-cell test of H0: mean interannual change is 0.
            t, pvalue = stats.ttest_1samp(change, 0, axis=0)
            data_map = {
                "current-mean": mean_current,
                "future-mean": mean_future,
                "change": mean_change,
                "p-value": pvalue
            }
            pickle.dump(data_map, open(cache_file, mode="wb"))
        else:
            data_map = pickle.load(open(cache_file, mode="rb"))

        mean_change = data_map["change"]
        pvalue = data_map["p-value"]
        mean_current = data_map["current-mean"]

        if ax is None:
            plt.figure()

        to_plot = np.ma.masked_all_like(lons2d)
        # mask nonsignificant changes (disabled in the original):
        # change_arr_significant = np.ma.masked_where(pvalue > 1, mean_change)
        to_plot[x_index, y_index] = mean_change
        print(to_plot.min(), to_plot.max())
        print(pvalue.min(), pvalue.max())

        x, y = b(lons2d, lats2d)
        cmap = cm.get_cmap(name="bwr", lut=len(clevels) - 1)
        bn = BoundaryNorm(clevels, cmap.N)

        # mask glaciers and oceans
        to_plot = np.ma.masked_where(land_sea_glaciers_mask, to_plot)
        img = b.pcolormesh(x, y, to_plot, cmap=cmap, vmax=clevels[-1],
                           vmin=clevels[0], norm=bn)
        if ax is None:
            cb = b.colorbar(img, extend="both", ticks=clevels)
            cb.ax.set_title(r"${\rm m^3/s}$")

        b.drawcoastlines(linewidth=0.1)
        b.drawmapboundary(fill_color="0.75")
        b.readshapefile("data/shp/wri_basins2/wribasin", "basin", color="k",
                        linewidth=1)

        if ax is None:
            plt.tight_layout()
            imfile = "offline_rout_{0}_mean_abschange_map_{1}_({2})-({3}).jpeg".format(
                season, label, p_future, p_current)
            plt.savefig(imfile, dpi=400)

    return img
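# A small sketch of the per-cell test above, on toy data: ttest_1samp with
# axis=0 runs one test per grid cell at once, and the resulting p-values can
# mask a mean-change map (shapes and threshold here are illustrative).
import numpy as np
from scipy import stats

rng = np.random.RandomState(42)
change = rng.normal(0.2, 1.0, size=(30, 5, 4))   # years x ny x nx
t, p = stats.ttest_1samp(change, 0, axis=0)      # one test per cell -> (5, 4)
mean_change = change.mean(axis=0)
significant_change = np.ma.masked_where(p > 0.05, mean_change)
print(significant_change)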
# Fragment of a searchlight script: compare HMM event boundaries to human
# annotations via a permutation test, z-score the true match rate against the
# permuted null, then t-test the z-scores across subjects. nPerm, w, K, nTR,
# human_bounds, ev, z_scores, t_scores, features, k and s come from the
# surrounding script.
bounds = np.where(np.diff(np.argmax(ev.segments_[0], axis=1)))[0]
match = np.zeros(nPerm + 1)
perm_bounds = bounds.copy()

for p in range(nPerm + 1):
    for hb in human_bounds:
        if np.any(np.abs(perm_bounds - hb) <= w):
            match[p] += 1
    match[p] /= len(human_bounds)
    # next permutation: K-1 random boundary locations
    np.random.seed(p)
    perm_bounds = np.random.choice(nTR, K - 1, replace=False)

# z-score the real match rate (match[0]) against the permuted null
z_scores[k, s] = (match[0] - np.mean(match[1:])) / np.std(match[1:])
t_scores[k] = stats.ttest_1samp(z_scores[k, :], 0, axis=0)[0]

savedir = '/jukebox/norman/jamalw/MES/prototype/link/scripts/data/searchlight_output/HMM_searchlight_human_bounds_srm/plots/bilateral_mPFC/'
np.save(savedir + 'zscores', z_scores)
np.save(savedir + 'tstats', t_scores)

fig, ax1 = plt.subplots()
xp = np.linspace(3, features[-1], 100)
p1 = np.poly1d(np.polyfit(features, np.mean(z_scores, axis=1), 2))
ax1.plot(features, np.mean(z_scores, axis=1), '.', xp, p1(xp), '-',
         color='k', linewidth=3, markersize=15)
ax1.set_ylabel('average z', color='k', fontsize=18)
ax1.tick_params(labelsize=15)
ax2 = ax1.twinx()
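# A self-contained sketch of the permutation z-score used above, assuming toy
# boundary vectors (all names here are illustrative, not from the original
# script): the real match rate is compared to a null built from random
# boundary placements, and across subjects a vector of such z-scores would go
# into scipy.stats.ttest_1samp(z_vector, 0).
import numpy as np

rng = np.random.RandomState(0)
nTR, K, w, nPerm = 300, 10, 3, 1000
human_bounds = np.sort(rng.choice(nTR, K - 1, replace=False))
model_bounds = human_bounds + rng.randint(-2, 3, K - 1)  # close to human

def match_rate(b):
    # fraction of human boundaries with a model boundary within w TRs
    return np.mean([np.any(np.abs(b - hb) <= w) for hb in human_bounds])

null = [match_rate(rng.choice(nTR, K - 1, replace=False))
        for _ in range(nPerm)]
z = (match_rate(model_bounds) - np.mean(null)) / np.std(null)
print("z = %.2f" % z)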